src/system/kernel/vm/vm.cpp
1 /*
2 * Copyright 2009-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3 * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
11 #include <vm/vm.h>
13 #include <ctype.h>
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <string.h>
17 #include <sys/mman.h>
19 #include <algorithm>
21 #include <OS.h>
22 #include <KernelExport.h>
24 #include <AutoDeleter.h>
26 #include <symbol_versioning.h>
28 #include <arch/cpu.h>
29 #include <arch/vm.h>
30 #include <arch/user_memory.h>
31 #include <boot/elf.h>
32 #include <boot/stage2.h>
33 #include <condition_variable.h>
34 #include <console.h>
35 #include <debug.h>
36 #include <file_cache.h>
37 #include <fs/fd.h>
38 #include <heap.h>
39 #include <kernel.h>
40 #include <int.h>
41 #include <lock.h>
42 #include <low_resource_manager.h>
43 #include <slab/Slab.h>
44 #include <smp.h>
45 #include <system_info.h>
46 #include <thread.h>
47 #include <team.h>
48 #include <tracing.h>
49 #include <util/AutoLock.h>
50 #include <vm/vm_page.h>
51 #include <vm/vm_priv.h>
52 #include <vm/VMAddressSpace.h>
53 #include <vm/VMArea.h>
54 #include <vm/VMCache.h>
56 #include "VMAddressSpaceLocking.h"
57 #include "VMAnonymousCache.h"
58 #include "VMAnonymousNoSwapCache.h"
59 #include "IORequest.h"
62 //#define TRACE_VM
63 //#define TRACE_FAULTS
64 #ifdef TRACE_VM
65 # define TRACE(x) dprintf x
66 #else
67 # define TRACE(x) ;
68 #endif
69 #ifdef TRACE_FAULTS
70 # define FTRACE(x) dprintf x
71 #else
72 # define FTRACE(x) ;
73 #endif
76 namespace {
78 class AreaCacheLocking {
79 public:
80 inline bool Lock(VMCache* lockable)
82 return false;
85 inline void Unlock(VMCache* lockable)
87 vm_area_put_locked_cache(lockable);
91 class AreaCacheLocker : public AutoLocker<VMCache, AreaCacheLocking> {
92 public:
93 inline AreaCacheLocker(VMCache* cache = NULL)
94 : AutoLocker<VMCache, AreaCacheLocking>(cache, true)
98 inline AreaCacheLocker(VMArea* area)
99 : AutoLocker<VMCache, AreaCacheLocking>()
101 SetTo(area);
104 inline void SetTo(VMCache* cache, bool alreadyLocked)
106 AutoLocker<VMCache, AreaCacheLocking>::SetTo(cache, alreadyLocked);
109 inline void SetTo(VMArea* area)
111 return AutoLocker<VMCache, AreaCacheLocking>::SetTo(
112 area != NULL ? vm_area_get_locked_cache(area) : NULL, true, true);
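/*! Locks a chain of VMCaches from a given top (consumer) cache down through
its source caches. Each cache's user data pointer is used to remember its
consumer, so that the chain can later be unlocked in source -> consumer order.
*/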
117 class VMCacheChainLocker {
118 public:
119 VMCacheChainLocker()
121 fTopCache(NULL),
122 fBottomCache(NULL)
126 VMCacheChainLocker(VMCache* topCache)
128 fTopCache(topCache),
129 fBottomCache(topCache)
133 ~VMCacheChainLocker()
135 Unlock();
138 void SetTo(VMCache* topCache)
140 fTopCache = topCache;
141 fBottomCache = topCache;
143 if (topCache != NULL)
144 topCache->SetUserData(NULL);
147 VMCache* LockSourceCache()
149 if (fBottomCache == NULL || fBottomCache->source == NULL)
150 return NULL;
152 VMCache* previousCache = fBottomCache;
154 fBottomCache = fBottomCache->source;
155 fBottomCache->Lock();
156 fBottomCache->AcquireRefLocked();
157 fBottomCache->SetUserData(previousCache);
159 return fBottomCache;
162 void LockAllSourceCaches()
164 while (LockSourceCache() != NULL) {
168 void Unlock(VMCache* exceptCache = NULL)
170 if (fTopCache == NULL)
171 return;
173 // Unlock caches in source -> consumer direction. This is important to
174 // avoid double-locking and a reversal of locking order in case a cache
175 // is eligible for merging.
176 VMCache* cache = fBottomCache;
177 while (cache != NULL) {
178 VMCache* nextCache = (VMCache*)cache->UserData();
179 if (cache != exceptCache)
180 cache->ReleaseRefAndUnlock(cache != fTopCache);
182 if (cache == fTopCache)
183 break;
185 cache = nextCache;
188 fTopCache = NULL;
189 fBottomCache = NULL;
192 void UnlockKeepRefs(bool keepTopCacheLocked)
194 if (fTopCache == NULL)
195 return;
197 VMCache* nextCache = fBottomCache;
198 VMCache* cache = NULL;
200 while (keepTopCacheLocked
201 ? nextCache != fTopCache : cache != fTopCache) {
202 cache = nextCache;
203 nextCache = (VMCache*)cache->UserData();
204 cache->Unlock(cache != fTopCache);
208 void RelockCaches(bool topCacheLocked)
210 if (fTopCache == NULL)
211 return;
213 VMCache* nextCache = fTopCache;
214 VMCache* cache = NULL;
215 if (topCacheLocked) {
216 cache = nextCache;
217 nextCache = cache->source;
220 while (cache != fBottomCache && nextCache != NULL) {
221 VMCache* consumer = cache;
222 cache = nextCache;
223 nextCache = cache->source;
224 cache->Lock();
225 cache->SetUserData(consumer);
229 private:
230 VMCache* fTopCache;
231 VMCache* fBottomCache;
234 } // namespace
237 // The memory reserve an allocation of a given priority must not touch.
238 static const size_t kMemoryReserveForPriority[] = {
239 VM_MEMORY_RESERVE_USER, // user
240 VM_MEMORY_RESERVE_SYSTEM, // system
241 0 // VIP
245 ObjectCache* gPageMappingsObjectCache;
247 static rw_lock sAreaCacheLock = RW_LOCK_INITIALIZER("area->cache");
249 static off_t sAvailableMemory;
250 static off_t sNeededMemory;
251 static mutex sAvailableMemoryLock = MUTEX_INITIALIZER("available memory lock");
252 static uint32 sPageFaults;
254 static VMPhysicalPageMapper* sPhysicalPageMapper;
256 #if DEBUG_CACHE_LIST
258 struct cache_info {
259 VMCache* cache;
260 addr_t page_count;
261 addr_t committed;
264 static const int kCacheInfoTableCount = 100 * 1024;
265 static cache_info* sCacheInfoTable;
267 #endif // DEBUG_CACHE_LIST
270 // function declarations
271 static void delete_area(VMAddressSpace* addressSpace, VMArea* area,
272 bool addressSpaceCleanup);
273 static status_t vm_soft_fault(VMAddressSpace* addressSpace, addr_t address,
274 bool isWrite, bool isExecute, bool isUser, vm_page** wirePage);
275 static status_t map_backing_store(VMAddressSpace* addressSpace,
276 VMCache* cache, off_t offset, const char* areaName, addr_t size, int wiring,
277 int protection, int mapping, uint32 flags,
278 const virtual_address_restrictions* addressRestrictions, bool kernel,
279 VMArea** _area, void** _virtualAddress);
280 static void fix_protection(uint32* protection);
283 // #pragma mark -
286 #if VM_PAGE_FAULT_TRACING
288 namespace VMPageFaultTracing {
290 class PageFaultStart : public AbstractTraceEntry {
291 public:
292 PageFaultStart(addr_t address, bool write, bool user, addr_t pc)
294 fAddress(address),
295 fPC(pc),
296 fWrite(write),
297 fUser(user)
299 Initialized();
302 virtual void AddDump(TraceOutput& out)
304 out.Print("page fault %#lx %s %s, pc: %#lx", fAddress,
305 fWrite ? "write" : "read", fUser ? "user" : "kernel", fPC);
308 private:
309 addr_t fAddress;
310 addr_t fPC;
311 bool fWrite;
312 bool fUser;
316 // page fault errors
317 enum {
318 PAGE_FAULT_ERROR_NO_AREA = 0,
319 PAGE_FAULT_ERROR_KERNEL_ONLY,
320 PAGE_FAULT_ERROR_WRITE_PROTECTED,
321 PAGE_FAULT_ERROR_READ_PROTECTED,
322 PAGE_FAULT_ERROR_EXECUTE_PROTECTED,
323 PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY,
324 PAGE_FAULT_ERROR_NO_ADDRESS_SPACE
328 class PageFaultError : public AbstractTraceEntry {
329 public:
330 PageFaultError(area_id area, status_t error)
332 fArea(area),
333 fError(error)
335 Initialized();
338 virtual void AddDump(TraceOutput& out)
340 switch (fError) {
341 case PAGE_FAULT_ERROR_NO_AREA:
342 out.Print("page fault error: no area");
343 break;
344 case PAGE_FAULT_ERROR_KERNEL_ONLY:
345 out.Print("page fault error: area: %ld, kernel only", fArea);
346 break;
347 case PAGE_FAULT_ERROR_WRITE_PROTECTED:
348 out.Print("page fault error: area: %ld, write protected",
349 fArea);
350 break;
351 case PAGE_FAULT_ERROR_READ_PROTECTED:
352 out.Print("page fault error: area: %ld, read protected", fArea);
353 break;
354 case PAGE_FAULT_ERROR_EXECUTE_PROTECTED:
355 out.Print("page fault error: area: %ld, execute protected",
356 fArea);
357 break;
358 case PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY:
359 out.Print("page fault error: kernel touching bad user memory");
360 break;
361 case PAGE_FAULT_ERROR_NO_ADDRESS_SPACE:
362 out.Print("page fault error: no address space");
363 break;
364 default:
365 out.Print("page fault error: area: %ld, error: %s", fArea,
366 strerror(fError));
367 break;
371 private:
372 area_id fArea;
373 status_t fError;
377 class PageFaultDone : public AbstractTraceEntry {
378 public:
379 PageFaultDone(area_id area, VMCache* topCache, VMCache* cache,
380 vm_page* page)
382 fArea(area),
383 fTopCache(topCache),
384 fCache(cache),
385 fPage(page)
387 Initialized();
390 virtual void AddDump(TraceOutput& out)
392 out.Print("page fault done: area: %ld, top cache: %p, cache: %p, "
393 "page: %p", fArea, fTopCache, fCache, fPage);
396 private:
397 area_id fArea;
398 VMCache* fTopCache;
399 VMCache* fCache;
400 vm_page* fPage;
403 } // namespace VMPageFaultTracing
405 # define TPF(x) new(std::nothrow) VMPageFaultTracing::x;
406 #else
407 # define TPF(x) ;
408 #endif // VM_PAGE_FAULT_TRACING
411 // #pragma mark -
414 /*! The page's cache must be locked.
416 static inline void
417 increment_page_wired_count(vm_page* page)
419 if (!page->IsMapped())
420 atomic_add(&gMappedPagesCount, 1);
421 page->IncrementWiredCount();
425 /*! The page's cache must be locked.
427 static inline void
428 decrement_page_wired_count(vm_page* page)
430 page->DecrementWiredCount();
431 if (!page->IsMapped())
432 atomic_add(&gMappedPagesCount, -1);
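/*! Returns the virtual address within \a area at which \a page is mapped,
computed from the page's cache offset relative to the area's cache offset.
*/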
436 static inline addr_t
437 virtual_page_address(VMArea* area, vm_page* page)
439 return area->Base()
440 + ((page->cache_offset << PAGE_SHIFT) - area->cache_offset);
444 //! You need to have the address space locked when calling this function
445 static VMArea*
446 lookup_area(VMAddressSpace* addressSpace, area_id id)
448 VMAreaHash::ReadLock();
450 VMArea* area = VMAreaHash::LookupLocked(id);
451 if (area != NULL && area->address_space != addressSpace)
452 area = NULL;
454 VMAreaHash::ReadUnlock();
456 return area;
460 static status_t
461 allocate_area_page_protections(VMArea* area)
463 // In the page protections we store only the three user protections,
464 // so we use 4 bits per page.
465 uint32 bytes = (area->Size() / B_PAGE_SIZE + 1) / 2;
466 area->page_protections = (uint8*)malloc_etc(bytes,
467 HEAP_DONT_LOCK_KERNEL_SPACE);
468 if (area->page_protections == NULL)
469 return B_NO_MEMORY;
471 // init the page protections for all pages to that of the area
472 uint32 areaProtection = area->protection
473 & (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
474 memset(area->page_protections, areaProtection | (areaProtection << 4),
475 bytes);
476 return B_OK;
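/*! Stores the protection for the page at \a pageAddress in the area's
per-page protection array: each byte holds two 4-bit entries, the low nibble
for even page indices and the high nibble for odd ones.
*/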
480 static inline void
481 set_area_page_protection(VMArea* area, addr_t pageAddress, uint32 protection)
483 protection &= B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA;
484 uint32 pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
485 uint8& entry = area->page_protections[pageIndex / 2];
486 if (pageIndex % 2 == 0)
487 entry = (entry & 0xf0) | protection;
488 else
489 entry = (entry & 0x0f) | (protection << 4);
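/*! Returns the protection of the page at \a pageAddress, taken from the
area's per-page protections if present (falling back to the area's protection
otherwise) and translated to the matching kernel protection bits where
appropriate.
*/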
493 static inline uint32
494 get_area_page_protection(VMArea* area, addr_t pageAddress)
496 if (area->page_protections == NULL)
497 return area->protection;
499 uint32 pageIndex = (pageAddress - area->Base()) / B_PAGE_SIZE;
500 uint32 protection = area->page_protections[pageIndex / 2];
501 if (pageIndex % 2 == 0)
502 protection &= 0x0f;
503 else
504 protection >>= 4;
506 // If this is a kernel area we translate the user flags to kernel flags.
507 if (area->address_space == VMAddressSpace::Kernel()) {
508 uint32 kernelProtection = 0;
509 if ((protection & B_READ_AREA) != 0)
510 kernelProtection |= B_KERNEL_READ_AREA;
511 if ((protection & B_WRITE_AREA) != 0)
512 kernelProtection |= B_KERNEL_WRITE_AREA;
514 return kernelProtection;
517 return protection | B_KERNEL_READ_AREA
518 | (protection & B_WRITE_AREA ? B_KERNEL_WRITE_AREA : 0);
522 /*! The caller must have reserved as many pages as the translation map
523 implementation might need to map this page.
524 The page's cache must be locked.
526 static status_t
527 map_page(VMArea* area, vm_page* page, addr_t address, uint32 protection,
528 vm_page_reservation* reservation)
530 VMTranslationMap* map = area->address_space->TranslationMap();
532 bool wasMapped = page->IsMapped();
534 if (area->wiring == B_NO_LOCK) {
535 DEBUG_PAGE_ACCESS_CHECK(page);
537 bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
538 vm_page_mapping* mapping = (vm_page_mapping*)object_cache_alloc(
539 gPageMappingsObjectCache,
540 CACHE_DONT_WAIT_FOR_MEMORY
541 | (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0));
542 if (mapping == NULL)
543 return B_NO_MEMORY;
545 mapping->page = page;
546 mapping->area = area;
548 map->Lock();
550 map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
551 area->MemoryType(), reservation);
553 // insert mapping into lists
554 if (!page->IsMapped())
555 atomic_add(&gMappedPagesCount, 1);
557 page->mappings.Add(mapping);
558 area->mappings.Add(mapping);
560 map->Unlock();
561 } else {
562 DEBUG_PAGE_ACCESS_CHECK(page);
564 map->Lock();
565 map->Map(address, page->physical_page_number * B_PAGE_SIZE, protection,
566 area->MemoryType(), reservation);
567 map->Unlock();
569 increment_page_wired_count(page);
572 if (!wasMapped) {
573 // The page is mapped now, so it must not remain in the cached queue.
574 // It also makes sense to move it from the inactive to the active queue,
575 // since otherwise the page daemon wouldn't come to keep track of it (in
576 // idle mode) -- if the page isn't touched, it will be deactivated after a
577 // full iteration through the queue at the latest.
578 if (page->State() == PAGE_STATE_CACHED
579 || page->State() == PAGE_STATE_INACTIVE) {
580 vm_page_set_state(page, PAGE_STATE_ACTIVE);
584 return B_OK;
588 /*! If \a preserveModified is \c true, the caller must hold the lock of the
589 page's cache.
591 static inline bool
592 unmap_page(VMArea* area, addr_t virtualAddress)
594 return area->address_space->TranslationMap()->UnmapPage(area,
595 virtualAddress, true);
599 /*! If \a preserveModified is \c true, the caller must hold the lock of all
600 mapped pages' caches.
602 static inline void
603 unmap_pages(VMArea* area, addr_t base, size_t size)
605 area->address_space->TranslationMap()->UnmapPages(area, base, size, true);
609 /*! Cuts a piece out of an area. If the given cut range covers the complete
610 area, it is deleted. If it covers the beginning or the end, the area is
611 resized accordingly. If the range covers some part in the middle of the
612 area, it is split in two; in this case the second area is returned via
613 \a _secondArea (the variable is left untouched in the other cases).
614 The address space must be write locked.
615 The caller must ensure that no part of the given range is wired.
617 static status_t
618 cut_area(VMAddressSpace* addressSpace, VMArea* area, addr_t address,
619 addr_t lastAddress, VMArea** _secondArea, bool kernel)
621 // Does the cut range intersect with the area at all?
622 addr_t areaLast = area->Base() + (area->Size() - 1);
623 if (area->Base() > lastAddress || areaLast < address)
624 return B_OK;
626 // Is the area fully covered?
627 if (area->Base() >= address && areaLast <= lastAddress) {
628 delete_area(addressSpace, area, false);
629 return B_OK;
632 int priority;
633 uint32 allocationFlags;
634 if (addressSpace == VMAddressSpace::Kernel()) {
635 priority = VM_PRIORITY_SYSTEM;
636 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
637 | HEAP_DONT_LOCK_KERNEL_SPACE;
638 } else {
639 priority = VM_PRIORITY_USER;
640 allocationFlags = 0;
643 VMCache* cache = vm_area_get_locked_cache(area);
644 VMCacheChainLocker cacheChainLocker(cache);
645 cacheChainLocker.LockAllSourceCaches();
647 // Cut the end only?
648 if (areaLast <= lastAddress) {
649 size_t oldSize = area->Size();
650 size_t newSize = address - area->Base();
652 status_t error = addressSpace->ShrinkAreaTail(area, newSize,
653 allocationFlags);
654 if (error != B_OK)
655 return error;
657 // unmap pages
658 unmap_pages(area, address, oldSize - newSize);
660 // If no one else uses the area's cache, we can resize it, too.
661 if (cache->areas == area && area->cache_next == NULL
662 && cache->consumers.IsEmpty()
663 && cache->type == CACHE_TYPE_RAM) {
664 // Since VMCache::Resize() can temporarily drop the lock, we must
665 // unlock all lower caches to prevent locking order inversion.
666 cacheChainLocker.Unlock(cache);
667 cache->Resize(cache->virtual_base + newSize, priority);
668 cache->ReleaseRefAndUnlock();
671 return B_OK;
674 // Cut the beginning only?
675 if (area->Base() >= address) {
676 addr_t oldBase = area->Base();
677 addr_t newBase = lastAddress + 1;
678 size_t newSize = areaLast - lastAddress;
680 // unmap pages
681 unmap_pages(area, oldBase, newBase - oldBase);
683 // resize the area
684 status_t error = addressSpace->ShrinkAreaHead(area, newSize,
685 allocationFlags);
686 if (error != B_OK)
687 return error;
689 // TODO: If no one else uses the area's cache, we should resize it, too!
691 area->cache_offset += newBase - oldBase;
693 return B_OK;
696 // The tough part -- cut a piece out of the middle of the area.
697 // We do that by shrinking the area to the beginning section and creating a
698 // new area for the end section.
700 addr_t firstNewSize = address - area->Base();
701 addr_t secondBase = lastAddress + 1;
702 addr_t secondSize = areaLast - lastAddress;
704 // unmap pages
705 unmap_pages(area, address, area->Size() - firstNewSize);
707 // resize the area
708 addr_t oldSize = area->Size();
709 status_t error = addressSpace->ShrinkAreaTail(area, firstNewSize,
710 allocationFlags);
711 if (error != B_OK)
712 return error;
714 // TODO: If no one else uses the area's cache, we might want to create a
715 // new cache for the second area, transfer the concerned pages from the
716 // first cache to it and resize the first cache.
718 // map the second area
719 virtual_address_restrictions addressRestrictions = {};
720 addressRestrictions.address = (void*)secondBase;
721 addressRestrictions.address_specification = B_EXACT_ADDRESS;
722 VMArea* secondArea;
723 error = map_backing_store(addressSpace, cache,
724 area->cache_offset + (secondBase - area->Base()), area->name,
725 secondSize, area->wiring, area->protection, REGION_NO_PRIVATE_MAP, 0,
726 &addressRestrictions, kernel, &secondArea, NULL);
727 if (error != B_OK) {
728 addressSpace->ShrinkAreaTail(area, oldSize, allocationFlags);
729 return error;
732 // We need a cache reference for the new area.
733 cache->AcquireRefLocked();
735 if (_secondArea != NULL)
736 *_secondArea = secondArea;
738 return B_OK;
742 /*! Deletes all areas in the given address range.
743 The address space must be write-locked.
744 The caller must ensure that no part of the given range is wired.
746 static status_t
747 unmap_address_range(VMAddressSpace* addressSpace, addr_t address, addr_t size,
748 bool kernel)
750 size = PAGE_ALIGN(size);
751 addr_t lastAddress = address + (size - 1);
753 // Check whether the caller is allowed to modify the concerned areas.
754 if (!kernel) {
755 for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
756 VMArea* area = it.Next();) {
757 addr_t areaLast = area->Base() + (area->Size() - 1);
758 if (area->Base() < lastAddress && address < areaLast) {
759 if ((area->protection & B_KERNEL_AREA) != 0)
760 return B_NOT_ALLOWED;
765 for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
766 VMArea* area = it.Next();) {
767 addr_t areaLast = area->Base() + (area->Size() - 1);
768 if (area->Base() < lastAddress && address < areaLast) {
769 status_t error = cut_area(addressSpace, area, address,
770 lastAddress, NULL, kernel);
771 if (error != B_OK)
772 return error;
773 // Failing after already messing with areas is ugly, but we
774 // can't do anything about it.
778 return B_OK;
782 /*! You need to hold the lock of the cache and the write lock of the address
783 space when calling this function.
784 Note that in case of error the cache will be temporarily unlocked.
785 If \a addressSpec is \c B_EXACT_ADDRESS and the
786 \c CREATE_AREA_UNMAP_ADDRESS_RANGE flag is specified, the caller must ensure
787 that no part of the specified address range (base \c *_virtualAddress, size
788 \a size) is wired.
790 static status_t
791 map_backing_store(VMAddressSpace* addressSpace, VMCache* cache, off_t offset,
792 const char* areaName, addr_t size, int wiring, int protection, int mapping,
793 uint32 flags, const virtual_address_restrictions* addressRestrictions,
794 bool kernel, VMArea** _area, void** _virtualAddress)
796 TRACE(("map_backing_store: aspace %p, cache %p, virtual %p, offset 0x%"
797 B_PRIx64 ", size %" B_PRIuADDR ", addressSpec %" B_PRIu32 ", wiring %d"
798 ", protection %d, area %p, areaName '%s'\n", addressSpace, cache,
799 addressRestrictions->address, offset, size,
800 addressRestrictions->address_specification, wiring, protection,
801 _area, areaName));
802 cache->AssertLocked();
804 uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
805 | HEAP_DONT_LOCK_KERNEL_SPACE;
806 int priority;
807 if (addressSpace != VMAddressSpace::Kernel()) {
808 priority = VM_PRIORITY_USER;
809 } else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0) {
810 priority = VM_PRIORITY_VIP;
811 allocationFlags |= HEAP_PRIORITY_VIP;
812 } else
813 priority = VM_PRIORITY_SYSTEM;
815 VMArea* area = addressSpace->CreateArea(areaName, wiring, protection,
816 allocationFlags);
817 if (area == NULL)
818 return B_NO_MEMORY;
820 status_t status;
822 // if this is a private map, we need to create a new cache
823 // to handle the private copies of pages as they are written to
824 VMCache* sourceCache = cache;
825 if (mapping == REGION_PRIVATE_MAP) {
826 VMCache* newCache;
828 // create an anonymous cache
829 status = VMCacheFactory::CreateAnonymousCache(newCache,
830 (protection & B_STACK_AREA) != 0
831 || (protection & B_OVERCOMMITTING_AREA) != 0, 0,
832 cache->GuardSize() / B_PAGE_SIZE, true, VM_PRIORITY_USER);
833 if (status != B_OK)
834 goto err1;
836 newCache->Lock();
837 newCache->temporary = 1;
838 newCache->virtual_base = offset;
839 newCache->virtual_end = offset + size;
841 cache->AddConsumer(newCache);
843 cache = newCache;
846 if ((flags & CREATE_AREA_DONT_COMMIT_MEMORY) == 0) {
847 status = cache->SetMinimalCommitment(size, priority);
848 if (status != B_OK)
849 goto err2;
852 // check to see if this address space has entered DELETE state
853 if (addressSpace->IsBeingDeleted()) {
854 // okay, someone is trying to delete this address space now, so we can't
855 // insert the area -- back out
856 status = B_BAD_TEAM_ID;
857 goto err2;
860 if (addressRestrictions->address_specification == B_EXACT_ADDRESS
861 && (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0) {
862 status = unmap_address_range(addressSpace,
863 (addr_t)addressRestrictions->address, size, kernel);
864 if (status != B_OK)
865 goto err2;
868 status = addressSpace->InsertArea(area, size, addressRestrictions,
869 allocationFlags, _virtualAddress);
870 if (status != B_OK) {
871 // TODO: wait and try again once this is working in the backend
872 #if 0
873 if (status == B_NO_MEMORY && addressSpec == B_ANY_KERNEL_ADDRESS) {
874 low_resource(B_KERNEL_RESOURCE_ADDRESS_SPACE, size,
875 0, 0);
877 #endif
878 goto err2;
881 // attach the cache to the area
882 area->cache = cache;
883 area->cache_offset = offset;
885 // point the cache back to the area
886 cache->InsertAreaLocked(area);
887 if (mapping == REGION_PRIVATE_MAP)
888 cache->Unlock();
890 // insert the area in the global area hash table
891 VMAreaHash::Insert(area);
893 // grab a ref to the address space (the area holds this)
894 addressSpace->Get();
896 // ktrace_printf("map_backing_store: cache: %p (source: %p), \"%s\" -> %p",
897 // cache, sourceCache, areaName, area);
899 *_area = area;
900 return B_OK;
902 err2:
903 if (mapping == REGION_PRIVATE_MAP) {
904 // We created this cache, so we must delete it again. Note that we
905 // need to temporarily unlock the source cache, or we'll deadlock,
906 // since VMCache::_RemoveConsumer() will try to lock it, too.
907 sourceCache->Unlock();
908 cache->ReleaseRefAndUnlock();
909 sourceCache->Lock();
911 err1:
912 addressSpace->DeleteArea(area, allocationFlags);
913 return status;
917 /*! Equivalent to wait_if_area_range_is_wired(area, area->Base(), area->Size(),
918 locker1, locker2).
920 template<typename LockerType1, typename LockerType2>
921 static inline bool
922 wait_if_area_is_wired(VMArea* area, LockerType1* locker1, LockerType2* locker2)
924 area->cache->AssertLocked();
926 VMAreaUnwiredWaiter waiter;
927 if (!area->AddWaiterIfWired(&waiter))
928 return false;
930 // unlock everything and wait
931 if (locker1 != NULL)
932 locker1->Unlock();
933 if (locker2 != NULL)
934 locker2->Unlock();
936 waiter.waitEntry.Wait();
938 return true;
942 /*! Checks whether the given area has any wired ranges intersecting with the
943 specified range and waits, if so.
945 When it has to wait, the function calls \c Unlock() on both \a locker1
946 and \a locker2, if given.
947 The area's top cache must be locked and must be unlocked as a side effect
948 of calling \c Unlock() on either \a locker1 or \a locker2.
950 If the function does not have to wait it does not modify or unlock any
951 object.
953 \param area The area to be checked.
954 \param base The base address of the range to check.
955 \param size The size of the address range to check.
956 \param locker1 An object to be unlocked before starting to wait (may
957 be \c NULL).
958 \param locker2 An object to be unlocked before starting to wait (may
959 be \c NULL).
960 \return \c true, if the function had to wait, \c false otherwise.
962 template<typename LockerType1, typename LockerType2>
963 static inline bool
964 wait_if_area_range_is_wired(VMArea* area, addr_t base, size_t size,
965 LockerType1* locker1, LockerType2* locker2)
967 area->cache->AssertLocked();
969 VMAreaUnwiredWaiter waiter;
970 if (!area->AddWaiterIfWired(&waiter, base, size))
971 return false;
973 // unlock everything and wait
974 if (locker1 != NULL)
975 locker1->Unlock();
976 if (locker2 != NULL)
977 locker2->Unlock();
979 waiter.waitEntry.Wait();
981 return true;
985 /*! Checks whether the given address space has any wired ranges intersecting
986 with the specified range and waits, if so.
988 Similar to wait_if_area_range_is_wired(), with the following differences:
989 - All areas intersecting with the range are checked (respectively all until
990 one is found that contains a wired range intersecting with the given
991 range).
992 - The given address space must at least be read-locked and must be unlocked
993 when \c Unlock() is called on \a locker.
994 - None of the areas' caches are allowed to be locked.
996 template<typename LockerType>
997 static inline bool
998 wait_if_address_range_is_wired(VMAddressSpace* addressSpace, addr_t base,
999 size_t size, LockerType* locker)
1001 addr_t end = base + size - 1;
1002 for (VMAddressSpace::AreaIterator it = addressSpace->GetAreaIterator();
1003 VMArea* area = it.Next();) {
1004 // TODO: Introduce a VMAddressSpace method to get a close iterator!
1005 if (area->Base() > end)
1006 return false;
1008 if (base >= area->Base() + area->Size() - 1)
1009 continue;
1011 AreaCacheLocker cacheLocker(vm_area_get_locked_cache(area));
1013 if (wait_if_area_range_is_wired(area, base, size, locker, &cacheLocker))
1014 return true;
1017 return false;
1021 /*! Prepares an area to be used for vm_set_kernel_area_debug_protection().
1022 It must be called in a situation where the kernel address space may be
1023 locked.
1025 status_t
1026 vm_prepare_kernel_area_debug_protection(area_id id, void** cookie)
1028 AddressSpaceReadLocker locker;
1029 VMArea* area;
1030 status_t status = locker.SetFromArea(id, area);
1031 if (status != B_OK)
1032 return status;
1034 if (area->page_protections == NULL) {
1035 status = allocate_area_page_protections(area);
1036 if (status != B_OK)
1037 return status;
1040 *cookie = (void*)area;
1041 return B_OK;
1045 /*! This is a debug helper function that can only be used with very specific
1046 use cases.
1047 Sets protection for the given address range to the protection specified.
1048 If \a protection is 0 then the involved pages will be marked non-present
1049 in the translation map to cause a fault on access. The pages aren't
1050 actually unmapped however so that they can be marked present again with
1051 additional calls to this function. For this to work the area must be
1052 fully locked in memory so that the pages aren't otherwise touched.
1053 This function does not lock the kernel address space and needs to be
1054 supplied with a \a cookie retrieved from a successful call to
1055 vm_prepare_kernel_area_debug_protection().
1057 status_t
1058 vm_set_kernel_area_debug_protection(void* cookie, void* _address, size_t size,
1059 uint32 protection)
1061 // check address range
1062 addr_t address = (addr_t)_address;
1063 size = PAGE_ALIGN(size);
1065 if ((address % B_PAGE_SIZE) != 0
1066 || (addr_t)address + size < (addr_t)address
1067 || !IS_KERNEL_ADDRESS(address)
1068 || !IS_KERNEL_ADDRESS((addr_t)address + size)) {
1069 return B_BAD_VALUE;
1072 // Translate the kernel protection to user protection as we only store that.
1073 if ((protection & B_KERNEL_READ_AREA) != 0)
1074 protection |= B_READ_AREA;
1075 if ((protection & B_KERNEL_WRITE_AREA) != 0)
1076 protection |= B_WRITE_AREA;
1078 VMAddressSpace* addressSpace = VMAddressSpace::GetKernel();
1079 VMTranslationMap* map = addressSpace->TranslationMap();
1080 VMArea* area = (VMArea*)cookie;
1082 addr_t offset = address - area->Base();
1083 if (area->Size() - offset < size) {
1084 panic("protect range not fully within supplied area");
1085 return B_BAD_VALUE;
1088 if (area->page_protections == NULL) {
1089 panic("area has no page protections");
1090 return B_BAD_VALUE;
1093 // Invalidate the mapping entries so any access to them will fault, or
1094 // restore the mapping entries unchanged so that lookup will succeed again.
1095 map->Lock();
1096 map->DebugMarkRangePresent(address, address + size, protection != 0);
1097 map->Unlock();
1099 // And set the proper page protections so that the fault case will actually
1100 // fail and not simply try to map a new page.
1101 for (addr_t pageAddress = address; pageAddress < address + size;
1102 pageAddress += B_PAGE_SIZE) {
1103 set_area_page_protection(area, pageAddress, protection);
1106 return B_OK;
1110 status_t
1111 vm_block_address_range(const char* name, void* address, addr_t size)
1113 if (!arch_vm_supports_protection(0))
1114 return B_NOT_SUPPORTED;
1116 AddressSpaceWriteLocker locker;
1117 status_t status = locker.SetTo(VMAddressSpace::KernelID());
1118 if (status != B_OK)
1119 return status;
1121 VMAddressSpace* addressSpace = locker.AddressSpace();
1123 // create an anonymous cache
1124 VMCache* cache;
1125 status = VMCacheFactory::CreateAnonymousCache(cache, false, 0, 0, false,
1126 VM_PRIORITY_SYSTEM);
1127 if (status != B_OK)
1128 return status;
1130 cache->temporary = 1;
1131 cache->virtual_end = size;
1132 cache->Lock();
1134 VMArea* area;
1135 virtual_address_restrictions addressRestrictions = {};
1136 addressRestrictions.address = address;
1137 addressRestrictions.address_specification = B_EXACT_ADDRESS;
1138 status = map_backing_store(addressSpace, cache, 0, name, size,
1139 B_ALREADY_WIRED, 0, REGION_NO_PRIVATE_MAP, 0, &addressRestrictions,
1140 true, &area, NULL);
1141 if (status != B_OK) {
1142 cache->ReleaseRefAndUnlock();
1143 return status;
1146 cache->Unlock();
1147 area->cache_type = CACHE_TYPE_RAM;
1148 return area->id;
1152 status_t
1153 vm_unreserve_address_range(team_id team, void* address, addr_t size)
1155 AddressSpaceWriteLocker locker(team);
1156 if (!locker.IsLocked())
1157 return B_BAD_TEAM_ID;
1159 VMAddressSpace* addressSpace = locker.AddressSpace();
1160 return addressSpace->UnreserveAddressRange((addr_t)address, size,
1161 addressSpace == VMAddressSpace::Kernel()
1162 ? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0);
1166 status_t
1167 vm_reserve_address_range(team_id team, void** _address, uint32 addressSpec,
1168 addr_t size, uint32 flags)
1170 if (size == 0)
1171 return B_BAD_VALUE;
1173 AddressSpaceWriteLocker locker(team);
1174 if (!locker.IsLocked())
1175 return B_BAD_TEAM_ID;
1177 virtual_address_restrictions addressRestrictions = {};
1178 addressRestrictions.address = *_address;
1179 addressRestrictions.address_specification = addressSpec;
1180 VMAddressSpace* addressSpace = locker.AddressSpace();
1181 return addressSpace->ReserveAddressRange(size, &addressRestrictions, flags,
1182 addressSpace == VMAddressSpace::Kernel()
1183 ? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0,
1184 _address);
1188 area_id
1189 vm_create_anonymous_area(team_id team, const char *name, addr_t size,
1190 uint32 wiring, uint32 protection, uint32 flags, addr_t guardSize,
1191 const virtual_address_restrictions* virtualAddressRestrictions,
1192 const physical_address_restrictions* physicalAddressRestrictions,
1193 bool kernel, void** _address)
1195 VMArea* area;
1196 VMCache* cache;
1197 vm_page* page = NULL;
1198 bool isStack = (protection & B_STACK_AREA) != 0;
1199 page_num_t guardPages;
1200 bool canOvercommit = false;
1201 uint32 pageAllocFlags = (flags & CREATE_AREA_DONT_CLEAR) == 0
1202 ? VM_PAGE_ALLOC_CLEAR : 0;
1204 TRACE(("create_anonymous_area [%" B_PRId32 "] %s: size 0x%" B_PRIxADDR "\n",
1205 team, name, size));
1207 size = PAGE_ALIGN(size);
1208 guardSize = PAGE_ALIGN(guardSize);
1209 guardPages = guardSize / B_PAGE_SIZE;
1211 if (size == 0 || size < guardSize)
1212 return B_BAD_VALUE;
1213 if (!arch_vm_supports_protection(protection))
1214 return B_NOT_SUPPORTED;
1216 if (isStack || (protection & B_OVERCOMMITTING_AREA) != 0)
1217 canOvercommit = true;
1219 #ifdef DEBUG_KERNEL_STACKS
1220 if ((protection & B_KERNEL_STACK_AREA) != 0)
1221 isStack = true;
1222 #endif
1224 // check parameters
1225 switch (virtualAddressRestrictions->address_specification) {
1226 case B_ANY_ADDRESS:
1227 case B_EXACT_ADDRESS:
1228 case B_BASE_ADDRESS:
1229 case B_ANY_KERNEL_ADDRESS:
1230 case B_ANY_KERNEL_BLOCK_ADDRESS:
1231 case B_RANDOMIZED_ANY_ADDRESS:
1232 case B_RANDOMIZED_BASE_ADDRESS:
1233 break;
1235 default:
1236 return B_BAD_VALUE;
1239 // If low or high physical address restrictions are given, we force
1240 // B_CONTIGUOUS wiring, since only then we'll use
1241 // vm_page_allocate_page_run() which deals with those restrictions.
1242 if (physicalAddressRestrictions->low_address != 0
1243 || physicalAddressRestrictions->high_address != 0) {
1244 wiring = B_CONTIGUOUS;
1247 physical_address_restrictions stackPhysicalRestrictions;
1248 bool doReserveMemory = false;
1249 switch (wiring) {
1250 case B_NO_LOCK:
1251 break;
1252 case B_FULL_LOCK:
1253 case B_LAZY_LOCK:
1254 case B_CONTIGUOUS:
1255 doReserveMemory = true;
1256 break;
1257 case B_ALREADY_WIRED:
1258 break;
1259 case B_LOMEM:
1260 stackPhysicalRestrictions = *physicalAddressRestrictions;
1261 stackPhysicalRestrictions.high_address = 16 * 1024 * 1024;
1262 physicalAddressRestrictions = &stackPhysicalRestrictions;
1263 wiring = B_CONTIGUOUS;
1264 doReserveMemory = true;
1265 break;
1266 case B_32_BIT_FULL_LOCK:
1267 if (B_HAIKU_PHYSICAL_BITS <= 32
1268 || (uint64)vm_page_max_address() < (uint64)1 << 32) {
1269 wiring = B_FULL_LOCK;
1270 doReserveMemory = true;
1271 break;
1273 // TODO: We don't really support this mode efficiently. Just fall
1274 // through for now ...
1275 case B_32_BIT_CONTIGUOUS:
1276 #if B_HAIKU_PHYSICAL_BITS > 32
1277 if (vm_page_max_address() >= (phys_addr_t)1 << 32) {
1278 stackPhysicalRestrictions = *physicalAddressRestrictions;
1279 stackPhysicalRestrictions.high_address
1280 = (phys_addr_t)1 << 32;
1281 physicalAddressRestrictions = &stackPhysicalRestrictions;
1283 #endif
1284 wiring = B_CONTIGUOUS;
1285 doReserveMemory = true;
1286 break;
1287 default:
1288 return B_BAD_VALUE;
1291 // Optimization: For a single-page contiguous allocation without low/high
1292 // memory restriction B_FULL_LOCK wiring suffices.
1293 if (wiring == B_CONTIGUOUS && size == B_PAGE_SIZE
1294 && physicalAddressRestrictions->low_address == 0
1295 && physicalAddressRestrictions->high_address == 0) {
1296 wiring = B_FULL_LOCK;
1299 // For full lock or contiguous areas we're also going to map the pages and
1300 // thus need to reserve pages for the mapping backend upfront.
1301 addr_t reservedMapPages = 0;
1302 if (wiring == B_FULL_LOCK || wiring == B_CONTIGUOUS) {
1303 AddressSpaceWriteLocker locker;
1304 status_t status = locker.SetTo(team);
1305 if (status != B_OK)
1306 return status;
1308 VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1309 reservedMapPages = map->MaxPagesNeededToMap(0, size - 1);
1312 int priority;
1313 if (team != VMAddressSpace::KernelID())
1314 priority = VM_PRIORITY_USER;
1315 else if ((flags & CREATE_AREA_PRIORITY_VIP) != 0)
1316 priority = VM_PRIORITY_VIP;
1317 else
1318 priority = VM_PRIORITY_SYSTEM;
1320 // Reserve memory before acquiring the address space lock. This reduces the
1321 // chances of failure, since while holding the write lock to the address
1322 // space (if it is the kernel address space that is), the low memory handler
1323 // won't be able to free anything for us.
1324 addr_t reservedMemory = 0;
1325 if (doReserveMemory) {
1326 bigtime_t timeout = (flags & CREATE_AREA_DONT_WAIT) != 0 ? 0 : 1000000;
1327 if (vm_try_reserve_memory(size, priority, timeout) != B_OK)
1328 return B_NO_MEMORY;
1329 reservedMemory = size;
1330 // TODO: We don't reserve the memory for the pages for the page
1331 // directories/tables. We actually need to do so, since we currently don't
1332 // reclaim them (and probably can't reclaim all of them anyway). Thus
1333 // there are actually fewer physical pages than there should be, which
1334 // can get the VM into trouble in low memory situations.
1337 AddressSpaceWriteLocker locker;
1338 VMAddressSpace* addressSpace;
1339 status_t status;
1341 // For full lock areas reserve the pages before locking the address
1342 // space. E.g. block caches can't release their memory while we hold the
1343 // address space lock.
1344 page_num_t reservedPages = reservedMapPages;
1345 if (wiring == B_FULL_LOCK)
1346 reservedPages += size / B_PAGE_SIZE;
1348 vm_page_reservation reservation;
1349 if (reservedPages > 0) {
1350 if ((flags & CREATE_AREA_DONT_WAIT) != 0) {
1351 if (!vm_page_try_reserve_pages(&reservation, reservedPages,
1352 priority)) {
1353 reservedPages = 0;
1354 status = B_WOULD_BLOCK;
1355 goto err0;
1357 } else
1358 vm_page_reserve_pages(&reservation, reservedPages, priority);
1361 if (wiring == B_CONTIGUOUS) {
1362 // we try to allocate the page run here upfront as this may easily
1363 // fail for obvious reasons
1364 page = vm_page_allocate_page_run(PAGE_STATE_WIRED | pageAllocFlags,
1365 size / B_PAGE_SIZE, physicalAddressRestrictions, priority);
1366 if (page == NULL) {
1367 status = B_NO_MEMORY;
1368 goto err0;
1372 // Lock the address space and, if B_EXACT_ADDRESS and
1373 // CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
1374 // is not wired.
1375 do {
1376 status = locker.SetTo(team);
1377 if (status != B_OK)
1378 goto err1;
1380 addressSpace = locker.AddressSpace();
1381 } while (virtualAddressRestrictions->address_specification
1382 == B_EXACT_ADDRESS
1383 && (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
1384 && wait_if_address_range_is_wired(addressSpace,
1385 (addr_t)virtualAddressRestrictions->address, size, &locker));
1387 // create an anonymous cache
1388 // if it's a stack, make sure that at least two pages are available
1389 status = VMCacheFactory::CreateAnonymousCache(cache, canOvercommit,
1390 isStack ? (min_c(2, size / B_PAGE_SIZE - guardPages)) : 0, guardPages,
1391 wiring == B_NO_LOCK, priority);
1392 if (status != B_OK)
1393 goto err1;
1395 cache->temporary = 1;
1396 cache->virtual_end = size;
1397 cache->committed_size = reservedMemory;
1398 // TODO: This should be done via a method.
1399 reservedMemory = 0;
1401 cache->Lock();
1403 status = map_backing_store(addressSpace, cache, 0, name, size, wiring,
1404 protection, REGION_NO_PRIVATE_MAP, flags, virtualAddressRestrictions,
1405 kernel, &area, _address);
1407 if (status != B_OK) {
1408 cache->ReleaseRefAndUnlock();
1409 goto err1;
1412 locker.DegradeToReadLock();
1414 switch (wiring) {
1415 case B_NO_LOCK:
1416 case B_LAZY_LOCK:
1417 // do nothing - the pages are mapped in as needed
1418 break;
1420 case B_FULL_LOCK:
1422 // Allocate and map all pages for this area
1424 off_t offset = 0;
1425 for (addr_t address = area->Base();
1426 address < area->Base() + (area->Size() - 1);
1427 address += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1428 #ifdef DEBUG_KERNEL_STACKS
1429 # ifdef STACK_GROWS_DOWNWARDS
1430 if (isStack && address < area->Base()
1431 + KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1432 # else
1433 if (isStack && address >= area->Base() + area->Size()
1434 - KERNEL_STACK_GUARD_PAGES * B_PAGE_SIZE)
1435 # endif
1436 continue;
1437 #endif
1438 vm_page* page = vm_page_allocate_page(&reservation,
1439 PAGE_STATE_WIRED | pageAllocFlags);
1440 cache->InsertPage(page, offset);
1441 map_page(area, page, address, protection, &reservation);
1443 DEBUG_PAGE_ACCESS_END(page);
1446 break;
1449 case B_ALREADY_WIRED:
1451 // The pages should already be mapped. This is only really useful
1452 // during boot time. Find the appropriate vm_page objects and stick
1453 // them in the cache object.
1454 VMTranslationMap* map = addressSpace->TranslationMap();
1455 off_t offset = 0;
1457 if (!gKernelStartup)
1458 panic("ALREADY_WIRED flag used outside kernel startup\n");
1460 map->Lock();
1462 for (addr_t virtualAddress = area->Base();
1463 virtualAddress < area->Base() + (area->Size() - 1);
1464 virtualAddress += B_PAGE_SIZE, offset += B_PAGE_SIZE) {
1465 phys_addr_t physicalAddress;
1466 uint32 flags;
1467 status = map->Query(virtualAddress, &physicalAddress, &flags);
1468 if (status < B_OK) {
1469 panic("looking up mapping failed for va 0x%lx\n",
1470 virtualAddress);
1472 page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1473 if (page == NULL) {
1474 panic("looking up page failed for pa %#" B_PRIxPHYSADDR
1475 "\n", physicalAddress);
1478 DEBUG_PAGE_ACCESS_START(page);
1480 cache->InsertPage(page, offset);
1481 increment_page_wired_count(page);
1482 vm_page_set_state(page, PAGE_STATE_WIRED);
1483 page->busy = false;
1485 DEBUG_PAGE_ACCESS_END(page);
1488 map->Unlock();
1489 break;
1492 case B_CONTIGUOUS:
1494 // We have already allocated our contiguous page run, so we can now
1495 // just map the pages in the address space
1496 VMTranslationMap* map = addressSpace->TranslationMap();
1497 phys_addr_t physicalAddress
1498 = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
1499 addr_t virtualAddress = area->Base();
1500 off_t offset = 0;
1502 map->Lock();
1504 for (virtualAddress = area->Base(); virtualAddress < area->Base()
1505 + (area->Size() - 1); virtualAddress += B_PAGE_SIZE,
1506 offset += B_PAGE_SIZE, physicalAddress += B_PAGE_SIZE) {
1507 page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
1508 if (page == NULL)
1509 panic("couldn't lookup physical page just allocated\n");
1511 status = map->Map(virtualAddress, physicalAddress, protection,
1512 area->MemoryType(), &reservation);
1513 if (status < B_OK)
1514 panic("couldn't map physical page in page run\n");
1516 cache->InsertPage(page, offset);
1517 increment_page_wired_count(page);
1519 DEBUG_PAGE_ACCESS_END(page);
1522 map->Unlock();
1523 break;
1526 default:
1527 break;
1530 cache->Unlock();
1532 if (reservedPages > 0)
1533 vm_page_unreserve_pages(&reservation);
1535 TRACE(("vm_create_anonymous_area: done\n"));
1537 area->cache_type = CACHE_TYPE_RAM;
1538 return area->id;
1540 err1:
1541 if (wiring == B_CONTIGUOUS) {
1542 // we had reserved the area space upfront...
1543 phys_addr_t pageNumber = page->physical_page_number;
1544 int32 i;
1545 for (i = size / B_PAGE_SIZE; i-- > 0; pageNumber++) {
1546 page = vm_lookup_page(pageNumber);
1547 if (page == NULL)
1548 panic("couldn't lookup physical page just allocated\n");
1550 vm_page_set_state(page, PAGE_STATE_FREE);
1554 err0:
1555 if (reservedPages > 0)
1556 vm_page_unreserve_pages(&reservation);
1557 if (reservedMemory > 0)
1558 vm_unreserve_memory(reservedMemory);
1560 return status;
1564 area_id
1565 vm_map_physical_memory(team_id team, const char* name, void** _address,
1566 uint32 addressSpec, addr_t size, uint32 protection,
1567 phys_addr_t physicalAddress, bool alreadyWired)
1569 VMArea* area;
1570 VMCache* cache;
1571 addr_t mapOffset;
1573 TRACE(("vm_map_physical_memory(aspace = %" B_PRId32 ", \"%s\", virtual = %p"
1574 ", spec = %" B_PRIu32 ", size = %" B_PRIxADDR ", protection = %"
1575 B_PRIu32 ", phys = %#" B_PRIxPHYSADDR ")\n", team, name, *_address,
1576 addressSpec, size, protection, physicalAddress));
1578 if (!arch_vm_supports_protection(protection))
1579 return B_NOT_SUPPORTED;
1581 AddressSpaceWriteLocker locker(team);
1582 if (!locker.IsLocked())
1583 return B_BAD_TEAM_ID;
1585 // if the physical address is not page aligned,
1586 // move the area down to start on a page boundary
1587 mapOffset = physicalAddress % B_PAGE_SIZE;
1588 size += mapOffset;
1589 physicalAddress -= mapOffset;
1591 size = PAGE_ALIGN(size);
1593 // create a device cache
1594 status_t status = VMCacheFactory::CreateDeviceCache(cache, physicalAddress);
1595 if (status != B_OK)
1596 return status;
1598 cache->virtual_end = size;
1600 cache->Lock();
1602 virtual_address_restrictions addressRestrictions = {};
1603 addressRestrictions.address = *_address;
1604 addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1605 status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
1606 B_FULL_LOCK, protection, REGION_NO_PRIVATE_MAP, 0, &addressRestrictions,
1607 true, &area, _address);
1609 if (status < B_OK)
1610 cache->ReleaseRefLocked();
1612 cache->Unlock();
1614 if (status == B_OK) {
1615 // set requested memory type -- use uncached, if not given
1616 uint32 memoryType = addressSpec & B_MTR_MASK;
1617 if (memoryType == 0)
1618 memoryType = B_MTR_UC;
1620 area->SetMemoryType(memoryType);
1622 status = arch_vm_set_memory_type(area, physicalAddress, memoryType);
1623 if (status != B_OK)
1624 delete_area(locker.AddressSpace(), area, false);
1627 if (status != B_OK)
1628 return status;
1630 VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1632 if (alreadyWired) {
1633 // The area is already mapped, but possibly not with the right
1634 // memory type.
1635 map->Lock();
1636 map->ProtectArea(area, area->protection);
1637 map->Unlock();
1638 } else {
1639 // Map the area completely.
1641 // reserve pages needed for the mapping
1642 size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
1643 area->Base() + (size - 1));
1644 vm_page_reservation reservation;
1645 vm_page_reserve_pages(&reservation, reservePages,
1646 team == VMAddressSpace::KernelID()
1647 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1649 map->Lock();
1651 for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
1652 map->Map(area->Base() + offset, physicalAddress + offset,
1653 protection, area->MemoryType(), &reservation);
1656 map->Unlock();
1658 vm_page_unreserve_pages(&reservation);
1661 // modify the returned pointer to be offset back into the new area
1662 // the same way the physical address passed in was offset
1663 *_address = (void*)((addr_t)*_address + mapOffset);
1665 area->cache_type = CACHE_TYPE_DEVICE;
1666 return area->id;
1670 /*! Don't use!
1671 TODO: This function was introduced to map physical page vecs to
1672 contiguous virtual memory in IOBuffer::GetNextVirtualVec(). It does
1673 use a device cache and does not track vm_page::wired_count!
1675 area_id
1676 vm_map_physical_memory_vecs(team_id team, const char* name, void** _address,
1677 uint32 addressSpec, addr_t* _size, uint32 protection,
1678 struct generic_io_vec* vecs, uint32 vecCount)
1680 TRACE(("vm_map_physical_memory_vecs(team = %" B_PRId32 ", \"%s\", virtual "
1681 "= %p, spec = %" B_PRIu32 ", _size = %p, protection = %" B_PRIu32 ", "
1682 "vecs = %p, vecCount = %" B_PRIu32 ")\n", team, name, *_address,
1683 addressSpec, _size, protection, vecs, vecCount));
1685 if (!arch_vm_supports_protection(protection)
1686 || (addressSpec & B_MTR_MASK) != 0) {
1687 return B_NOT_SUPPORTED;
1690 AddressSpaceWriteLocker locker(team);
1691 if (!locker.IsLocked())
1692 return B_BAD_TEAM_ID;
1694 if (vecCount == 0)
1695 return B_BAD_VALUE;
1697 addr_t size = 0;
1698 for (uint32 i = 0; i < vecCount; i++) {
1699 if (vecs[i].base % B_PAGE_SIZE != 0
1700 || vecs[i].length % B_PAGE_SIZE != 0) {
1701 return B_BAD_VALUE;
1704 size += vecs[i].length;
1707 // create a device cache
1708 VMCache* cache;
1709 status_t result = VMCacheFactory::CreateDeviceCache(cache, vecs[0].base);
1710 if (result != B_OK)
1711 return result;
1713 cache->virtual_end = size;
1715 cache->Lock();
1717 VMArea* area;
1718 virtual_address_restrictions addressRestrictions = {};
1719 addressRestrictions.address = *_address;
1720 addressRestrictions.address_specification = addressSpec & ~B_MTR_MASK;
1721 result = map_backing_store(locker.AddressSpace(), cache, 0, name,
1722 size, B_FULL_LOCK, protection, REGION_NO_PRIVATE_MAP, 0,
1723 &addressRestrictions, true, &area, _address);
1725 if (result != B_OK)
1726 cache->ReleaseRefLocked();
1728 cache->Unlock();
1730 if (result != B_OK)
1731 return result;
1733 VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1734 size_t reservePages = map->MaxPagesNeededToMap(area->Base(),
1735 area->Base() + (size - 1));
1737 vm_page_reservation reservation;
1738 vm_page_reserve_pages(&reservation, reservePages,
1739 team == VMAddressSpace::KernelID()
1740 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1741 map->Lock();
1743 uint32 vecIndex = 0;
1744 size_t vecOffset = 0;
1745 for (addr_t offset = 0; offset < size; offset += B_PAGE_SIZE) {
1746 while (vecOffset >= vecs[vecIndex].length && vecIndex < vecCount) {
1747 vecOffset = 0;
1748 vecIndex++;
1751 if (vecIndex >= vecCount)
1752 break;
1754 map->Map(area->Base() + offset, vecs[vecIndex].base + vecOffset,
1755 protection, area->MemoryType(), &reservation);
1757 vecOffset += B_PAGE_SIZE;
1760 map->Unlock();
1761 vm_page_unreserve_pages(&reservation);
1763 if (_size != NULL)
1764 *_size = size;
1766 area->cache_type = CACHE_TYPE_DEVICE;
1767 return area->id;
1771 area_id
1772 vm_create_null_area(team_id team, const char* name, void** address,
1773 uint32 addressSpec, addr_t size, uint32 flags)
1775 size = PAGE_ALIGN(size);
1777 // Lock the address space and, if B_EXACT_ADDRESS and
1778 // CREATE_AREA_UNMAP_ADDRESS_RANGE were specified, ensure the address range
1779 // is not wired.
1780 AddressSpaceWriteLocker locker;
1781 do {
1782 if (locker.SetTo(team) != B_OK)
1783 return B_BAD_TEAM_ID;
1784 } while (addressSpec == B_EXACT_ADDRESS
1785 && (flags & CREATE_AREA_UNMAP_ADDRESS_RANGE) != 0
1786 && wait_if_address_range_is_wired(locker.AddressSpace(),
1787 (addr_t)*address, size, &locker));
1789 // create a null cache
1790 int priority = (flags & CREATE_AREA_PRIORITY_VIP) != 0
1791 ? VM_PRIORITY_VIP : VM_PRIORITY_SYSTEM;
1792 VMCache* cache;
1793 status_t status = VMCacheFactory::CreateNullCache(priority, cache);
1794 if (status != B_OK)
1795 return status;
1797 cache->temporary = 1;
1798 cache->virtual_end = size;
1800 cache->Lock();
1802 VMArea* area;
1803 virtual_address_restrictions addressRestrictions = {};
1804 addressRestrictions.address = *address;
1805 addressRestrictions.address_specification = addressSpec;
1806 status = map_backing_store(locker.AddressSpace(), cache, 0, name, size,
1807 B_LAZY_LOCK, B_KERNEL_READ_AREA, REGION_NO_PRIVATE_MAP, flags,
1808 &addressRestrictions, true, &area, address);
1810 if (status < B_OK) {
1811 cache->ReleaseRefAndUnlock();
1812 return status;
1815 cache->Unlock();
1817 area->cache_type = CACHE_TYPE_NULL;
1818 return area->id;
1822 /*! Creates the vnode cache for the specified \a vnode.
1823 The vnode has to be marked busy when calling this function.
1825 status_t
1826 vm_create_vnode_cache(struct vnode* vnode, struct VMCache** cache)
1828 return VMCacheFactory::CreateVnodeCache(*cache, vnode);
1832 /*! \a cache must be locked. The area's address space must be read-locked.
1834 static void
1835 pre_map_area_pages(VMArea* area, VMCache* cache,
1836 vm_page_reservation* reservation)
1838 addr_t baseAddress = area->Base();
1839 addr_t cacheOffset = area->cache_offset;
1840 page_num_t firstPage = cacheOffset / B_PAGE_SIZE;
1841 page_num_t endPage = firstPage + area->Size() / B_PAGE_SIZE;
1843 for (VMCachePagesTree::Iterator it
1844 = cache->pages.GetIterator(firstPage, true, true);
1845 vm_page* page = it.Next();) {
1846 if (page->cache_offset >= endPage)
1847 break;
1849 // skip busy and inactive pages
1850 if (page->busy || page->usage_count == 0)
1851 continue;
1853 DEBUG_PAGE_ACCESS_START(page);
1854 map_page(area, page,
1855 baseAddress + (page->cache_offset * B_PAGE_SIZE - cacheOffset),
1856 B_READ_AREA | B_KERNEL_READ_AREA, reservation);
1857 DEBUG_PAGE_ACCESS_END(page);
1862 /*! Will map the file specified by \a fd to an area in memory.
1863 The file will be mirrored beginning at the specified \a offset. The
1864 \a offset and \a size arguments have to be page aligned.
1866 static area_id
1867 _vm_map_file(team_id team, const char* name, void** _address,
1868 uint32 addressSpec, size_t size, uint32 protection, uint32 mapping,
1869 bool unmapAddressRange, int fd, off_t offset, bool kernel)
1871 // TODO: for binary files, we want to make sure that they get a
1872 // snapshot of the file at a given time, i.e. later changes should not
1873 // make it into the mapped copy -- this will need quite some changes
1874 // to be done in a nice way
1875 TRACE(("_vm_map_file(fd = %d, offset = %" B_PRIdOFF ", size = %lu, mapping "
1876 "%" B_PRIu32 ")\n", fd, offset, size, mapping));
1878 offset = ROUNDDOWN(offset, B_PAGE_SIZE);
1879 size = PAGE_ALIGN(size);
1881 if (mapping == REGION_NO_PRIVATE_MAP)
1882 protection |= B_SHARED_AREA;
1883 if (addressSpec != B_EXACT_ADDRESS)
1884 unmapAddressRange = false;
1886 if (fd < 0) {
1887 uint32 flags = unmapAddressRange ? CREATE_AREA_UNMAP_ADDRESS_RANGE : 0;
1888 virtual_address_restrictions virtualRestrictions = {};
1889 virtualRestrictions.address = *_address;
1890 virtualRestrictions.address_specification = addressSpec;
1891 physical_address_restrictions physicalRestrictions = {};
1892 return vm_create_anonymous_area(team, name, size, B_NO_LOCK, protection,
1893 flags, 0, &virtualRestrictions, &physicalRestrictions, kernel,
1894 _address);
1897 // get the open flags of the FD
1898 file_descriptor* descriptor = get_fd(get_current_io_context(kernel), fd);
1899 if (descriptor == NULL)
1900 return EBADF;
1901 int32 openMode = descriptor->open_mode;
1902 put_fd(descriptor);
1904 // The FD must be open for reading at any rate. For a shared mapping with
1905 // write access, the FD must additionally be open for writing.
1906 if ((openMode & O_ACCMODE) == O_WRONLY
1907 || (mapping == REGION_NO_PRIVATE_MAP
1908 && (protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0
1909 && (openMode & O_ACCMODE) == O_RDONLY)) {
1910 return EACCES;
1913 // get the vnode for the object, this also grabs a ref to it
1914 struct vnode* vnode = NULL;
1915 status_t status = vfs_get_vnode_from_fd(fd, kernel, &vnode);
1916 if (status < B_OK)
1917 return status;
1918 CObjectDeleter<struct vnode> vnodePutter(vnode, vfs_put_vnode);
1920 // If we're going to pre-map pages, we need to reserve the pages needed by
1921 // the mapping backend upfront.
1922 page_num_t reservedPreMapPages = 0;
1923 vm_page_reservation reservation;
1924 if ((protection & B_READ_AREA) != 0) {
1925 AddressSpaceWriteLocker locker;
1926 status = locker.SetTo(team);
1927 if (status != B_OK)
1928 return status;
1930 VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
1931 reservedPreMapPages = map->MaxPagesNeededToMap(0, size - 1);
1933 locker.Unlock();
1935 vm_page_reserve_pages(&reservation, reservedPreMapPages,
1936 team == VMAddressSpace::KernelID()
1937 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
1940 struct PageUnreserver {
1941 PageUnreserver(vm_page_reservation* reservation)
1943 fReservation(reservation)
1947 ~PageUnreserver()
1949 if (fReservation != NULL)
1950 vm_page_unreserve_pages(fReservation);
1953 vm_page_reservation* fReservation;
1954 } pageUnreserver(reservedPreMapPages > 0 ? &reservation : NULL);
1956 // Lock the address space and, if the specified address range shall be
1957 // unmapped, ensure it is not wired.
1958 AddressSpaceWriteLocker locker;
1959 do {
1960 if (locker.SetTo(team) != B_OK)
1961 return B_BAD_TEAM_ID;
1962 } while (unmapAddressRange
1963 && wait_if_address_range_is_wired(locker.AddressSpace(),
1964 (addr_t)*_address, size, &locker));
1966 // TODO: this only works for file systems that use the file cache
1967 VMCache* cache;
1968 status = vfs_get_vnode_cache(vnode, &cache, false);
1969 if (status < B_OK)
1970 return status;
1972 cache->Lock();
1974 VMArea* area;
1975 virtual_address_restrictions addressRestrictions = {};
1976 addressRestrictions.address = *_address;
1977 addressRestrictions.address_specification = addressSpec;
1978 status = map_backing_store(locker.AddressSpace(), cache, offset, name, size,
1979 0, protection, mapping,
1980 unmapAddressRange ? CREATE_AREA_UNMAP_ADDRESS_RANGE : 0,
1981 &addressRestrictions, kernel, &area, _address);
1983 if (status != B_OK || mapping == REGION_PRIVATE_MAP) {
1984 // map_backing_store() cannot know we no longer need the ref
1985 cache->ReleaseRefLocked();
1988 if (status == B_OK && (protection & B_READ_AREA) != 0)
1989 pre_map_area_pages(area, cache, &reservation);
1991 cache->Unlock();
1993 if (status == B_OK) {
1994 // TODO: this probably deserves a smarter solution, i.e. don't always
1995 // prefetch stuff, and also, probably don't trigger it at this place.
1996 cache_prefetch_vnode(vnode, offset, min_c(size, 10LL * 1024 * 1024));
1997 // prefetches at most 10 MB starting from "offset"
2000 if (status != B_OK)
2001 return status;
2003 area->cache_type = CACHE_TYPE_VNODE;
2004 return area->id;
2008 area_id
2009 vm_map_file(team_id aid, const char* name, void** address, uint32 addressSpec,
2010 addr_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
2011 int fd, off_t offset)
2013 if (!arch_vm_supports_protection(protection))
2014 return B_NOT_SUPPORTED;
2016 return _vm_map_file(aid, name, address, addressSpec, size, protection,
2017 mapping, unmapAddressRange, fd, offset, true);
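// Illustrative sketch (not part of the original source): a kernel caller
// could map an already open file read-only into the kernel address space
// roughly like this; "fd" and "fileSize" are hypothetical, and both offset
// and size have to be page aligned as documented above.
//
//   void* address = NULL;
//   area_id area = vm_map_file(VMAddressSpace::KernelID(), "mapped file",
//       &address, B_ANY_KERNEL_ADDRESS, fileSize, B_KERNEL_READ_AREA,
//       REGION_NO_PRIVATE_MAP, false, fd, 0);
//   if (area < 0)
//       dprintf("vm_map_file() failed: %s\n", strerror(area));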
2021 VMCache*
2022 vm_area_get_locked_cache(VMArea* area)
2024 rw_lock_read_lock(&sAreaCacheLock);
2026 while (true) {
2027 VMCache* cache = area->cache;
2029 if (!cache->SwitchFromReadLock(&sAreaCacheLock)) {
2030 // cache has been deleted
2031 rw_lock_read_lock(&sAreaCacheLock);
2032 continue;
2035 rw_lock_read_lock(&sAreaCacheLock);
2037 if (cache == area->cache) {
2038 cache->AcquireRefLocked();
2039 rw_lock_read_unlock(&sAreaCacheLock);
2040 return cache;
2043 // the cache changed in the meantime
2044 cache->Unlock();
2049 void
2050 vm_area_put_locked_cache(VMCache* cache)
2052 cache->ReleaseRefAndUnlock();
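// Illustrative sketch (not part of the original source): the two functions
// above are meant to be used as a pair -- the returned cache comes back
// locked and referenced, and vm_area_put_locked_cache() releases both again
// ("area" is assumed to be a valid, looked-up VMArea):
//
//   VMCache* cache = vm_area_get_locked_cache(area);
//   // ... inspect or modify the cache while it is locked ...
//   vm_area_put_locked_cache(cache);
//
// The AreaCacheLocker used elsewhere in this file wraps the same pattern in
// a RAII helper.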
2056 area_id
2057 vm_clone_area(team_id team, const char* name, void** address,
2058 uint32 addressSpec, uint32 protection, uint32 mapping, area_id sourceID,
2059 bool kernel)
2061 VMArea* newArea = NULL;
2062 VMArea* sourceArea;
2064 // Check whether the source area exists and is cloneable. If so, mark it
2065 // B_SHARED_AREA, so that we don't get problems with copy-on-write.
2067 AddressSpaceWriteLocker locker;
2068 status_t status = locker.SetFromArea(sourceID, sourceArea);
2069 if (status != B_OK)
2070 return status;
2072 if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2073 return B_NOT_ALLOWED;
2075 sourceArea->protection |= B_SHARED_AREA;
2076 protection |= B_SHARED_AREA;
2079 // Now lock both address spaces and actually do the cloning.
2081 MultiAddressSpaceLocker locker;
2082 VMAddressSpace* sourceAddressSpace;
2083 status_t status = locker.AddArea(sourceID, false, &sourceAddressSpace);
2084 if (status != B_OK)
2085 return status;
2087 VMAddressSpace* targetAddressSpace;
2088 status = locker.AddTeam(team, true, &targetAddressSpace);
2089 if (status != B_OK)
2090 return status;
2092 status = locker.Lock();
2093 if (status != B_OK)
2094 return status;
2096 sourceArea = lookup_area(sourceAddressSpace, sourceID);
2097 if (sourceArea == NULL)
2098 return B_BAD_VALUE;
2100 if (!kernel && (sourceArea->protection & B_KERNEL_AREA) != 0)
2101 return B_NOT_ALLOWED;
2103 VMCache* cache = vm_area_get_locked_cache(sourceArea);
2105 // TODO: for now, B_USER_CLONEABLE is disabled, until all drivers
2106 // have been adapted. Maybe it should be part of the kernel settings,
2107 // anyway (so that old drivers can always work).
2108 #if 0
2109 if (sourceArea->aspace == VMAddressSpace::Kernel()
2110 && addressSpace != VMAddressSpace::Kernel()
2111 && !(sourceArea->protection & B_USER_CLONEABLE_AREA)) {
2112 // kernel areas must not be cloned in userland, unless explicitly
2113 // declared user-cloneable upon construction
2114 status = B_NOT_ALLOWED;
2115 } else
2116 #endif
2117 if (sourceArea->cache_type == CACHE_TYPE_NULL)
2118 status = B_NOT_ALLOWED;
2119 else {
2120 virtual_address_restrictions addressRestrictions = {};
2121 addressRestrictions.address = *address;
2122 addressRestrictions.address_specification = addressSpec;
2123 status = map_backing_store(targetAddressSpace, cache,
2124 sourceArea->cache_offset, name, sourceArea->Size(),
2125 sourceArea->wiring, protection, mapping, 0, &addressRestrictions,
2126 kernel, &newArea, address);
2128 if (status == B_OK && mapping != REGION_PRIVATE_MAP) {
2129 // If the mapping is REGION_PRIVATE_MAP, map_backing_store() needed
2130 // to create a new cache, and has therefore already acquired a reference
2131 // to the source cache - but otherwise it has no idea that we need
2132 // one.
2133 cache->AcquireRefLocked();
2135 if (status == B_OK && newArea->wiring == B_FULL_LOCK) {
2136 // we need to map in everything at this point
2137 if (sourceArea->cache_type == CACHE_TYPE_DEVICE) {
2138 // we don't have actual pages to map but a physical area
2139 VMTranslationMap* map
2140 = sourceArea->address_space->TranslationMap();
2141 map->Lock();
2143 phys_addr_t physicalAddress;
2144 uint32 oldProtection;
2145 map->Query(sourceArea->Base(), &physicalAddress, &oldProtection);
2147 map->Unlock();
2149 map = targetAddressSpace->TranslationMap();
2150 size_t reservePages = map->MaxPagesNeededToMap(newArea->Base(),
2151 newArea->Base() + (newArea->Size() - 1));
2153 vm_page_reservation reservation;
2154 vm_page_reserve_pages(&reservation, reservePages,
2155 targetAddressSpace == VMAddressSpace::Kernel()
2156 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2157 map->Lock();
2159 for (addr_t offset = 0; offset < newArea->Size();
2160 offset += B_PAGE_SIZE) {
2161 map->Map(newArea->Base() + offset, physicalAddress + offset,
2162 protection, newArea->MemoryType(), &reservation);
2165 map->Unlock();
2166 vm_page_unreserve_pages(&reservation);
2167 } else {
2168 VMTranslationMap* map = targetAddressSpace->TranslationMap();
2169 size_t reservePages = map->MaxPagesNeededToMap(
2170 newArea->Base(), newArea->Base() + (newArea->Size() - 1));
2171 vm_page_reservation reservation;
2172 vm_page_reserve_pages(&reservation, reservePages,
2173 targetAddressSpace == VMAddressSpace::Kernel()
2174 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2176 // map in all pages from source
2177 for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
2178 vm_page* page = it.Next();) {
2179 if (!page->busy) {
2180 DEBUG_PAGE_ACCESS_START(page);
2181 map_page(newArea, page,
2182 newArea->Base() + ((page->cache_offset << PAGE_SHIFT)
2183 - newArea->cache_offset),
2184 protection, &reservation);
2185 DEBUG_PAGE_ACCESS_END(page);
2188 // TODO: B_FULL_LOCK means that all pages are locked. We are not
2189 // ensuring that!
2191 vm_page_unreserve_pages(&reservation);
2194 if (status == B_OK)
2195 newArea->cache_type = sourceArea->cache_type;
2197 vm_area_put_locked_cache(cache);
2199 if (status < B_OK)
2200 return status;
2202 return newArea->id;
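// Illustrative sketch (not part of the original source): a driver could use
// vm_clone_area() to make a kernel-created buffer visible to another team;
// "team" and "sourceArea" are hypothetical. The clone shares the source
// area's cache, so both areas end up marked B_SHARED_AREA.
//
//   void* address = NULL;
//   area_id clone = vm_clone_area(team, "cloned buffer", &address,
//       B_ANY_ADDRESS, B_READ_AREA | B_WRITE_AREA | B_KERNEL_READ_AREA
//           | B_KERNEL_WRITE_AREA, REGION_NO_PRIVATE_MAP, sourceArea, true);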
2206 /*! Deletes the specified area of the given address space.
2208 The address space must be write-locked.
2209 The caller must ensure that the area does not have any wired ranges.
2211 \param addressSpace The address space containing the area.
2212 \param area The area to be deleted.
2213 \param deletingAddressSpace \c true, if the address space is in the process
2214 of being deleted.
2216 static void
2217 delete_area(VMAddressSpace* addressSpace, VMArea* area,
2218 bool deletingAddressSpace)
2220 ASSERT(!area->IsWired());
2222 VMAreaHash::Remove(area);
2224 // At this point the area is removed from the global hash table, but
2225 // still exists in the area list.
2227 // Unmap the virtual address space the area occupied.
2229 // We need to lock the complete cache chain.
2230 VMCache* topCache = vm_area_get_locked_cache(area);
2231 VMCacheChainLocker cacheChainLocker(topCache);
2232 cacheChainLocker.LockAllSourceCaches();
2234 // If the area's top cache is a temporary cache and the area is the only
2235 // one referencing it (besides us currently holding a second reference),
2236 // the unmapping code doesn't need to care about preserving the accessed
2237 // and dirty flags of the top cache page mappings.
2238 bool ignoreTopCachePageFlags
2239 = topCache->temporary && topCache->RefCount() == 2;
2241 area->address_space->TranslationMap()->UnmapArea(area,
2242 deletingAddressSpace, ignoreTopCachePageFlags);
2245 if (!area->cache->temporary)
2246 area->cache->WriteModified();
2248 uint32 allocationFlags = addressSpace == VMAddressSpace::Kernel()
2249 ? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
2251 arch_vm_unset_memory_type(area);
2252 addressSpace->RemoveArea(area, allocationFlags);
2253 addressSpace->Put();
2255 area->cache->RemoveArea(area);
2256 area->cache->ReleaseRef();
2258 addressSpace->DeleteArea(area, allocationFlags);
2262 status_t
2263 vm_delete_area(team_id team, area_id id, bool kernel)
2265 TRACE(("vm_delete_area(team = 0x%" B_PRIx32 ", area = 0x%" B_PRIx32 ")\n",
2266 team, id));
2268 // lock the address space and make sure the area isn't wired
2269 AddressSpaceWriteLocker locker;
2270 VMArea* area;
2271 AreaCacheLocker cacheLocker;
2273 do {
2274 status_t status = locker.SetFromArea(team, id, area);
2275 if (status != B_OK)
2276 return status;
2278 cacheLocker.SetTo(area);
2279 } while (wait_if_area_is_wired(area, &locker, &cacheLocker));
2281 cacheLocker.Unlock();
2283 if (!kernel && (area->protection & B_KERNEL_AREA) != 0)
2284 return B_NOT_ALLOWED;
2286 delete_area(locker.AddressSpace(), area, false);
2287 return B_OK;
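// Illustrative sketch (not part of the original source): deleting a kernel
// area by ID ("area" is assumed to hold an ID returned by one of the
// creation functions above):
//
//   status_t status = vm_delete_area(VMAddressSpace::KernelID(), area, true);
//   if (status != B_OK)
//       dprintf("vm_delete_area() failed: %s\n", strerror(status));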
2291 /*! Creates a new cache on top of given cache, moves all areas from
2292 the old cache to the new one, and changes the protection of all affected
2293 areas' pages to read-only. If requested, wired pages are moved up to the
2294 new cache and copies are added to the old cache in their place.
2295 Preconditions:
2296 - The given cache must be locked.
2297 - All of the cache's areas' address spaces must be read locked.
2298 - Either the cache must not have any wired ranges or a page reservation for
2299 all wired pages must be provided, so they can be copied.
2301 \param lowerCache The cache on top of which a new cache shall be created.
2302 \param wiredPagesReservation If \c NULL there must not be any wired pages
2303 in \a lowerCache. Otherwise as many pages must be reserved as the cache
2304 has wired pages. The wired pages are copied in this case.
2306 static status_t
2307 vm_copy_on_write_area(VMCache* lowerCache,
2308 vm_page_reservation* wiredPagesReservation)
2310 VMCache* upperCache;
2312 TRACE(("vm_copy_on_write_area(cache = %p)\n", lowerCache));
2314 // We need to separate the cache from its areas. The cache goes one level
2315 // deeper and we create a new cache in between.
2317 // create an anonymous cache
2318 status_t status = VMCacheFactory::CreateAnonymousCache(upperCache, false, 0,
2319 lowerCache->GuardSize() / B_PAGE_SIZE,
2320 dynamic_cast<VMAnonymousNoSwapCache*>(lowerCache) == NULL,
2321 VM_PRIORITY_USER);
2322 if (status != B_OK)
2323 return status;
2325 upperCache->Lock();
2327 upperCache->temporary = 1;
2328 upperCache->virtual_base = lowerCache->virtual_base;
2329 upperCache->virtual_end = lowerCache->virtual_end;
2331 // transfer the lower cache areas to the upper cache
2332 rw_lock_write_lock(&sAreaCacheLock);
2333 upperCache->TransferAreas(lowerCache);
2334 rw_lock_write_unlock(&sAreaCacheLock);
2336 lowerCache->AddConsumer(upperCache);
2338 // We now need to remap all pages from all of the cache's areas read-only,
2339 // so that a copy will be created on next write access. If there are wired
2340 // pages, we keep their protection, move them to the upper cache and create
2341 // copies for the lower cache.
2342 if (wiredPagesReservation != NULL) {
2343 // We need to handle wired pages -- iterate through the cache's pages.
2344 for (VMCachePagesTree::Iterator it = lowerCache->pages.GetIterator();
2345 vm_page* page = it.Next();) {
2346 if (page->WiredCount() > 0) {
2347 // allocate a new page and copy the wired one
2348 vm_page* copiedPage = vm_page_allocate_page(
2349 wiredPagesReservation, PAGE_STATE_ACTIVE);
2351 vm_memcpy_physical_page(
2352 copiedPage->physical_page_number * B_PAGE_SIZE,
2353 page->physical_page_number * B_PAGE_SIZE);
2355 // move the wired page to the upper cache (note: removing is OK
2356 // with the SplayTree iterator) and insert the copy
2357 upperCache->MovePage(page);
2358 lowerCache->InsertPage(copiedPage,
2359 page->cache_offset * B_PAGE_SIZE);
2361 DEBUG_PAGE_ACCESS_END(copiedPage);
2362 } else {
2363 // Change the protection of this page in all areas.
2364 for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2365 tempArea = tempArea->cache_next) {
2366 // The area must be readable in the same way it was
2367 // previously writable.
2368 uint32 protection = B_KERNEL_READ_AREA;
2369 if ((tempArea->protection & B_READ_AREA) != 0)
2370 protection |= B_READ_AREA;
2372 VMTranslationMap* map
2373 = tempArea->address_space->TranslationMap();
2374 map->Lock();
2375 map->ProtectPage(tempArea,
2376 virtual_page_address(tempArea, page), protection);
2377 map->Unlock();
2381 } else {
2382 ASSERT(lowerCache->WiredPagesCount() == 0);
2384 // just change the protection of all areas
2385 for (VMArea* tempArea = upperCache->areas; tempArea != NULL;
2386 tempArea = tempArea->cache_next) {
2387 // The area must be readable in the same way it was previously
2388 // writable.
2389 uint32 protection = B_KERNEL_READ_AREA;
2390 if ((tempArea->protection & B_READ_AREA) != 0)
2391 protection |= B_READ_AREA;
2393 VMTranslationMap* map = tempArea->address_space->TranslationMap();
2394 map->Lock();
2395 map->ProtectArea(tempArea, protection);
2396 map->Unlock();
2400 vm_area_put_locked_cache(upperCache);
2402 return B_OK;
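// Illustrative sketch (not part of the original source) of what
// vm_copy_on_write_area() does to the cache chain. Before the call the
// areas reference lowerCache directly; afterwards the new anonymous
// upperCache sits in between and takes the write faults:
//
//   before:  area1, area2 --> lowerCache --> (source chain)
//   after:   area1, area2 --> upperCache --> lowerCache --> (source chain)
//
// Pages are only copied lazily on the next write fault; wired pages are the
// exception and are moved up immediately, with copies left in lowerCache.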
2406 area_id
2407 vm_copy_area(team_id team, const char* name, void** _address,
2408 uint32 addressSpec, uint32 protection, area_id sourceID)
2410 bool writableCopy = (protection & (B_KERNEL_WRITE_AREA | B_WRITE_AREA)) != 0;
2412 if ((protection & B_KERNEL_PROTECTION) == 0) {
2413 // set the same protection for the kernel as for userland
2414 protection |= B_KERNEL_READ_AREA;
2415 if (writableCopy)
2416 protection |= B_KERNEL_WRITE_AREA;
2419 // Do the locking: target address space, all address spaces associated with
2420 // the source cache, and the cache itself.
2421 MultiAddressSpaceLocker locker;
2422 VMAddressSpace* targetAddressSpace;
2423 VMCache* cache;
2424 VMArea* source;
2425 AreaCacheLocker cacheLocker;
2426 status_t status;
2427 bool sharedArea;
2429 page_num_t wiredPages = 0;
2430 vm_page_reservation wiredPagesReservation;
2432 bool restart;
2433 do {
2434 restart = false;
2436 locker.Unset();
2437 status = locker.AddTeam(team, true, &targetAddressSpace);
2438 if (status == B_OK) {
2439 status = locker.AddAreaCacheAndLock(sourceID, false, false, source,
2440 &cache);
2442 if (status != B_OK)
2443 return status;
2445 cacheLocker.SetTo(cache, true); // already locked
2447 sharedArea = (source->protection & B_SHARED_AREA) != 0;
2449 page_num_t oldWiredPages = wiredPages;
2450 wiredPages = 0;
2452 // If the source area isn't shared, count the number of wired pages in
2453 // the cache and reserve as many pages.
2454 if (!sharedArea) {
2455 wiredPages = cache->WiredPagesCount();
2457 if (wiredPages > oldWiredPages) {
2458 cacheLocker.Unlock();
2459 locker.Unlock();
2461 if (oldWiredPages > 0)
2462 vm_page_unreserve_pages(&wiredPagesReservation);
2464 vm_page_reserve_pages(&wiredPagesReservation, wiredPages,
2465 VM_PRIORITY_USER);
2467 restart = true;
2469 } else if (oldWiredPages > 0)
2470 vm_page_unreserve_pages(&wiredPagesReservation);
2471 } while (restart);
2473 // unreserve pages later
2474 struct PagesUnreserver {
2475 PagesUnreserver(vm_page_reservation* reservation)
2477 fReservation(reservation)
2481 ~PagesUnreserver()
2483 if (fReservation != NULL)
2484 vm_page_unreserve_pages(fReservation);
2487 private:
2488 vm_page_reservation* fReservation;
2489 } pagesUnreserver(wiredPages > 0 ? &wiredPagesReservation : NULL);
2491 if (addressSpec == B_CLONE_ADDRESS) {
2492 addressSpec = B_EXACT_ADDRESS;
2493 *_address = (void*)source->Base();
2496 // First, create a cache on top of the source area, or, if this is a
2497 // shared area, use the existing one.
2499 VMArea* target;
2500 virtual_address_restrictions addressRestrictions = {};
2501 addressRestrictions.address = *_address;
2502 addressRestrictions.address_specification = addressSpec;
2503 status = map_backing_store(targetAddressSpace, cache, source->cache_offset,
2504 name, source->Size(), source->wiring, protection,
2505 sharedArea ? REGION_NO_PRIVATE_MAP : REGION_PRIVATE_MAP,
2506 writableCopy ? 0 : CREATE_AREA_DONT_COMMIT_MEMORY,
2507 &addressRestrictions, true, &target, _address);
2508 if (status < B_OK)
2509 return status;
2511 if (sharedArea) {
2512 // The new area uses the old area's cache, but map_backing_store()
2513 // hasn't acquired a ref. So we have to do that now.
2514 cache->AcquireRefLocked();
2517 // If the source area is writable, we need to move it one layer up as well
2519 if (!sharedArea) {
2520 if ((source->protection & (B_KERNEL_WRITE_AREA | B_WRITE_AREA)) != 0) {
2521 // TODO: do something more useful if this fails!
2522 if (vm_copy_on_write_area(cache,
2523 wiredPages > 0 ? &wiredPagesReservation : NULL) < B_OK) {
2524 panic("vm_copy_on_write_area() failed!\n");
2529 // we return the ID of the newly created area
2530 return target->id;
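// Illustrative sketch (not part of the original source): taking a private,
// copy-on-write snapshot of an existing area into the current team;
// "sourceArea" is a hypothetical area ID.
//
//   void* address = NULL;
//   area_id copy = vm_copy_area(VMAddressSpace::CurrentID(), "area snapshot",
//       &address, B_ANY_ADDRESS, B_READ_AREA | B_WRITE_AREA, sourceArea);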
2534 status_t
2535 vm_set_area_protection(team_id team, area_id areaID, uint32 newProtection,
2536 bool kernel)
2538 fix_protection(&newProtection);
2540 TRACE(("vm_set_area_protection(team = %#" B_PRIx32 ", area = %#" B_PRIx32
2541 ", protection = %#" B_PRIx32 ")\n", team, areaID, newProtection));
2543 if (!arch_vm_supports_protection(newProtection))
2544 return B_NOT_SUPPORTED;
2546 bool becomesWritable
2547 = (newProtection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2549 // lock address spaces and cache
2550 MultiAddressSpaceLocker locker;
2551 VMCache* cache;
2552 VMArea* area;
2553 status_t status;
2554 AreaCacheLocker cacheLocker;
2555 bool isWritable;
2557 bool restart;
2558 do {
2559 restart = false;
2561 locker.Unset();
2562 status = locker.AddAreaCacheAndLock(areaID, true, false, area, &cache);
2563 if (status != B_OK)
2564 return status;
2566 cacheLocker.SetTo(cache, true); // already locked
2568 if (!kernel && (area->protection & B_KERNEL_AREA) != 0)
2569 return B_NOT_ALLOWED;
2571 if (area->protection == newProtection)
2572 return B_OK;
2574 if (team != VMAddressSpace::KernelID()
2575 && area->address_space->ID() != team) {
2576 // unless you're the kernel, you are only allowed to set
2577 // the protection of your own areas
2578 return B_NOT_ALLOWED;
2581 isWritable
2582 = (area->protection & (B_WRITE_AREA | B_KERNEL_WRITE_AREA)) != 0;
2584 // Make sure the area (or, if we're going to call
2585 // vm_copy_on_write_area(), all areas of the cache) doesn't have any
2586 // wired ranges.
2587 if (!isWritable && becomesWritable && !cache->consumers.IsEmpty()) {
2588 for (VMArea* otherArea = cache->areas; otherArea != NULL;
2589 otherArea = otherArea->cache_next) {
2590 if (wait_if_area_is_wired(otherArea, &locker, &cacheLocker)) {
2591 restart = true;
2592 break;
2595 } else {
2596 if (wait_if_area_is_wired(area, &locker, &cacheLocker))
2597 restart = true;
2599 } while (restart);
2601 bool changePageProtection = true;
2602 bool changeTopCachePagesOnly = false;
2604 if (isWritable && !becomesWritable) {
2605 // writable -> !writable
2607 if (cache->source != NULL && cache->temporary) {
2608 if (cache->CountWritableAreas(area) == 0) {
2609 // Since this cache is now backed by the pages of its source cache,
2610 // we can change the cache's commitment to take into account only
2611 // those pages that really are in this cache.
2613 status = cache->Commit(cache->page_count * B_PAGE_SIZE,
2614 team == VMAddressSpace::KernelID()
2615 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2617 // TODO: we may be able to join with our source cache, if
2618 // count == 0
2622 // If only the writability changes, we can just remap the pages of the
2623 // top cache, since the pages of lower caches are mapped read-only
2624 // anyway. That's only advantageous if the number of pages in the cache
2625 // is significantly smaller than the number of pages in the area,
2626 // though.
2627 if (newProtection
2628 == (area->protection & ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA))
2629 && cache->page_count * 2 < area->Size() / B_PAGE_SIZE) {
2630 changeTopCachePagesOnly = true;
2632 } else if (!isWritable && becomesWritable) {
2633 // !writable -> writable
2635 if (!cache->consumers.IsEmpty()) {
2636 // There are consumers -- we have to insert a new cache. Fortunately
2637 // vm_copy_on_write_area() does everything that's needed.
2638 changePageProtection = false;
2639 status = vm_copy_on_write_area(cache, NULL);
2640 } else {
2641 // No consumers, so we don't need to insert a new one.
2642 if (cache->source != NULL && cache->temporary) {
2643 // the cache's commitment must contain all possible pages
2644 status = cache->Commit(cache->virtual_end - cache->virtual_base,
2645 team == VMAddressSpace::KernelID()
2646 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
2649 if (status == B_OK && cache->source != NULL) {
2650 // There's a source cache, hence we can't just change all pages'
2651 // protection or we might allow writing into pages belonging to
2652 // a lower cache.
2653 changeTopCachePagesOnly = true;
2656 } else {
2657 // we don't have anything special to do in all other cases
2660 if (status == B_OK) {
2661 // remap existing pages in this cache
2662 if (changePageProtection) {
2663 VMTranslationMap* map = area->address_space->TranslationMap();
2664 map->Lock();
2666 if (changeTopCachePagesOnly) {
2667 page_num_t firstPageOffset = area->cache_offset / B_PAGE_SIZE;
2668 page_num_t lastPageOffset
2669 = firstPageOffset + area->Size() / B_PAGE_SIZE;
2670 for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
2671 vm_page* page = it.Next();) {
2672 if (page->cache_offset >= firstPageOffset
2673 && page->cache_offset < lastPageOffset) {
2674 addr_t address = virtual_page_address(area, page);
2675 map->ProtectPage(area, address, newProtection);
2678 } else
2679 map->ProtectArea(area, newProtection);
2681 map->Unlock();
2684 area->protection = newProtection;
2687 return status;
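// Illustrative sketch (not part of the original source): revoking write
// access from an area of the current team; "area" is a hypothetical area
// ID. The function runs newProtection through fix_protection() itself,
// which is expected to add the matching kernel protection bits.
//
//   status_t status = vm_set_area_protection(VMAddressSpace::CurrentID(),
//       area, B_READ_AREA, false);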
2691 status_t
2692 vm_get_page_mapping(team_id team, addr_t vaddr, phys_addr_t* paddr)
2694 VMAddressSpace* addressSpace = VMAddressSpace::Get(team);
2695 if (addressSpace == NULL)
2696 return B_BAD_TEAM_ID;
2698 VMTranslationMap* map = addressSpace->TranslationMap();
2700 map->Lock();
2701 uint32 dummyFlags;
2702 status_t status = map->Query(vaddr, paddr, &dummyFlags);
2703 map->Unlock();
2705 addressSpace->Put();
2706 return status;
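// Illustrative sketch (not part of the original source): looking up the
// physical page backing a kernel virtual address; "virtualAddress" is a
// hypothetical, currently mapped address.
//
//   phys_addr_t physicalAddress;
//   if (vm_get_page_mapping(VMAddressSpace::KernelID(), virtualAddress,
//           &physicalAddress) == B_OK) {
//       dprintf("mapped to %" B_PRIuPHYSADDR "\n", physicalAddress);
//   }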
2710 /*! The page's cache must be locked.
2712 bool
2713 vm_test_map_modification(vm_page* page)
2715 if (page->modified)
2716 return true;
2718 vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2719 vm_page_mapping* mapping;
2720 while ((mapping = iterator.Next()) != NULL) {
2721 VMArea* area = mapping->area;
2722 VMTranslationMap* map = area->address_space->TranslationMap();
2724 phys_addr_t physicalAddress;
2725 uint32 flags;
2726 map->Lock();
2727 map->Query(virtual_page_address(area, page), &physicalAddress, &flags);
2728 map->Unlock();
2730 if ((flags & PAGE_MODIFIED) != 0)
2731 return true;
2734 return false;
2738 /*! The page's cache must be locked.
2740 void
2741 vm_clear_map_flags(vm_page* page, uint32 flags)
2743 if ((flags & PAGE_ACCESSED) != 0)
2744 page->accessed = false;
2745 if ((flags & PAGE_MODIFIED) != 0)
2746 page->modified = false;
2748 vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2749 vm_page_mapping* mapping;
2750 while ((mapping = iterator.Next()) != NULL) {
2751 VMArea* area = mapping->area;
2752 VMTranslationMap* map = area->address_space->TranslationMap();
2754 map->Lock();
2755 map->ClearFlags(virtual_page_address(area, page), flags);
2756 map->Unlock();
2761 /*! Removes all mappings from a page.
2762 After you've called this function, the page is unmapped from memory and
2763 the page's \c accessed and \c modified flags have been updated according
2764 to the state of the mappings.
2765 The page's cache must be locked.
2767 void
2768 vm_remove_all_page_mappings(vm_page* page)
2770 while (vm_page_mapping* mapping = page->mappings.Head()) {
2771 VMArea* area = mapping->area;
2772 VMTranslationMap* map = area->address_space->TranslationMap();
2773 addr_t address = virtual_page_address(area, page);
2774 map->UnmapPage(area, address, false);
2779 int32
2780 vm_clear_page_mapping_accessed_flags(struct vm_page *page)
2782 int32 count = 0;
2784 vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
2785 vm_page_mapping* mapping;
2786 while ((mapping = iterator.Next()) != NULL) {
2787 VMArea* area = mapping->area;
2788 VMTranslationMap* map = area->address_space->TranslationMap();
2790 bool modified;
2791 if (map->ClearAccessedAndModified(area,
2792 virtual_page_address(area, page), false, modified)) {
2793 count++;
2796 page->modified |= modified;
2800 if (page->accessed) {
2801 count++;
2802 page->accessed = false;
2805 return count;
2809 /*! Removes all mappings of a page and/or clears the accessed bits of the
2810 mappings.
2811 The function iterates through the page mappings and removes them until
2812 encountering one that has been accessed. From then on it will continue to
2813 iterate, but only clear the accessed flag of the mapping. The page's
2814 \c modified bit will be updated accordingly, the \c accessed bit will be
2815 cleared.
2816 \return The number of mapping accessed bits encountered, including the
2817 \c accessed bit of the page itself. If \c 0 is returned, all mappings
2818 of the page have been removed.
2820 int32
2821 vm_remove_all_page_mappings_if_unaccessed(struct vm_page *page)
2823 ASSERT(page->WiredCount() == 0);
2825 if (page->accessed)
2826 return vm_clear_page_mapping_accessed_flags(page);
2828 while (vm_page_mapping* mapping = page->mappings.Head()) {
2829 VMArea* area = mapping->area;
2830 VMTranslationMap* map = area->address_space->TranslationMap();
2831 addr_t address = virtual_page_address(area, page);
2832 bool modified = false;
2833 if (map->ClearAccessedAndModified(area, address, true, modified)) {
2834 page->accessed = true;
2835 page->modified |= modified;
2836 return vm_clear_page_mapping_accessed_flags(page);
2838 page->modified |= modified;
2841 return 0;
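// Illustrative sketch (not part of the original source): a page scanning
// loop could use the function above to unmap pages that have not been
// accessed since the last pass. "page" is a hypothetical page whose cache
// is locked and that has no wired mappings:
//
//   if (vm_remove_all_page_mappings_if_unaccessed(page) == 0) {
//       // No accessed bit was found anywhere: all mappings are gone and
//       // the caller may treat the page as inactive.
//   }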
2845 static int
2846 display_mem(int argc, char** argv)
2848 bool physical = false;
2849 addr_t copyAddress;
2850 int32 displayWidth;
2851 int32 itemSize;
2852 int32 num = -1;
2853 addr_t address;
2854 int i = 1, j;
2856 if (argc > 1 && argv[1][0] == '-') {
2857 if (!strcmp(argv[1], "-p") || !strcmp(argv[1], "--physical")) {
2858 physical = true;
2859 i++;
2860 } else
2861 i = 99;
2864 if (argc < i + 1 || argc > i + 2) {
2865 kprintf("usage: dl/dw/ds/db/string [-p|--physical] <address> [num]\n"
2866 "\tdl - 8 bytes\n"
2867 "\tdw - 4 bytes\n"
2868 "\tds - 2 bytes\n"
2869 "\tdb - 1 byte\n"
2870 "\tstring - a whole string\n"
2871 " -p or --physical only allows memory from a single page to be "
2872 "displayed.\n");
2873 return 0;
2876 address = parse_expression(argv[i]);
2878 if (argc > i + 1)
2879 num = parse_expression(argv[i + 1]);
2881 // build the format string
2882 if (strcmp(argv[0], "db") == 0) {
2883 itemSize = 1;
2884 displayWidth = 16;
2885 } else if (strcmp(argv[0], "ds") == 0) {
2886 itemSize = 2;
2887 displayWidth = 8;
2888 } else if (strcmp(argv[0], "dw") == 0) {
2889 itemSize = 4;
2890 displayWidth = 4;
2891 } else if (strcmp(argv[0], "dl") == 0) {
2892 itemSize = 8;
2893 displayWidth = 2;
2894 } else if (strcmp(argv[0], "string") == 0) {
2895 itemSize = 1;
2896 displayWidth = -1;
2897 } else {
2898 kprintf("display_mem called in an invalid way!\n");
2899 return 0;
2902 if (num <= 0)
2903 num = displayWidth;
2905 void* physicalPageHandle = NULL;
2907 if (physical) {
2908 int32 offset = address & (B_PAGE_SIZE - 1);
2909 if (num * itemSize + offset > B_PAGE_SIZE) {
2910 num = (B_PAGE_SIZE - offset) / itemSize;
2911 kprintf("NOTE: number of bytes has been cut to page size\n");
2914 address = ROUNDDOWN(address, B_PAGE_SIZE);
2916 if (vm_get_physical_page_debug(address, &copyAddress,
2917 &physicalPageHandle) != B_OK) {
2918 kprintf("getting the hardware page failed.");
2919 return 0;
2922 address += offset;
2923 copyAddress += offset;
2924 } else
2925 copyAddress = address;
2927 if (!strcmp(argv[0], "string")) {
2928 kprintf("%p \"", (char*)copyAddress);
2930 // string mode
2931 for (i = 0; true; i++) {
2932 char c;
2933 if (debug_memcpy(B_CURRENT_TEAM, &c, (char*)copyAddress + i, 1)
2934 != B_OK
2935 || c == '\0') {
2936 break;
2939 if (c == '\n')
2940 kprintf("\\n");
2941 else if (c == '\t')
2942 kprintf("\\t");
2943 else {
2944 if (!isprint(c))
2945 c = '.';
2947 kprintf("%c", c);
2951 kprintf("\"\n");
2952 } else {
2953 // number mode
2954 for (i = 0; i < num; i++) {
2955 uint32 value;
2957 if ((i % displayWidth) == 0) {
2958 int32 displayed = min_c(displayWidth, (num-i)) * itemSize;
2959 if (i != 0)
2960 kprintf("\n");
2962 kprintf("[0x%lx] ", address + i * itemSize);
2964 for (j = 0; j < displayed; j++) {
2965 char c;
2966 if (debug_memcpy(B_CURRENT_TEAM, &c,
2967 (char*)copyAddress + i * itemSize + j, 1) != B_OK) {
2968 displayed = j;
2969 break;
2971 if (!isprint(c))
2972 c = '.';
2974 kprintf("%c", c);
2976 if (num > displayWidth) {
2977 // make sure the spacing in the last line is correct
2978 for (j = displayed; j < displayWidth * itemSize; j++)
2979 kprintf(" ");
2981 kprintf(" ");
2984 if (debug_memcpy(B_CURRENT_TEAM, &value,
2985 (uint8*)copyAddress + i * itemSize, itemSize) != B_OK) {
2986 kprintf("read fault");
2987 break;
2990 switch (itemSize) {
2991 case 1:
2992 kprintf(" %02" B_PRIx8, *(uint8*)&value);
2993 break;
2994 case 2:
2995 kprintf(" %04" B_PRIx16, *(uint16*)&value);
2996 break;
2997 case 4:
2998 kprintf(" %08" B_PRIx32, *(uint32*)&value);
2999 break;
3000 case 8:
3001 kprintf(" %016" B_PRIx64, *(uint64*)&value);
3002 break;
3006 kprintf("\n");
3009 if (physical) {
3010 copyAddress = ROUNDDOWN(copyAddress, B_PAGE_SIZE);
3011 vm_put_physical_page_debug(copyAddress, physicalPageHandle);
3013 return 0;
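// Illustrative examples (not part of the original source) of the memory
// dumping commands implemented above, as they could be typed in the kernel
// debugger (the addresses are placeholders):
//
//   dw 0x80123000 8          print eight 4-byte words
//   db -p 0x0009f000 16      print 16 bytes of a physical page
//   string 0x80123000        print the NUL-terminated string at the address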
3017 static void
3018 dump_cache_tree_recursively(VMCache* cache, int level,
3019 VMCache* highlightCache)
3021 // print this cache
3022 for (int i = 0; i < level; i++)
3023 kprintf(" ");
3024 if (cache == highlightCache)
3025 kprintf("%p <--\n", cache);
3026 else
3027 kprintf("%p\n", cache);
3029 // recursively print its consumers
3030 for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3031 VMCache* consumer = it.Next();) {
3032 dump_cache_tree_recursively(consumer, level + 1, highlightCache);
3037 static int
3038 dump_cache_tree(int argc, char** argv)
3040 if (argc != 2 || !strcmp(argv[1], "--help")) {
3041 kprintf("usage: %s <address>\n", argv[0]);
3042 return 0;
3045 addr_t address = parse_expression(argv[1]);
3046 if (address == 0)
3047 return 0;
3049 VMCache* cache = (VMCache*)address;
3050 VMCache* root = cache;
3052 // find the root cache (the transitive source)
3053 while (root->source != NULL)
3054 root = root->source;
3056 dump_cache_tree_recursively(root, 0, cache);
3058 return 0;
3062 const char*
3063 vm_cache_type_to_string(int32 type)
3065 switch (type) {
3066 case CACHE_TYPE_RAM:
3067 return "RAM";
3068 case CACHE_TYPE_DEVICE:
3069 return "device";
3070 case CACHE_TYPE_VNODE:
3071 return "vnode";
3072 case CACHE_TYPE_NULL:
3073 return "null";
3075 default:
3076 return "unknown";
3081 #if DEBUG_CACHE_LIST
3083 static void
3084 update_cache_info_recursively(VMCache* cache, cache_info& info)
3086 info.page_count += cache->page_count;
3087 if (cache->type == CACHE_TYPE_RAM)
3088 info.committed += cache->committed_size;
3090 // recurse
3091 for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3092 VMCache* consumer = it.Next();) {
3093 update_cache_info_recursively(consumer, info);
3098 static int
3099 cache_info_compare_page_count(const void* _a, const void* _b)
3101 const cache_info* a = (const cache_info*)_a;
3102 const cache_info* b = (const cache_info*)_b;
3103 if (a->page_count == b->page_count)
3104 return 0;
3105 return a->page_count < b->page_count ? 1 : -1;
3109 static int
3110 cache_info_compare_committed(const void* _a, const void* _b)
3112 const cache_info* a = (const cache_info*)_a;
3113 const cache_info* b = (const cache_info*)_b;
3114 if (a->committed == b->committed)
3115 return 0;
3116 return a->committed < b->committed ? 1 : -1;
3120 static void
3121 dump_caches_recursively(VMCache* cache, cache_info& info, int level)
3123 for (int i = 0; i < level; i++)
3124 kprintf(" ");
3126 kprintf("%p: type: %s, base: %" B_PRIdOFF ", size: %" B_PRIdOFF ", "
3127 "pages: %" B_PRIu32, cache, vm_cache_type_to_string(cache->type),
3128 cache->virtual_base, cache->virtual_end, cache->page_count);
3130 if (level == 0)
3131 kprintf("/%lu", info.page_count);
3133 if (cache->type == CACHE_TYPE_RAM || (level == 0 && info.committed > 0)) {
3134 kprintf(", committed: %" B_PRIdOFF, cache->committed_size);
3136 if (level == 0)
3137 kprintf("/%lu", info.committed);
3140 // areas
3141 if (cache->areas != NULL) {
3142 VMArea* area = cache->areas;
3143 kprintf(", areas: %" B_PRId32 " (%s, team: %" B_PRId32 ")", area->id,
3144 area->name, area->address_space->ID());
3146 while (area->cache_next != NULL) {
3147 area = area->cache_next;
3148 kprintf(", %" B_PRId32, area->id);
3152 kputs("\n");
3154 // recurse
3155 for (VMCache::ConsumerList::Iterator it = cache->consumers.GetIterator();
3156 VMCache* consumer = it.Next();) {
3157 dump_caches_recursively(consumer, info, level + 1);
3162 static int
3163 dump_caches(int argc, char** argv)
3165 if (sCacheInfoTable == NULL) {
3166 kprintf("No cache info table!\n");
3167 return 0;
3170 bool sortByPageCount = true;
3172 for (int32 i = 1; i < argc; i++) {
3173 if (strcmp(argv[i], "-c") == 0) {
3174 sortByPageCount = false;
3175 } else {
3176 print_debugger_command_usage(argv[0]);
3177 return 0;
3181 uint32 totalCount = 0;
3182 uint32 rootCount = 0;
3183 off_t totalCommitted = 0;
3184 page_num_t totalPages = 0;
3186 VMCache* cache = gDebugCacheList;
3187 while (cache) {
3188 totalCount++;
3189 if (cache->source == NULL) {
3190 cache_info stackInfo;
3191 cache_info& info = rootCount < (uint32)kCacheInfoTableCount
3192 ? sCacheInfoTable[rootCount] : stackInfo;
3193 rootCount++;
3194 info.cache = cache;
3195 info.page_count = 0;
3196 info.committed = 0;
3197 update_cache_info_recursively(cache, info);
3198 totalCommitted += info.committed;
3199 totalPages += info.page_count;
3202 cache = cache->debug_next;
3205 if (rootCount <= (uint32)kCacheInfoTableCount) {
3206 qsort(sCacheInfoTable, rootCount, sizeof(cache_info),
3207 sortByPageCount
3208 ? &cache_info_compare_page_count
3209 : &cache_info_compare_committed);
3212 kprintf("total committed memory: %" B_PRIdOFF ", total used pages: %"
3213 B_PRIuPHYSADDR "\n", totalCommitted, totalPages);
3214 kprintf("%" B_PRIu32 " caches (%" B_PRIu32 " root caches), sorted by %s "
3215 "per cache tree...\n\n", totalCount, rootCount, sortByPageCount ?
3216 "page count" : "committed size");
3218 if (rootCount <= (uint32)kCacheInfoTableCount) {
3219 for (uint32 i = 0; i < rootCount; i++) {
3220 cache_info& info = sCacheInfoTable[i];
3221 dump_caches_recursively(info.cache, info, 0);
3223 } else
3224 kprintf("Cache info table too small! Can't sort and print caches!\n");
3226 return 0;
3229 #endif // DEBUG_CACHE_LIST
3232 static int
3233 dump_cache(int argc, char** argv)
3235 VMCache* cache;
3236 bool showPages = false;
3237 int i = 1;
3239 if (argc < 2 || !strcmp(argv[1], "--help")) {
3240 kprintf("usage: %s [-ps] <address>\n"
3241 " if -p is specified, all pages are shown, if -s is used\n"
3242 " only the cache info is shown respectively.\n", argv[0]);
3243 return 0;
3245 while (argv[i][0] == '-') {
3246 char* arg = argv[i] + 1;
3247 while (arg[0]) {
3248 if (arg[0] == 'p')
3249 showPages = true;
3250 arg++;
3252 i++;
3254 if (argv[i] == NULL) {
3255 kprintf("%s: invalid argument, pass address\n", argv[0]);
3256 return 0;
3259 addr_t address = parse_expression(argv[i]);
3260 if (address == 0)
3261 return 0;
3263 cache = (VMCache*)address;
3265 cache->Dump(showPages);
3267 set_debug_variable("_sourceCache", (addr_t)cache->source);
3269 return 0;
3273 static void
3274 dump_area_struct(VMArea* area, bool mappings)
3276 kprintf("AREA: %p\n", area);
3277 kprintf("name:\t\t'%s'\n", area->name);
3278 kprintf("owner:\t\t0x%" B_PRIx32 "\n", area->address_space->ID());
3279 kprintf("id:\t\t0x%" B_PRIx32 "\n", area->id);
3280 kprintf("base:\t\t0x%lx\n", area->Base());
3281 kprintf("size:\t\t0x%lx\n", area->Size());
3282 kprintf("protection:\t0x%" B_PRIx32 "\n", area->protection);
3283 kprintf("wiring:\t\t0x%x\n", area->wiring);
3284 kprintf("memory_type:\t%#" B_PRIx32 "\n", area->MemoryType());
3285 kprintf("cache:\t\t%p\n", area->cache);
3286 kprintf("cache_type:\t%s\n", vm_cache_type_to_string(area->cache_type));
3287 kprintf("cache_offset:\t0x%" B_PRIx64 "\n", area->cache_offset);
3288 kprintf("cache_next:\t%p\n", area->cache_next);
3289 kprintf("cache_prev:\t%p\n", area->cache_prev);
3291 VMAreaMappings::Iterator iterator = area->mappings.GetIterator();
3292 if (mappings) {
3293 kprintf("page mappings:\n");
3294 while (iterator.HasNext()) {
3295 vm_page_mapping* mapping = iterator.Next();
3296 kprintf(" %p", mapping->page);
3298 kprintf("\n");
3299 } else {
3300 uint32 count = 0;
3301 while (iterator.Next() != NULL) {
3302 count++;
3304 kprintf("page mappings:\t%" B_PRIu32 "\n", count);
3309 static int
3310 dump_area(int argc, char** argv)
3312 bool mappings = false;
3313 bool found = false;
3314 int32 index = 1;
3315 VMArea* area;
3316 addr_t num;
3318 if (argc < 2 || !strcmp(argv[1], "--help")) {
3319 kprintf("usage: area [-m] [id|contains|address|name] <id|address|name>\n"
3320 "All areas matching either id/address/name are listed. You can\n"
3321 "force to check only a specific item by prefixing the specifier\n"
3322 "with the id/contains/address/name keywords.\n"
3323 "-m shows the area's mappings as well.\n");
3324 return 0;
3327 if (!strcmp(argv[1], "-m")) {
3328 mappings = true;
3329 index++;
3332 int32 mode = 0xf;
3333 if (!strcmp(argv[index], "id"))
3334 mode = 1;
3335 else if (!strcmp(argv[index], "contains"))
3336 mode = 2;
3337 else if (!strcmp(argv[index], "name"))
3338 mode = 4;
3339 else if (!strcmp(argv[index], "address"))
3340 mode = 0;
3341 if (mode != 0xf)
3342 index++;
3344 if (index >= argc) {
3345 kprintf("No area specifier given.\n");
3346 return 0;
3349 num = parse_expression(argv[index]);
3351 if (mode == 0) {
3352 dump_area_struct((struct VMArea*)num, mappings);
3353 } else {
3354 // walk through the area list, looking for the arguments as a name
3356 VMAreaHashTable::Iterator it = VMAreaHash::GetIterator();
3357 while ((area = it.Next()) != NULL) {
3358 if (((mode & 4) != 0 && area->name != NULL
3359 && !strcmp(argv[index], area->name))
3360 || (num != 0 && (((mode & 1) != 0 && (addr_t)area->id == num)
3361 || (((mode & 2) != 0 && area->Base() <= num
3362 && area->Base() + area->Size() > num))))) {
3363 dump_area_struct(area, mappings);
3364 found = true;
3368 if (!found)
3369 kprintf("could not find area %s (%ld)\n", argv[index], num);
3372 return 0;
3376 static int
3377 dump_area_list(int argc, char** argv)
3379 VMArea* area;
3380 const char* name = NULL;
3381 int32 id = 0;
3383 if (argc > 1) {
3384 id = parse_expression(argv[1]);
3385 if (id == 0)
3386 name = argv[1];
3389 kprintf("%-*s id %-*s %-*sprotect lock name\n",
3390 B_PRINTF_POINTER_WIDTH, "addr", B_PRINTF_POINTER_WIDTH, "base",
3391 B_PRINTF_POINTER_WIDTH, "size");
3393 VMAreaHashTable::Iterator it = VMAreaHash::GetIterator();
3394 while ((area = it.Next()) != NULL) {
3395 if ((id != 0 && area->address_space->ID() != id)
3396 || (name != NULL && strstr(area->name, name) == NULL))
3397 continue;
3399 kprintf("%p %5" B_PRIx32 " %p %p %4" B_PRIx32 " %4d %s\n", area,
3400 area->id, (void*)area->Base(), (void*)area->Size(),
3401 area->protection, area->wiring, area->name);
3403 return 0;
3407 static int
3408 dump_available_memory(int argc, char** argv)
3410 kprintf("Available memory: %" B_PRIdOFF "/%" B_PRIuPHYSADDR " bytes\n",
3411 sAvailableMemory, (phys_addr_t)vm_page_num_pages() * B_PAGE_SIZE);
3412 return 0;
3416 static int
3417 dump_mapping_info(int argc, char** argv)
3419 bool reverseLookup = false;
3420 bool pageLookup = false;
3422 int argi = 1;
3423 for (; argi < argc && argv[argi][0] == '-'; argi++) {
3424 const char* arg = argv[argi];
3425 if (strcmp(arg, "-r") == 0) {
3426 reverseLookup = true;
3427 } else if (strcmp(arg, "-p") == 0) {
3428 reverseLookup = true;
3429 pageLookup = true;
3430 } else {
3431 print_debugger_command_usage(argv[0]);
3432 return 0;
3436 // We need at least one argument, the address. Optionally a thread ID can be
3437 // specified.
3438 if (argi >= argc || argi + 2 < argc) {
3439 print_debugger_command_usage(argv[0]);
3440 return 0;
3443 uint64 addressValue;
3444 if (!evaluate_debug_expression(argv[argi++], &addressValue, false))
3445 return 0;
3447 Team* team = NULL;
3448 if (argi < argc) {
3449 uint64 threadID;
3450 if (!evaluate_debug_expression(argv[argi++], &threadID, false))
3451 return 0;
3453 Thread* thread = Thread::GetDebug(threadID);
3454 if (thread == NULL) {
3455 kprintf("Invalid thread/team ID \"%s\"\n", argv[argi - 1]);
3456 return 0;
3459 team = thread->team;
3462 if (reverseLookup) {
3463 phys_addr_t physicalAddress;
3464 if (pageLookup) {
3465 vm_page* page = (vm_page*)(addr_t)addressValue;
3466 physicalAddress = page->physical_page_number * B_PAGE_SIZE;
3467 } else {
3468 physicalAddress = (phys_addr_t)addressValue;
3469 physicalAddress -= physicalAddress % B_PAGE_SIZE;
3472 kprintf(" Team Virtual Address Area\n");
3473 kprintf("--------------------------------------\n");
3475 struct Callback : VMTranslationMap::ReverseMappingInfoCallback {
3476 Callback()
3478 fAddressSpace(NULL)
3482 void SetAddressSpace(VMAddressSpace* addressSpace)
3484 fAddressSpace = addressSpace;
3487 virtual bool HandleVirtualAddress(addr_t virtualAddress)
3489 kprintf("%8" B_PRId32 " %#18" B_PRIxADDR, fAddressSpace->ID(),
3490 virtualAddress);
3491 if (VMArea* area = fAddressSpace->LookupArea(virtualAddress))
3492 kprintf(" %8" B_PRId32 " %s\n", area->id, area->name);
3493 else
3494 kprintf("\n");
3495 return false;
3498 private:
3499 VMAddressSpace* fAddressSpace;
3500 } callback;
3502 if (team != NULL) {
3503 // team specified -- get its address space
3504 VMAddressSpace* addressSpace = team->address_space;
3505 if (addressSpace == NULL) {
3506 kprintf("Failed to get address space!\n");
3507 return 0;
3510 callback.SetAddressSpace(addressSpace);
3511 addressSpace->TranslationMap()->DebugGetReverseMappingInfo(
3512 physicalAddress, callback);
3513 } else {
3514 // no team specified -- iterate through all address spaces
3515 for (VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
3516 addressSpace != NULL;
3517 addressSpace = VMAddressSpace::DebugNext(addressSpace)) {
3518 callback.SetAddressSpace(addressSpace);
3519 addressSpace->TranslationMap()->DebugGetReverseMappingInfo(
3520 physicalAddress, callback);
3523 } else {
3524 // get the address space
3525 addr_t virtualAddress = (addr_t)addressValue;
3526 virtualAddress -= virtualAddress % B_PAGE_SIZE;
3527 VMAddressSpace* addressSpace;
3528 if (IS_KERNEL_ADDRESS(virtualAddress)) {
3529 addressSpace = VMAddressSpace::Kernel();
3530 } else if (team != NULL) {
3531 addressSpace = team->address_space;
3532 } else {
3533 Thread* thread = debug_get_debugged_thread();
3534 if (thread == NULL || thread->team == NULL) {
3535 kprintf("Failed to get team!\n");
3536 return 0;
3539 addressSpace = thread->team->address_space;
3542 if (addressSpace == NULL) {
3543 kprintf("Failed to get address space!\n");
3544 return 0;
3547 // let the translation map implementation do the job
3548 addressSpace->TranslationMap()->DebugPrintMappingInfo(virtualAddress);
3551 return 0;
3555 /*! Deletes all areas and reserved regions in the given address space.
3557 The caller must ensure that none of the areas has any wired ranges.
3559 \param addressSpace The address space.
3560 \param deletingAddressSpace \c true, if the address space is in the process
3561 of being deleted.
3563 void
3564 vm_delete_areas(struct VMAddressSpace* addressSpace, bool deletingAddressSpace)
3566 TRACE(("vm_delete_areas: called on address space 0x%" B_PRIx32 "\n",
3567 addressSpace->ID()));
3569 addressSpace->WriteLock();
3571 // remove all reserved areas in this address space
3572 addressSpace->UnreserveAllAddressRanges(0);
3574 // delete all the areas in this address space
3575 while (VMArea* area = addressSpace->FirstArea()) {
3576 ASSERT(!area->IsWired());
3577 delete_area(addressSpace, area, deletingAddressSpace);
3580 addressSpace->WriteUnlock();
3584 static area_id
3585 vm_area_for(addr_t address, bool kernel)
3587 team_id team;
3588 if (IS_USER_ADDRESS(address)) {
3589 // we try the user team address space, if any
3590 team = VMAddressSpace::CurrentID();
3591 if (team < 0)
3592 return team;
3593 } else
3594 team = VMAddressSpace::KernelID();
3596 AddressSpaceReadLocker locker(team);
3597 if (!locker.IsLocked())
3598 return B_BAD_TEAM_ID;
3600 VMArea* area = locker.AddressSpace()->LookupArea(address);
3601 if (area != NULL) {
3602 if (!kernel && (area->protection & (B_READ_AREA | B_WRITE_AREA)) == 0)
3603 return B_ERROR;
3605 return area->id;
3608 return B_ERROR;
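// Illustrative sketch (not part of the original source): how this helper
// resolves an address to the containing area, e.g. for a kernel pointer
// ("pointer" is hypothetical):
//
//   area_id area = vm_area_for((addr_t)pointer, true);
//   if (area < 0)
//       dprintf("no area contains %p\n", pointer);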
3612 /*! Frees physical pages that were used during the boot process.
3613 \a end is inclusive.
3615 static void
3616 unmap_and_free_physical_pages(VMTranslationMap* map, addr_t start, addr_t end)
3618 // free all physical pages in the specified range
3620 for (addr_t current = start; current < end; current += B_PAGE_SIZE) {
3621 phys_addr_t physicalAddress;
3622 uint32 flags;
3624 if (map->Query(current, &physicalAddress, &flags) == B_OK
3625 && (flags & PAGE_PRESENT) != 0) {
3626 vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
3627 if (page != NULL && page->State() != PAGE_STATE_FREE
3628 && page->State() != PAGE_STATE_CLEAR
3629 && page->State() != PAGE_STATE_UNUSED) {
3630 DEBUG_PAGE_ACCESS_START(page);
3631 vm_page_set_state(page, PAGE_STATE_FREE);
3636 // unmap the memory
3637 map->Unmap(start, end);
3641 void
3642 vm_free_unused_boot_loader_range(addr_t start, addr_t size)
3644 VMTranslationMap* map = VMAddressSpace::Kernel()->TranslationMap();
3645 addr_t end = start + (size - 1);
3646 addr_t lastEnd = start;
3648 TRACE(("vm_free_unused_boot_loader_range(): asked to free %p - %p\n",
3649 (void*)start, (void*)end));
3651 // The areas are sorted in virtual address space order, so
3652 // we just have to find the holes between them that fall
3653 // into the range we should dispose of
3655 map->Lock();
3657 for (VMAddressSpace::AreaIterator it
3658 = VMAddressSpace::Kernel()->GetAreaIterator();
3659 VMArea* area = it.Next();) {
3660 addr_t areaStart = area->Base();
3661 addr_t areaEnd = areaStart + (area->Size() - 1);
3663 if (areaEnd < start)
3664 continue;
3666 if (areaStart > end) {
3667 // we are done, the area is already beyond what we have to free
3668 break;
3671 if (areaStart > lastEnd) {
3672 // this is something we can free
3673 TRACE(("free boot range: get rid of %p - %p\n", (void*)lastEnd,
3674 (void*)areaStart));
3675 unmap_and_free_physical_pages(map, lastEnd, areaStart - 1);
3678 if (areaEnd >= end) {
3679 lastEnd = areaEnd;
3680 // no +1 to prevent potential overflow
3681 break;
3684 lastEnd = areaEnd + 1;
3687 if (lastEnd < end) {
3688 // we can also get rid of some space at the end of the area
3689 TRACE(("free boot range: also remove %p - %p\n", (void*)lastEnd,
3690 (void*)end));
3691 unmap_and_free_physical_pages(map, lastEnd, end);
3694 map->Unlock();
3698 static void
3699 create_preloaded_image_areas(struct preloaded_image* _image)
3701 preloaded_elf_image* image = static_cast<preloaded_elf_image*>(_image);
3702 char name[B_OS_NAME_LENGTH];
3703 void* address;
3704 int32 length;
3706 // use file name to create a good area name
3707 char* fileName = strrchr(image->name, '/');
3708 if (fileName == NULL)
3709 fileName = image->name;
3710 else
3711 fileName++;
3713 length = strlen(fileName);
3714 // make sure there is enough space for the suffix
3715 if (length > 25)
3716 length = 25;
3718 memcpy(name, fileName, length);
3719 strcpy(name + length, "_text");
3720 address = (void*)ROUNDDOWN(image->text_region.start, B_PAGE_SIZE);
3721 image->text_region.id = create_area(name, &address, B_EXACT_ADDRESS,
3722 PAGE_ALIGN(image->text_region.size), B_ALREADY_WIRED,
3723 B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3724 // this will later be remapped read-only/executable by the
3725 // ELF initialization code
3727 strcpy(name + length, "_data");
3728 address = (void*)ROUNDDOWN(image->data_region.start, B_PAGE_SIZE);
3729 image->data_region.id = create_area(name, &address, B_EXACT_ADDRESS,
3730 PAGE_ALIGN(image->data_region.size), B_ALREADY_WIRED,
3731 B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3735 /*! Frees all kernel argument areas that were previously created from the
3736 kernel_args structure. Any boot loader resources contained in those
3737 arguments must not be accessed anymore past this point.
3739 void
3740 vm_free_kernel_args(kernel_args* args)
3742 uint32 i;
3744 TRACE(("vm_free_kernel_args()\n"));
3746 for (i = 0; i < args->num_kernel_args_ranges; i++) {
3747 area_id area = area_for((void*)(addr_t)args->kernel_args_range[i].start);
3748 if (area >= B_OK)
3749 delete_area(area);
3754 static void
3755 allocate_kernel_args(kernel_args* args)
3757 TRACE(("allocate_kernel_args()\n"));
3759 for (uint32 i = 0; i < args->num_kernel_args_ranges; i++) {
3760 void* address = (void*)(addr_t)args->kernel_args_range[i].start;
3762 create_area("_kernel args_", &address, B_EXACT_ADDRESS,
3763 args->kernel_args_range[i].size, B_ALREADY_WIRED,
3764 B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3769 static void
3770 unreserve_boot_loader_ranges(kernel_args* args)
3772 TRACE(("unreserve_boot_loader_ranges()\n"));
3774 for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
3775 vm_unreserve_address_range(VMAddressSpace::KernelID(),
3776 (void*)(addr_t)args->virtual_allocated_range[i].start,
3777 args->virtual_allocated_range[i].size);
3782 static void
3783 reserve_boot_loader_ranges(kernel_args* args)
3785 TRACE(("reserve_boot_loader_ranges()\n"));
3787 for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
3788 void* address = (void*)(addr_t)args->virtual_allocated_range[i].start;
3790 // If the address is not a kernel address, we just skip it. The
3791 // architecture specific code has to deal with it.
3792 if (!IS_KERNEL_ADDRESS(address)) {
3793 dprintf("reserve_boot_loader_ranges(): Skipping range: %p, %"
3794 B_PRIu64 "\n", address, args->virtual_allocated_range[i].size);
3795 continue;
3798 status_t status = vm_reserve_address_range(VMAddressSpace::KernelID(),
3799 &address, B_EXACT_ADDRESS, args->virtual_allocated_range[i].size, 0);
3800 if (status < B_OK)
3801 panic("could not reserve boot loader ranges\n");
3806 static addr_t
3807 allocate_early_virtual(kernel_args* args, size_t size, addr_t alignment)
3809 size = PAGE_ALIGN(size);
3811 // find a slot in the virtual allocation addr range
3812 for (uint32 i = 1; i < args->num_virtual_allocated_ranges; i++) {
3813 // check to see if the space between this one and the last is big enough
3814 addr_t rangeStart = args->virtual_allocated_range[i].start;
3815 addr_t previousRangeEnd = args->virtual_allocated_range[i - 1].start
3816 + args->virtual_allocated_range[i - 1].size;
3818 addr_t base = alignment > 0
3819 ? ROUNDUP(previousRangeEnd, alignment) : previousRangeEnd;
3821 if (base >= KERNEL_BASE && base < rangeStart
3822 && rangeStart - base >= size) {
3823 args->virtual_allocated_range[i - 1].size
3824 += base + size - previousRangeEnd;
3825 return base;
3829 // We didn't find one between the allocation ranges. This is OK;
3830 // see if there's a gap after the last one.
3831 int lastEntryIndex = args->num_virtual_allocated_ranges - 1;
3832 addr_t lastRangeEnd = args->virtual_allocated_range[lastEntryIndex].start
3833 + args->virtual_allocated_range[lastEntryIndex].size;
3834 addr_t base = alignment > 0
3835 ? ROUNDUP(lastRangeEnd, alignment) : lastRangeEnd;
3836 if (KERNEL_BASE + (KERNEL_SIZE - 1) - base >= size) {
3837 args->virtual_allocated_range[lastEntryIndex].size
3838 += base + size - lastRangeEnd;
3839 return base;
3842 // see if there's a gap before the first one
3843 addr_t rangeStart = args->virtual_allocated_range[0].start;
3844 if (rangeStart > KERNEL_BASE && rangeStart - KERNEL_BASE >= size) {
3845 base = rangeStart - size;
3846 if (alignment > 0)
3847 base = ROUNDDOWN(base, alignment);
3849 if (base >= KERNEL_BASE) {
3850 args->virtual_allocated_range[0].start = base;
3851 args->virtual_allocated_range[0].size += rangeStart - base;
3852 return base;
3856 return 0;
3860 static bool
3861 is_page_in_physical_memory_range(kernel_args* args, phys_addr_t address)
3863 // TODO: horrible brute-force method of determining if the page can be
3864 // allocated
3865 for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3866 if (address >= args->physical_memory_range[i].start
3867 && address < args->physical_memory_range[i].start
3868 + args->physical_memory_range[i].size)
3869 return true;
3871 return false;
3875 page_num_t
3876 vm_allocate_early_physical_page(kernel_args* args)
3878 for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3879 phys_addr_t nextPage;
3881 nextPage = args->physical_allocated_range[i].start
3882 + args->physical_allocated_range[i].size;
3883 // see if the page after this allocated paddr run can be allocated
3884 if (i + 1 < args->num_physical_allocated_ranges
3885 && args->physical_allocated_range[i + 1].size != 0) {
3886 // see if the next page will collide with the next allocated range
3887 if (nextPage >= args->physical_allocated_range[i+1].start)
3888 continue;
3890 // see if the next physical page fits in the memory block
3891 if (is_page_in_physical_memory_range(args, nextPage)) {
3892 // we got one!
3893 args->physical_allocated_range[i].size += B_PAGE_SIZE;
3894 return nextPage / B_PAGE_SIZE;
3898 // Expanding upwards didn't work, try going downwards.
3899 for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3900 phys_addr_t nextPage;
3902 nextPage = args->physical_allocated_range[i].start - B_PAGE_SIZE;
3903 // see if the page before this allocated paddr run can be allocated
3904 if (i > 0 && args->physical_allocated_range[i - 1].size != 0) {
3905 // see if the page will collide with the previous allocated range
3906 if (nextPage < args->physical_allocated_range[i-1].start
3907 + args->physical_allocated_range[i-1].size)
3908 continue;
3910 // see if the next physical page fits in the memory block
3911 if (is_page_in_physical_memory_range(args, nextPage)) {
3912 // we got one!
3913 args->physical_allocated_range[i].start -= B_PAGE_SIZE;
3914 args->physical_allocated_range[i].size += B_PAGE_SIZE;
3915 return nextPage / B_PAGE_SIZE;
3919 return 0;
3920 // could not allocate a block
3924 /*! This one uses the kernel_args' physical and virtual memory ranges to
3925 allocate some pages before the VM is completely up.
3927 addr_t
3928 vm_allocate_early(kernel_args* args, size_t virtualSize, size_t physicalSize,
3929 uint32 attributes, addr_t alignment)
3931 if (physicalSize > virtualSize)
3932 physicalSize = virtualSize;
3934 // find the vaddr to allocate at
3935 addr_t virtualBase = allocate_early_virtual(args, virtualSize, alignment);
3936 //dprintf("vm_allocate_early: vaddr 0x%lx\n", virtualBase);
3937 if (virtualBase == 0) {
3938 panic("vm_allocate_early: could not allocate virtual address\n");
3939 return 0;
3942 // map the pages
3943 for (uint32 i = 0; i < PAGE_ALIGN(physicalSize) / B_PAGE_SIZE; i++) {
3944 page_num_t physicalAddress = vm_allocate_early_physical_page(args);
3945 if (physicalAddress == 0)
3946 panic("error allocating early page!\n");
3948 //dprintf("vm_allocate_early: paddr 0x%lx\n", physicalAddress);
3950 arch_vm_translation_map_early_map(args, virtualBase + i * B_PAGE_SIZE,
3951 physicalAddress * B_PAGE_SIZE, attributes,
3952 &vm_allocate_early_physical_page);
3955 return virtualBase;
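// Illustrative sketch (not part of the original source): how a boot-time
// caller might use vm_allocate_early() to get a page of kernel read/write
// memory before the VM is fully up; vm_init() below uses the same pattern
// for the initial heap. The helper name is hypothetical.
#if 0
static addr_t
allocate_early_scratch_page(kernel_args* args)
{
	// one page of virtual and physical space, no particular alignment
	addr_t page = vm_allocate_early(args, B_PAGE_SIZE, B_PAGE_SIZE,
		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
	if (page == 0)
		panic("could not allocate early scratch page");
	return page;
}
#endif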
3959 /*! The main entrance point to initialize the VM. */
3960 status_t
3961 vm_init(kernel_args* args)
3963 struct preloaded_image* image;
3964 void* address;
3965 status_t err = 0;
3966 uint32 i;
3968 TRACE(("vm_init: entry\n"));
3969 err = arch_vm_translation_map_init(args, &sPhysicalPageMapper);
3970 err = arch_vm_init(args);
3972 // initialize some globals
3973 vm_page_init_num_pages(args);
3974 sAvailableMemory = vm_page_num_pages() * B_PAGE_SIZE;
3976 slab_init(args);
3978 #if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
3979 off_t heapSize = INITIAL_HEAP_SIZE;
3980 	// try to accommodate low-memory systems
3981 while (heapSize > sAvailableMemory / 8)
3982 heapSize /= 2;
3983 if (heapSize < 1024 * 1024)
3984 panic("vm_init: go buy some RAM please.");
3986 // map in the new heap and initialize it
3987 addr_t heapBase = vm_allocate_early(args, heapSize, heapSize,
3988 B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3989 TRACE(("heap at 0x%lx\n", heapBase));
3990 heap_init(heapBase, heapSize);
3991 #endif
3993 // initialize the free page list and physical page mapper
3994 vm_page_init(args);
3996 // initialize the cache allocators
3997 vm_cache_init(args);
4000 status_t error = VMAreaHash::Init();
4001 if (error != B_OK)
4002 panic("vm_init: error initializing area hash table\n");
4005 VMAddressSpace::Init();
4006 reserve_boot_loader_ranges(args);
4008 #if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4009 heap_init_post_area();
4010 #endif
4012 	// Do any further initialization that the architecture dependent layers may
4013 // need now
4014 arch_vm_translation_map_init_post_area(args);
4015 arch_vm_init_post_area(args);
4016 vm_page_init_post_area(args);
4017 slab_init_post_area();
4019 // allocate areas to represent stuff that already exists
4021 #if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4022 address = (void*)ROUNDDOWN(heapBase, B_PAGE_SIZE);
4023 create_area("kernel heap", &address, B_EXACT_ADDRESS, heapSize,
4024 B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4025 #endif
4027 allocate_kernel_args(args);
4029 create_preloaded_image_areas(args->kernel_image);
4031 // allocate areas for preloaded images
4032 for (image = args->preloaded_images; image != NULL; image = image->next)
4033 create_preloaded_image_areas(image);
4035 // allocate kernel stacks
4036 for (i = 0; i < args->num_cpus; i++) {
4037 char name[64];
4039 sprintf(name, "idle thread %" B_PRIu32 " kstack", i + 1);
4040 address = (void*)args->cpu_kstack[i].start;
4041 create_area(name, &address, B_EXACT_ADDRESS, args->cpu_kstack[i].size,
4042 B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
4045 void* lastPage = (void*)ROUNDDOWN(~(addr_t)0, B_PAGE_SIZE);
4046 vm_block_address_range("overflow protection", lastPage, B_PAGE_SIZE);
4048 #if PARANOID_KERNEL_MALLOC
4049 vm_block_address_range("uninitialized heap memory",
4050 (void *)ROUNDDOWN(0xcccccccc, B_PAGE_SIZE), B_PAGE_SIZE * 64);
4051 #endif
4052 #if PARANOID_KERNEL_FREE
4053 vm_block_address_range("freed heap memory",
4054 (void *)ROUNDDOWN(0xdeadbeef, B_PAGE_SIZE), B_PAGE_SIZE * 64);
4055 #endif
4057 // create the object cache for the page mappings
4058 gPageMappingsObjectCache = create_object_cache_etc("page mappings",
4059 sizeof(vm_page_mapping), 0, 0, 64, 128, CACHE_LARGE_SLAB, NULL, NULL,
4060 NULL, NULL);
4061 if (gPageMappingsObjectCache == NULL)
4062 panic("failed to create page mappings object cache");
4064 object_cache_set_minimum_reserve(gPageMappingsObjectCache, 1024);
4066 #if DEBUG_CACHE_LIST
4067 if (vm_page_num_free_pages() >= 200 * 1024 * 1024 / B_PAGE_SIZE) {
4068 virtual_address_restrictions virtualRestrictions = {};
4069 virtualRestrictions.address_specification = B_ANY_KERNEL_ADDRESS;
4070 physical_address_restrictions physicalRestrictions = {};
4071 create_area_etc(VMAddressSpace::KernelID(), "cache info table",
4072 ROUNDUP(kCacheInfoTableCount * sizeof(cache_info), B_PAGE_SIZE),
4073 B_FULL_LOCK, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA,
4074 CREATE_AREA_DONT_WAIT, 0, &virtualRestrictions,
4075 &physicalRestrictions, (void**)&sCacheInfoTable);
4077 #endif // DEBUG_CACHE_LIST
4079 // add some debugger commands
4080 add_debugger_command("areas", &dump_area_list, "Dump a list of all areas");
4081 add_debugger_command("area", &dump_area,
4082 "Dump info about a particular area");
4083 add_debugger_command("cache", &dump_cache, "Dump VMCache");
4084 add_debugger_command("cache_tree", &dump_cache_tree, "Dump VMCache tree");
4085 #if DEBUG_CACHE_LIST
4086 if (sCacheInfoTable != NULL) {
4087 add_debugger_command_etc("caches", &dump_caches,
4088 "List all VMCache trees",
4089 "[ \"-c\" ]\n"
4090 "All cache trees are listed sorted in decreasing order by number "
4091 "of\n"
4092 "used pages or, if \"-c\" is specified, by size of committed "
4093 "memory.\n",
4096 #endif
4097 add_debugger_command("avail", &dump_available_memory,
4098 "Dump available memory");
4099 add_debugger_command("dl", &display_mem, "dump memory long words (64-bit)");
4100 add_debugger_command("dw", &display_mem, "dump memory words (32-bit)");
4101 add_debugger_command("ds", &display_mem, "dump memory shorts (16-bit)");
4102 add_debugger_command("db", &display_mem, "dump memory bytes (8-bit)");
4103 add_debugger_command("string", &display_mem, "dump strings");
4105 add_debugger_command_etc("mapping", &dump_mapping_info,
4106 "Print address mapping information",
4107 "[ \"-r\" | \"-p\" ] <address> [ <thread ID> ]\n"
4108 "Prints low-level page mapping information for a given address. If\n"
4109 "neither \"-r\" nor \"-p\" are specified, <address> is a virtual\n"
4110 "address that is looked up in the translation map of the current\n"
4111 "team, respectively the team specified by thread ID <thread ID>. If\n"
4112 "\"-r\" is specified, <address> is a physical address that is\n"
4113 "searched in the translation map of all teams, respectively the team\n"
4114 "specified by thread ID <thread ID>. If \"-p\" is specified,\n"
4115 "<address> is the address of a vm_page structure. The behavior is\n"
4116 "equivalent to specifying \"-r\" with the physical address of that\n"
4117 "page.\n",
4120 TRACE(("vm_init: exit\n"));
4122 vm_cache_init_post_heap();
4124 return err;
4128 status_t
4129 vm_init_post_sem(kernel_args* args)
4131 	// This frees all unused boot loader resources and makes their space available
4132 // again
4133 arch_vm_init_end(args);
4134 unreserve_boot_loader_ranges(args);
4136 	// Fill in all of the semaphores that were not allocated before. Since
4137 	// we're still single-threaded and only the kernel address space exists,
4138 	// it isn't that hard to find all of the ones we need to create.
4140 arch_vm_translation_map_init_post_sem(args);
4142 slab_init_post_sem();
4144 #if USE_DEBUG_HEAP_FOR_MALLOC || USE_GUARDED_HEAP_FOR_MALLOC
4145 heap_init_post_sem();
4146 #endif
4148 return B_OK;
4152 status_t
4153 vm_init_post_thread(kernel_args* args)
4155 vm_page_init_post_thread(args);
4156 slab_init_post_thread();
4157 return heap_init_post_thread();
4161 status_t
4162 vm_init_post_modules(kernel_args* args)
4164 return arch_vm_init_post_modules(args);
4168 void
4169 permit_page_faults(void)
4171 Thread* thread = thread_get_current_thread();
4172 if (thread != NULL)
4173 atomic_add(&thread->page_faults_allowed, 1);
4177 void
4178 forbid_page_faults(void)
4180 Thread* thread = thread_get_current_thread();
4181 if (thread != NULL)
4182 atomic_add(&thread->page_faults_allowed, -1);
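// Illustrative sketch (assumption, not part of the original source): the two
// functions above only maintain the per-thread page_faults_allowed counter,
// so they must be used as a strictly balanced pair; presumably the
// architecture's fault handler treats a fault while the counter is not
// positive as an error. The helper below is hypothetical.
#if 0
static void
run_with_page_faults_forbidden(void (*function)())
{
	forbid_page_faults();
		// counter decremented on entry ...
	function();
	permit_page_faults();
		// ... and incremented back on exit
}
#endif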
4186 status_t
4187 vm_page_fault(addr_t address, addr_t faultAddress, bool isWrite, bool isExecute,
4188 bool isUser, addr_t* newIP)
4190 FTRACE(("vm_page_fault: page fault at 0x%lx, ip 0x%lx\n", address,
4191 faultAddress));
4193 TPF(PageFaultStart(address, isWrite, isUser, faultAddress));
4195 addr_t pageAddress = ROUNDDOWN(address, B_PAGE_SIZE);
4196 VMAddressSpace* addressSpace = NULL;
4198 status_t status = B_OK;
4199 *newIP = 0;
4200 atomic_add((int32*)&sPageFaults, 1);
4202 if (IS_KERNEL_ADDRESS(pageAddress)) {
4203 addressSpace = VMAddressSpace::GetKernel();
4204 } else if (IS_USER_ADDRESS(pageAddress)) {
4205 addressSpace = VMAddressSpace::GetCurrent();
4206 if (addressSpace == NULL) {
4207 if (!isUser) {
4208 dprintf("vm_page_fault: kernel thread accessing invalid user "
4209 "memory!\n");
4210 status = B_BAD_ADDRESS;
4211 TPF(PageFaultError(-1,
4212 VMPageFaultTracing
4213 ::PAGE_FAULT_ERROR_KERNEL_BAD_USER_MEMORY));
4214 } else {
4215 // XXX weird state.
4216 panic("vm_page_fault: non kernel thread accessing user memory "
4217 "that doesn't exist!\n");
4218 status = B_BAD_ADDRESS;
4221 } else {
4222 			// The hit was probably in the 64k DMZ between kernel and user space;
4223 			// this keeps a user space thread from passing a buffer that crosses
4224 			// into kernel space.
4225 status = B_BAD_ADDRESS;
4226 TPF(PageFaultError(-1,
4227 VMPageFaultTracing::PAGE_FAULT_ERROR_NO_ADDRESS_SPACE));
4230 if (status == B_OK) {
4231 status = vm_soft_fault(addressSpace, pageAddress, isWrite, isExecute,
4232 isUser, NULL);
4235 if (status < B_OK) {
4236 dprintf("vm_page_fault: vm_soft_fault returned error '%s' on fault at "
4237 "0x%lx, ip 0x%lx, write %d, user %d, thread 0x%" B_PRIx32 "\n",
4238 strerror(status), address, faultAddress, isWrite, isUser,
4239 thread_get_current_thread_id());
4240 if (!isUser) {
4241 Thread* thread = thread_get_current_thread();
4242 if (thread != NULL && thread->fault_handler != 0) {
4243 // this will cause the arch dependant page fault handler to
4244 // modify the IP on the interrupt frame or whatever to return
4245 // to this address
4246 *newIP = reinterpret_cast<uintptr_t>(thread->fault_handler);
4247 } else {
4248 // unhandled page fault in the kernel
4249 panic("vm_page_fault: unhandled page fault in kernel space at "
4250 "0x%lx, ip 0x%lx\n", address, faultAddress);
4252 } else {
4253 #if 1
4254 // TODO: remove me once we have proper userland debugging support
4255 // (and tools)
4256 VMArea* area = NULL;
4257 if (addressSpace != NULL) {
4258 addressSpace->ReadLock();
4259 area = addressSpace->LookupArea(faultAddress);
4262 Thread* thread = thread_get_current_thread();
4263 dprintf("vm_page_fault: thread \"%s\" (%" B_PRId32 ") in team "
4264 "\"%s\" (%" B_PRId32 ") tried to %s address %#lx, ip %#lx "
4265 "(\"%s\" +%#lx)\n", thread->name, thread->id,
4266 thread->team->Name(), thread->team->id,
4267 isWrite ? "write" : (isExecute ? "execute" : "read"), address,
4268 faultAddress, area ? area->name : "???", faultAddress - (area ?
4269 area->Base() : 0x0));
4271 // We can print a stack trace of the userland thread here.
4272 // TODO: The user_memcpy() below can cause a deadlock, if it causes a page
4273 // fault and someone is already waiting for a write lock on the same address
4274 // space. This thread will then try to acquire the lock again and will
4275 // be queued after the writer.
4276 # if 0
4277 if (area) {
4278 struct stack_frame {
4279 #if defined(__INTEL__) || defined(__POWERPC__) || defined(__M68K__)
4280 struct stack_frame* previous;
4281 void* return_address;
4282 #else
4283 // ...
4284 #warning writeme
4285 #endif
4286 } frame;
4287 # ifdef __INTEL__
4288 struct iframe* iframe = x86_get_user_iframe();
4289 if (iframe == NULL)
4290 panic("iframe is NULL!");
4292 status_t status = user_memcpy(&frame, (void*)iframe->ebp,
4293 sizeof(struct stack_frame));
4294 # elif defined(__POWERPC__)
4295 struct iframe* iframe = ppc_get_user_iframe();
4296 if (iframe == NULL)
4297 panic("iframe is NULL!");
4299 status_t status = user_memcpy(&frame, (void*)iframe->r1,
4300 sizeof(struct stack_frame));
4301 # else
4302 # warning "vm_page_fault() stack trace won't work"
4303 status = B_ERROR;
4304 # endif
4306 dprintf("stack trace:\n");
4307 int32 maxFrames = 50;
4308 while (status == B_OK && --maxFrames >= 0
4309 && frame.return_address != NULL) {
4310 dprintf(" %p", frame.return_address);
4311 area = addressSpace->LookupArea(
4312 (addr_t)frame.return_address);
4313 if (area) {
4314 dprintf(" (%s + %#lx)", area->name,
4315 (addr_t)frame.return_address - area->Base());
4317 dprintf("\n");
4319 status = user_memcpy(&frame, frame.previous,
4320 sizeof(struct stack_frame));
4323 # endif // 0 (stack trace)
4325 if (addressSpace != NULL)
4326 addressSpace->ReadUnlock();
4327 #endif
4329 // If the thread has a signal handler for SIGSEGV, we simply
4330 // send it the signal. Otherwise we notify the user debugger
4331 // first.
4332 struct sigaction action;
4333 if ((sigaction(SIGSEGV, NULL, &action) == 0
4334 && action.sa_handler != SIG_DFL
4335 && action.sa_handler != SIG_IGN)
4336 || user_debug_exception_occurred(B_SEGMENT_VIOLATION,
4337 SIGSEGV)) {
4338 Signal signal(SIGSEGV,
4339 status == B_PERMISSION_DENIED
4340 ? SEGV_ACCERR : SEGV_MAPERR,
4341 EFAULT, thread->team->id);
4342 signal.SetAddress((void*)address);
4343 send_signal_to_thread(thread, signal, 0);
4348 if (addressSpace != NULL)
4349 addressSpace->Put();
4351 return B_HANDLED_INTERRUPT;
4355 struct PageFaultContext {
4356 AddressSpaceReadLocker addressSpaceLocker;
4357 VMCacheChainLocker cacheChainLocker;
4359 VMTranslationMap* map;
4360 VMCache* topCache;
4361 off_t cacheOffset;
4362 vm_page_reservation reservation;
4363 bool isWrite;
4365 // return values
4366 vm_page* page;
4367 bool restart;
4368 bool pageAllocated;
4371 PageFaultContext(VMAddressSpace* addressSpace, bool isWrite)
4373 addressSpaceLocker(addressSpace, true),
4374 map(addressSpace->TranslationMap()),
4375 isWrite(isWrite)
4379 ~PageFaultContext()
4381 UnlockAll();
4382 vm_page_unreserve_pages(&reservation);
4385 void Prepare(VMCache* topCache, off_t cacheOffset)
4387 this->topCache = topCache;
4388 this->cacheOffset = cacheOffset;
4389 page = NULL;
4390 restart = false;
4391 pageAllocated = false;
4393 cacheChainLocker.SetTo(topCache);
4396 void UnlockAll(VMCache* exceptCache = NULL)
4398 topCache = NULL;
4399 addressSpaceLocker.Unlock();
4400 cacheChainLocker.Unlock(exceptCache);
4405 /*! Gets the page that should be mapped into the area.
4406 Returns an error code other than \c B_OK, if the page couldn't be found or
4407 paged in. The locking state of the address space and the caches is undefined
4408 in that case.
4409 	Returns \c B_OK with \c context.restart set to \c true, if the function
4410 had to unlock the address space and all caches and is supposed to be called
4411 again.
4412 Returns \c B_OK with \c context.restart set to \c false, if the page was
4413 found. It is returned in \c context.page. The address space will still be
4414 locked as well as all caches starting from the top cache to at least the
4415 cache the page lives in.
4417 static status_t
4418 fault_get_page(PageFaultContext& context)
4420 VMCache* cache = context.topCache;
4421 VMCache* lastCache = NULL;
4422 vm_page* page = NULL;
4424 while (cache != NULL) {
4425 // We already hold the lock of the cache at this point.
4427 lastCache = cache;
4429 page = cache->LookupPage(context.cacheOffset);
4430 if (page != NULL && page->busy) {
4431 // page must be busy -- wait for it to become unbusy
4432 context.UnlockAll(cache);
4433 cache->ReleaseRefLocked();
4434 cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, false);
4436 // restart the whole process
4437 context.restart = true;
4438 return B_OK;
4441 if (page != NULL)
4442 break;
4444 // The current cache does not contain the page we're looking for.
4446 // see if the backing store has it
4447 if (cache->HasPage(context.cacheOffset)) {
4448 // insert a fresh page and mark it busy -- we're going to read it in
4449 page = vm_page_allocate_page(&context.reservation,
4450 PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_BUSY);
4451 cache->InsertPage(page, context.cacheOffset);
4453 // We need to unlock all caches and the address space while reading
4454 // the page in. Keep a reference to the cache around.
4455 cache->AcquireRefLocked();
4456 context.UnlockAll();
4458 // read the page in
4459 generic_io_vec vec;
4460 vec.base = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;
4461 generic_size_t bytesRead = vec.length = B_PAGE_SIZE;
4463 status_t status = cache->Read(context.cacheOffset, &vec, 1,
4464 B_PHYSICAL_IO_REQUEST, &bytesRead);
4466 cache->Lock();
4468 if (status < B_OK) {
4469 // on error remove and free the page
4470 dprintf("reading page from cache %p returned: %s!\n",
4471 cache, strerror(status));
4473 cache->NotifyPageEvents(page, PAGE_EVENT_NOT_BUSY);
4474 cache->RemovePage(page);
4475 vm_page_set_state(page, PAGE_STATE_FREE);
4477 cache->ReleaseRefAndUnlock();
4478 return status;
4481 // mark the page unbusy again
4482 cache->MarkPageUnbusy(page);
4484 DEBUG_PAGE_ACCESS_END(page);
4486 // Since we needed to unlock everything temporarily, the area
4487 // situation might have changed. So we need to restart the whole
4488 // process.
4489 cache->ReleaseRefAndUnlock();
4490 context.restart = true;
4491 return B_OK;
4494 cache = context.cacheChainLocker.LockSourceCache();
4497 if (page == NULL) {
4498 		// There was no adequate page; determine the cache for a clean one.
4499 		// Read-only pages go into the deepest cache; only the topmost cache
4500 // may have direct write access.
4501 cache = context.isWrite ? context.topCache : lastCache;
4503 // allocate a clean page
4504 page = vm_page_allocate_page(&context.reservation,
4505 PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_CLEAR);
4506 FTRACE(("vm_soft_fault: just allocated page 0x%" B_PRIxPHYSADDR "\n",
4507 page->physical_page_number));
4509 // insert the new page into our cache
4510 cache->InsertPage(page, context.cacheOffset);
4511 context.pageAllocated = true;
4512 } else if (page->Cache() != context.topCache && context.isWrite) {
4513 // We have a page that has the data we want, but in the wrong cache
4514 // object so we need to copy it and stick it into the top cache.
4515 vm_page* sourcePage = page;
4517 // TODO: If memory is low, it might be a good idea to steal the page
4518 // from our source cache -- if possible, that is.
4519 FTRACE(("get new page, copy it, and put it into the topmost cache\n"));
4520 page = vm_page_allocate_page(&context.reservation, PAGE_STATE_ACTIVE);
4522 		// To avoid needlessly killing concurrency, we unlock all caches but the top
4523 // one while copying the page. Lacking another mechanism to ensure that
4524 // the source page doesn't disappear, we mark it busy.
4525 sourcePage->busy = true;
4526 context.cacheChainLocker.UnlockKeepRefs(true);
4528 // copy the page
4529 vm_memcpy_physical_page(page->physical_page_number * B_PAGE_SIZE,
4530 sourcePage->physical_page_number * B_PAGE_SIZE);
4532 context.cacheChainLocker.RelockCaches(true);
4533 sourcePage->Cache()->MarkPageUnbusy(sourcePage);
4535 // insert the new page into our cache
4536 context.topCache->InsertPage(page, context.cacheOffset);
4537 context.pageAllocated = true;
4538 } else
4539 DEBUG_PAGE_ACCESS_START(page);
4541 context.page = page;
4542 return B_OK;
4546 /*! Makes sure the address in the given address space is mapped.
4548 \param addressSpace The address space.
4549 \param originalAddress The address. Doesn't need to be page aligned.
4550 	\param isWrite If \c true the address shall be write-accessible.
	\param isExecute If \c true the address shall be executable.
4551 \param isUser If \c true the access is requested by a userland team.
4552 \param wirePage On success, if non \c NULL, the wired count of the page
4553 mapped at the given address is incremented and the page is returned
4554 via this parameter.
4555 \return \c B_OK on success, another error code otherwise.
4557 static status_t
4558 vm_soft_fault(VMAddressSpace* addressSpace, addr_t originalAddress,
4559 bool isWrite, bool isExecute, bool isUser, vm_page** wirePage)
4561 FTRACE(("vm_soft_fault: thid 0x%" B_PRIx32 " address 0x%" B_PRIxADDR ", "
4562 "isWrite %d, isUser %d\n", thread_get_current_thread_id(),
4563 originalAddress, isWrite, isUser));
4565 PageFaultContext context(addressSpace, isWrite);
4567 addr_t address = ROUNDDOWN(originalAddress, B_PAGE_SIZE);
4568 status_t status = B_OK;
4570 addressSpace->IncrementFaultCount();
4572 // We may need up to 2 pages plus pages needed for mapping them -- reserving
4573 // the pages upfront makes sure we don't have any cache locked, so that the
4574 // page daemon/thief can do their job without problems.
4575 size_t reservePages = 2 + context.map->MaxPagesNeededToMap(originalAddress,
4576 originalAddress);
4577 context.addressSpaceLocker.Unlock();
4578 vm_page_reserve_pages(&context.reservation, reservePages,
4579 addressSpace == VMAddressSpace::Kernel()
4580 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER);
4582 while (true) {
4583 context.addressSpaceLocker.Lock();
4585 // get the area the fault was in
4586 VMArea* area = addressSpace->LookupArea(address);
4587 if (area == NULL) {
4588 dprintf("vm_soft_fault: va 0x%lx not covered by area in address "
4589 "space\n", originalAddress);
4590 TPF(PageFaultError(-1,
4591 VMPageFaultTracing::PAGE_FAULT_ERROR_NO_AREA));
4592 status = B_BAD_ADDRESS;
4593 break;
4596 // check permissions
4597 uint32 protection = get_area_page_protection(area, address);
4598 if (isUser && (protection & B_USER_PROTECTION) == 0) {
4599 dprintf("user access on kernel area 0x%" B_PRIx32 " at %p\n",
4600 area->id, (void*)originalAddress);
4601 TPF(PageFaultError(area->id,
4602 VMPageFaultTracing::PAGE_FAULT_ERROR_KERNEL_ONLY));
4603 status = B_PERMISSION_DENIED;
4604 break;
4606 if (isWrite && (protection
4607 & (B_WRITE_AREA | (isUser ? 0 : B_KERNEL_WRITE_AREA))) == 0) {
4608 dprintf("write access attempted on write-protected area 0x%"
4609 B_PRIx32 " at %p\n", area->id, (void*)originalAddress);
4610 TPF(PageFaultError(area->id,
4611 VMPageFaultTracing::PAGE_FAULT_ERROR_WRITE_PROTECTED));
4612 status = B_PERMISSION_DENIED;
4613 break;
4614 } else if (isExecute && (protection
4615 & (B_EXECUTE_AREA
4616 | (isUser ? 0 : B_KERNEL_EXECUTE_AREA))) == 0) {
4617 dprintf("instruction fetch attempted on execute-protected area 0x%"
4618 B_PRIx32 " at %p\n", area->id, (void*)originalAddress);
4619 TPF(PageFaultError(area->id,
4620 VMPageFaultTracing::PAGE_FAULT_ERROR_EXECUTE_PROTECTED));
4621 status = B_PERMISSION_DENIED;
4622 break;
4623 } else if (!isWrite && !isExecute && (protection
4624 & (B_READ_AREA | (isUser ? 0 : B_KERNEL_READ_AREA))) == 0) {
4625 dprintf("read access attempted on read-protected area 0x%" B_PRIx32
4626 " at %p\n", area->id, (void*)originalAddress);
4627 TPF(PageFaultError(area->id,
4628 VMPageFaultTracing::PAGE_FAULT_ERROR_READ_PROTECTED));
4629 status = B_PERMISSION_DENIED;
4630 break;
4633 // We have the area, it was a valid access, so let's try to resolve the
4634 // page fault now.
4635 // At first, the top most cache from the area is investigated.
4637 context.Prepare(vm_area_get_locked_cache(area),
4638 address - area->Base() + area->cache_offset);
4640 // See if this cache has a fault handler -- this will do all the work
4641 // for us.
4643 // Note, since the page fault is resolved with interrupts enabled,
4644 // the fault handler could be called more than once for the same
4645 // reason -- the store must take this into account.
4646 status = context.topCache->Fault(addressSpace, context.cacheOffset);
4647 if (status != B_BAD_HANDLER)
4648 break;
4651 // The top most cache has no fault handler, so let's see if the cache or
4652 // its sources already have the page we're searching for (we're going
4653 // from top to bottom).
4654 status = fault_get_page(context);
4655 if (status != B_OK) {
4656 TPF(PageFaultError(area->id, status));
4657 break;
4660 if (context.restart)
4661 continue;
4663 // All went fine, all there is left to do is to map the page into the
4664 // address space.
4665 TPF(PageFaultDone(area->id, context.topCache, context.page->Cache(),
4666 context.page));
4668 // If the page doesn't reside in the area's cache, we need to make sure
4669 		// it's mapped read-only, so that we cannot overwrite someone else's
4670 		// data (copy-on-write).
4671 uint32 newProtection = protection;
4672 if (context.page->Cache() != context.topCache && !isWrite)
4673 newProtection &= ~(B_WRITE_AREA | B_KERNEL_WRITE_AREA);
4675 bool unmapPage = false;
4676 bool mapPage = true;
4678 // check whether there's already a page mapped at the address
4679 context.map->Lock();
4681 phys_addr_t physicalAddress;
4682 uint32 flags;
4683 vm_page* mappedPage = NULL;
4684 if (context.map->Query(address, &physicalAddress, &flags) == B_OK
4685 && (flags & PAGE_PRESENT) != 0
4686 && (mappedPage = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
4687 != NULL) {
4688 // Yep there's already a page. If it's ours, we can simply adjust
4689 // its protection. Otherwise we have to unmap it.
4690 if (mappedPage == context.page) {
4691 context.map->ProtectPage(area, address, newProtection);
4692 // Note: We assume that ProtectPage() is atomic (i.e.
4693 // the page isn't temporarily unmapped), otherwise we'd have
4694 // to make sure it isn't wired.
4695 mapPage = false;
4696 } else
4697 unmapPage = true;
4700 context.map->Unlock();
4702 if (unmapPage) {
4703 // If the page is wired, we can't unmap it. Wait until it is unwired
4704 // again and restart. Note that the page cannot be wired for
4705 			// writing, since it isn't in the topmost cache. So we can safely
4706 // ignore ranges wired for writing (our own and other concurrent
4707 // wiring attempts in progress) and in fact have to do that to avoid
4708 // a deadlock.
4709 VMAreaUnwiredWaiter waiter;
4710 if (area->AddWaiterIfWired(&waiter, address, B_PAGE_SIZE,
4711 VMArea::IGNORE_WRITE_WIRED_RANGES)) {
4712 // unlock everything and wait
4713 if (context.pageAllocated) {
4714 // ... but since we allocated a page and inserted it into
4715 // the top cache, remove and free it first. Otherwise we'd
4716 // have a page from a lower cache mapped while an upper
4717 // cache has a page that would shadow it.
4718 context.topCache->RemovePage(context.page);
4719 vm_page_free_etc(context.topCache, context.page,
4720 &context.reservation);
4721 } else
4722 DEBUG_PAGE_ACCESS_END(context.page);
4724 context.UnlockAll();
4725 waiter.waitEntry.Wait();
4726 continue;
4729 // Note: The mapped page is a page of a lower cache. We are
4730 			// guaranteed to have that cache locked, our new page is a copy of
4731 // that page, and the page is not busy. The logic for that guarantee
4732 // is as follows: Since the page is mapped, it must live in the top
4733 // cache (ruled out above) or any of its lower caches, and there is
4734 // (was before the new page was inserted) no other page in any
4735 // cache between the top cache and the page's cache (otherwise that
4736 // would be mapped instead). That in turn means that our algorithm
4737 // must have found it and therefore it cannot be busy either.
4738 DEBUG_PAGE_ACCESS_START(mappedPage);
4739 unmap_page(area, address);
4740 DEBUG_PAGE_ACCESS_END(mappedPage);
4743 if (mapPage) {
4744 if (map_page(area, context.page, address, newProtection,
4745 &context.reservation) != B_OK) {
4746 				// Mapping can only fail when the page mapping object couldn't
4747 // be allocated. Save for the missing mapping everything is
4748 // fine, though. If this was a regular page fault, we'll simply
4749 // leave and probably fault again. To make sure we'll have more
4750 // luck then, we ensure that the minimum object reserve is
4751 // available.
4752 DEBUG_PAGE_ACCESS_END(context.page);
4754 context.UnlockAll();
4756 if (object_cache_reserve(gPageMappingsObjectCache, 1, 0)
4757 != B_OK) {
4758 // Apparently the situation is serious. Let's get ourselves
4759 // killed.
4760 status = B_NO_MEMORY;
4761 } else if (wirePage != NULL) {
4762 // The caller expects us to wire the page. Since
4763 // object_cache_reserve() succeeded, we should now be able
4764 // to allocate a mapping structure. Restart.
4765 continue;
4768 break;
4770 } else if (context.page->State() == PAGE_STATE_INACTIVE)
4771 vm_page_set_state(context.page, PAGE_STATE_ACTIVE);
4773 // also wire the page, if requested
4774 if (wirePage != NULL && status == B_OK) {
4775 increment_page_wired_count(context.page);
4776 *wirePage = context.page;
4779 DEBUG_PAGE_ACCESS_END(context.page);
4781 break;
4784 return status;
4788 status_t
4789 vm_get_physical_page(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
4791 return sPhysicalPageMapper->GetPage(paddr, _vaddr, _handle);
4794 status_t
4795 vm_put_physical_page(addr_t vaddr, void* handle)
4797 return sPhysicalPageMapper->PutPage(vaddr, handle);
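// Illustrative sketch (not part of the original source): temporarily mapping
// a physical page via the physical page mapper, touching it, and releasing
// the mapping again. Zeroing is just an arbitrary example of "touching";
// vm_memset_physical() below would achieve the same without an explicit
// mapping. The helper is hypothetical.
#if 0
static status_t
zero_physical_page(phys_addr_t physicalAddress)
{
	addr_t virtualAddress;
	void* handle;
	status_t status = vm_get_physical_page(physicalAddress, &virtualAddress,
		&handle);
	if (status != B_OK)
		return status;

	memset((void*)virtualAddress, 0, B_PAGE_SIZE);

	return vm_put_physical_page(virtualAddress, handle);
}
#endif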
4801 status_t
4802 vm_get_physical_page_current_cpu(phys_addr_t paddr, addr_t* _vaddr,
4803 void** _handle)
4805 return sPhysicalPageMapper->GetPageCurrentCPU(paddr, _vaddr, _handle);
4808 status_t
4809 vm_put_physical_page_current_cpu(addr_t vaddr, void* handle)
4811 return sPhysicalPageMapper->PutPageCurrentCPU(vaddr, handle);
4815 status_t
4816 vm_get_physical_page_debug(phys_addr_t paddr, addr_t* _vaddr, void** _handle)
4818 return sPhysicalPageMapper->GetPageDebug(paddr, _vaddr, _handle);
4821 status_t
4822 vm_put_physical_page_debug(addr_t vaddr, void* handle)
4824 return sPhysicalPageMapper->PutPageDebug(vaddr, handle);
4828 void
4829 vm_get_info(system_info* info)
4831 swap_get_info(info);
4833 MutexLocker locker(sAvailableMemoryLock);
4834 info->needed_memory = sNeededMemory;
4835 info->free_memory = sAvailableMemory;
4839 uint32
4840 vm_num_page_faults(void)
4842 return sPageFaults;
4846 off_t
4847 vm_available_memory(void)
4849 MutexLocker locker(sAvailableMemoryLock);
4850 return sAvailableMemory;
4854 off_t
4855 vm_available_not_needed_memory(void)
4857 MutexLocker locker(sAvailableMemoryLock);
4858 return sAvailableMemory - sNeededMemory;
4862 /*! Like vm_available_not_needed_memory(), but only for use in the kernel
4863 debugger.
4865 off_t
4866 vm_available_not_needed_memory_debug(void)
4868 return sAvailableMemory - sNeededMemory;
4872 size_t
4873 vm_kernel_address_space_left(void)
4875 return VMAddressSpace::Kernel()->FreeSpace();
4879 void
4880 vm_unreserve_memory(size_t amount)
4882 mutex_lock(&sAvailableMemoryLock);
4884 sAvailableMemory += amount;
4886 mutex_unlock(&sAvailableMemoryLock);
4890 status_t
4891 vm_try_reserve_memory(size_t amount, int priority, bigtime_t timeout)
4893 size_t reserve = kMemoryReserveForPriority[priority];
4895 MutexLocker locker(sAvailableMemoryLock);
4897 //dprintf("try to reserve %lu bytes, %Lu left\n", amount, sAvailableMemory);
4899 if (sAvailableMemory >= (off_t)(amount + reserve)) {
4900 sAvailableMemory -= amount;
4901 return B_OK;
4904 if (timeout <= 0)
4905 return B_NO_MEMORY;
4907 // turn timeout into an absolute timeout
4908 timeout += system_time();
4910 // loop until we've got the memory or the timeout occurs
4911 do {
4912 sNeededMemory += amount;
4914 // call the low resource manager
4915 locker.Unlock();
4916 low_resource(B_KERNEL_RESOURCE_MEMORY, sNeededMemory - sAvailableMemory,
4917 B_ABSOLUTE_TIMEOUT, timeout);
4918 locker.Lock();
4920 sNeededMemory -= amount;
4922 if (sAvailableMemory >= (off_t)(amount + reserve)) {
4923 sAvailableMemory -= amount;
4924 return B_OK;
4926 } while (timeout > system_time());
4928 return B_NO_MEMORY;
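// Illustrative sketch (not part of the original source): bracketing a
// temporary consumer of memory with the reserve/unreserve pair above. The
// priority and the one second timeout are arbitrary example values; the
// helper is hypothetical.
#if 0
static status_t
with_reserved_memory(size_t bytes)
{
	status_t status = vm_try_reserve_memory(bytes, VM_PRIORITY_USER, 1000000);
	if (status != B_OK)
		return status;

	// ... commit/use up to "bytes" bytes; that amount has been subtracted
	// from sAvailableMemory for the duration ...

	vm_unreserve_memory(bytes);
	return B_OK;
}
#endif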
4932 status_t
4933 vm_set_area_memory_type(area_id id, phys_addr_t physicalBase, uint32 type)
4935 // NOTE: The caller is responsible for synchronizing calls to this function!
4937 AddressSpaceReadLocker locker;
4938 VMArea* area;
4939 status_t status = locker.SetFromArea(id, area);
4940 if (status != B_OK)
4941 return status;
4943 // nothing to do, if the type doesn't change
4944 uint32 oldType = area->MemoryType();
4945 if (type == oldType)
4946 return B_OK;
4948 // set the memory type of the area and the mapped pages
4949 VMTranslationMap* map = area->address_space->TranslationMap();
4950 map->Lock();
4951 area->SetMemoryType(type);
4952 map->ProtectArea(area, area->protection);
4953 map->Unlock();
4955 // set the physical memory type
4956 status_t error = arch_vm_set_memory_type(area, physicalBase, type);
4957 if (error != B_OK) {
4958 // reset the memory type of the area and the mapped pages
4959 map->Lock();
4960 area->SetMemoryType(oldType);
4961 map->ProtectArea(area, area->protection);
4962 map->Unlock();
4963 return error;
4966 return B_OK;
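// Illustrative sketch (assumption, not part of the original source): a
// typical caller would be a graphics driver switching its frame buffer area
// to write-combining. The area ID, physical base and the B_MTR_WC constant
// are assumptions about the caller, not something this file defines.
#if 0
static status_t
make_frame_buffer_write_combined(area_id frameBufferArea,
	phys_addr_t frameBufferPhysicalBase)
{
	// the caller is responsible for serializing calls, see the note above
	return vm_set_area_memory_type(frameBufferArea, frameBufferPhysicalBase,
		B_MTR_WC);
}
#endif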
4971 /*! This function enforces some protection properties:
4972 - if B_WRITE_AREA is set, B_KERNEL_WRITE_AREA is set as well
4973 - if B_EXECUTE_AREA is set, B_KERNEL_EXECUTE_AREA is set as well
4974 - if only B_READ_AREA has been set, B_KERNEL_READ_AREA is also set
4975 - if no protection is specified, it defaults to B_KERNEL_READ_AREA
4976 and B_KERNEL_WRITE_AREA.
4978 static void
4979 fix_protection(uint32* protection)
4981 if ((*protection & B_KERNEL_PROTECTION) == 0) {
4982 if ((*protection & B_USER_PROTECTION) == 0
4983 || (*protection & B_WRITE_AREA) != 0)
4984 *protection |= B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA;
4985 else
4986 *protection |= B_KERNEL_READ_AREA;
4987 if ((*protection & B_EXECUTE_AREA) != 0)
4988 *protection |= B_KERNEL_EXECUTE_AREA;
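// Worked example (not part of the original source): what the rules above do
// to two purely user-specified protection values. The helper is hypothetical
// and only meant to illustrate fix_protection().
#if 0
static void
fix_protection_example()
{
	uint32 protection = B_READ_AREA | B_WRITE_AREA;
	fix_protection(&protection);
	// B_WRITE_AREA was set, so kernel read and write are added:
	// protection == B_READ_AREA | B_WRITE_AREA
	//	| B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA

	protection = B_READ_AREA;
	fix_protection(&protection);
	// only B_READ_AREA was set, so just kernel read is added:
	// protection == B_READ_AREA | B_KERNEL_READ_AREA
}
#endif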
4993 static void
4994 fill_area_info(struct VMArea* area, area_info* info, size_t size)
4996 strlcpy(info->name, area->name, B_OS_NAME_LENGTH);
4997 info->area = area->id;
4998 info->address = (void*)area->Base();
4999 info->size = area->Size();
5000 info->protection = area->protection;
5001 info->lock = B_FULL_LOCK;
5002 info->team = area->address_space->ID();
5003 info->copy_count = 0;
5004 info->in_count = 0;
5005 info->out_count = 0;
5006 // TODO: retrieve real values here!
5008 VMCache* cache = vm_area_get_locked_cache(area);
5010 // Note, this is a simplification; the cache could be larger than this area
5011 info->ram_size = cache->page_count * B_PAGE_SIZE;
5013 vm_area_put_locked_cache(cache);
5017 static status_t
5018 vm_resize_area(area_id areaID, size_t newSize, bool kernel)
5020 // is newSize a multiple of B_PAGE_SIZE?
5021 if (newSize & (B_PAGE_SIZE - 1))
5022 return B_BAD_VALUE;
5024 // lock all affected address spaces and the cache
5025 VMArea* area;
5026 VMCache* cache;
5028 MultiAddressSpaceLocker locker;
5029 AreaCacheLocker cacheLocker;
5031 status_t status;
5032 size_t oldSize;
5033 bool anyKernelArea;
5034 bool restart;
5036 do {
5037 anyKernelArea = false;
5038 restart = false;
5040 locker.Unset();
5041 status = locker.AddAreaCacheAndLock(areaID, true, true, area, &cache);
5042 if (status != B_OK)
5043 return status;
5044 cacheLocker.SetTo(cache, true); // already locked
5046 // enforce restrictions
5047 if (!kernel) {
5048 if ((area->protection & B_KERNEL_AREA) != 0)
5049 return B_NOT_ALLOWED;
5050 // TODO: Enforce all restrictions (team, etc.)!
5053 oldSize = area->Size();
5054 if (newSize == oldSize)
5055 return B_OK;
5057 if (cache->type != CACHE_TYPE_RAM)
5058 return B_NOT_ALLOWED;
5060 if (oldSize < newSize) {
5061 // We need to check if all areas of this cache can be resized.
5062 for (VMArea* current = cache->areas; current != NULL;
5063 current = current->cache_next) {
5064 if (!current->address_space->CanResizeArea(current, newSize))
5065 return B_ERROR;
5066 anyKernelArea
5067 |= current->address_space == VMAddressSpace::Kernel();
5069 } else {
5070 // We're shrinking the areas, so we must make sure the affected
5071 // ranges are not wired.
5072 for (VMArea* current = cache->areas; current != NULL;
5073 current = current->cache_next) {
5074 anyKernelArea
5075 |= current->address_space == VMAddressSpace::Kernel();
5077 if (wait_if_area_range_is_wired(current,
5078 current->Base() + newSize, oldSize - newSize, &locker,
5079 &cacheLocker)) {
5080 restart = true;
5081 break;
5085 } while (restart);
5087 // Okay, looks good so far, so let's do it
5089 int priority = kernel && anyKernelArea
5090 ? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
5091 uint32 allocationFlags = kernel && anyKernelArea
5092 ? HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE : 0;
5094 if (oldSize < newSize) {
5095 // Growing the cache can fail, so we do it first.
5096 status = cache->Resize(cache->virtual_base + newSize, priority);
5097 if (status != B_OK)
5098 return status;
5101 for (VMArea* current = cache->areas; current != NULL;
5102 current = current->cache_next) {
5103 status = current->address_space->ResizeArea(current, newSize,
5104 allocationFlags);
5105 if (status != B_OK)
5106 break;
5108 // We also need to unmap all pages beyond the new size, if the area has
5109 // shrunk
5110 if (newSize < oldSize) {
5111 VMCacheChainLocker cacheChainLocker(cache);
5112 cacheChainLocker.LockAllSourceCaches();
5114 unmap_pages(current, current->Base() + newSize,
5115 oldSize - newSize);
5117 cacheChainLocker.Unlock(cache);
5121 if (status == B_OK) {
5122 // Shrink or grow individual page protections if in use.
5123 if (area->page_protections != NULL) {
5124 uint32 bytes = (newSize / B_PAGE_SIZE + 1) / 2;
5125 uint8* newProtections
5126 = (uint8*)realloc(area->page_protections, bytes);
5127 if (newProtections == NULL)
5128 status = B_NO_MEMORY;
5129 else {
5130 area->page_protections = newProtections;
5132 if (oldSize < newSize) {
5133 // init the additional page protections to that of the area
5134 uint32 offset = (oldSize / B_PAGE_SIZE + 1) / 2;
5135 uint32 areaProtection = area->protection
5136 & (B_READ_AREA | B_WRITE_AREA | B_EXECUTE_AREA);
5137 memset(area->page_protections + offset,
5138 areaProtection | (areaProtection << 4), bytes - offset);
5139 if ((oldSize / B_PAGE_SIZE) % 2 != 0) {
5140 uint8& entry = area->page_protections[offset - 1];
5141 entry = (entry & 0x0f) | (areaProtection << 4);
5148 // shrinking the cache can't fail, so we do it now
5149 if (status == B_OK && newSize < oldSize)
5150 status = cache->Resize(cache->virtual_base + newSize, priority);
5152 if (status != B_OK) {
5153 // Something failed -- resize the areas back to their original size.
5154 // This can fail, too, in which case we're seriously screwed.
5155 for (VMArea* current = cache->areas; current != NULL;
5156 current = current->cache_next) {
5157 if (current->address_space->ResizeArea(current, oldSize,
5158 allocationFlags) != B_OK) {
5159 panic("vm_resize_area(): Failed and not being able to restore "
5160 "original state.");
5164 cache->Resize(cache->virtual_base + oldSize, priority);
5167 // TODO: we must honour the lock restrictions of this area
5168 return status;
5172 status_t
5173 vm_memset_physical(phys_addr_t address, int value, phys_size_t length)
5175 return sPhysicalPageMapper->MemsetPhysical(address, value, length);
5179 status_t
5180 vm_memcpy_from_physical(void* to, phys_addr_t from, size_t length, bool user)
5182 return sPhysicalPageMapper->MemcpyFromPhysical(to, from, length, user);
5186 status_t
5187 vm_memcpy_to_physical(phys_addr_t to, const void* _from, size_t length,
5188 bool user)
5190 return sPhysicalPageMapper->MemcpyToPhysical(to, _from, length, user);
5194 void
5195 vm_memcpy_physical_page(phys_addr_t to, phys_addr_t from)
5197 return sPhysicalPageMapper->MemcpyPhysicalPage(to, from);
5201 /*! Copies a range of memory directly from/to a page that might not be mapped
5202 at the moment.
5204 	For \a unsafeMemory the current mapping (if any) is ignored. The function
5205 walks through the respective area's cache chain to find the physical page
5206 and copies from/to it directly.
5207 The memory range starting at \a unsafeMemory with a length of \a size bytes
5208 must not cross a page boundary.
5210 \param teamID The team ID identifying the address space \a unsafeMemory is
5211 to be interpreted in. Ignored, if \a unsafeMemory is a kernel address
5212 (the kernel address space is assumed in this case). If \c B_CURRENT_TEAM
5213 is passed, the address space of the thread returned by
5214 debug_get_debugged_thread() is used.
5215 \param unsafeMemory The start of the unsafe memory range to be copied
5216 from/to.
5217 \param buffer A safely accessible kernel buffer to be copied from/to.
5218 \param size The number of bytes to be copied.
5219 \param copyToUnsafe If \c true, memory is copied from \a buffer to
5220 \a unsafeMemory, the other way around otherwise.
5222 status_t
5223 vm_debug_copy_page_memory(team_id teamID, void* unsafeMemory, void* buffer,
5224 size_t size, bool copyToUnsafe)
5226 if (size > B_PAGE_SIZE || ROUNDDOWN((addr_t)unsafeMemory, B_PAGE_SIZE)
5227 != ROUNDDOWN((addr_t)unsafeMemory + size - 1, B_PAGE_SIZE)) {
5228 return B_BAD_VALUE;
5231 // get the address space for the debugged thread
5232 VMAddressSpace* addressSpace;
5233 if (IS_KERNEL_ADDRESS(unsafeMemory)) {
5234 addressSpace = VMAddressSpace::Kernel();
5235 } else if (teamID == B_CURRENT_TEAM) {
5236 Thread* thread = debug_get_debugged_thread();
5237 if (thread == NULL || thread->team == NULL)
5238 return B_BAD_ADDRESS;
5240 addressSpace = thread->team->address_space;
5241 } else
5242 addressSpace = VMAddressSpace::DebugGet(teamID);
5244 if (addressSpace == NULL)
5245 return B_BAD_ADDRESS;
5247 // get the area
5248 VMArea* area = addressSpace->LookupArea((addr_t)unsafeMemory);
5249 if (area == NULL)
5250 return B_BAD_ADDRESS;
5252 // search the page
5253 off_t cacheOffset = (addr_t)unsafeMemory - area->Base()
5254 + area->cache_offset;
5255 VMCache* cache = area->cache;
5256 vm_page* page = NULL;
5257 while (cache != NULL) {
5258 page = cache->DebugLookupPage(cacheOffset);
5259 if (page != NULL)
5260 break;
5262 // Page not found in this cache -- if it is paged out, we must not try
5263 // to get it from lower caches.
5264 if (cache->DebugHasPage(cacheOffset))
5265 break;
5267 cache = cache->source;
5270 if (page == NULL)
5271 return B_UNSUPPORTED;
5273 // copy from/to physical memory
5274 phys_addr_t physicalAddress = page->physical_page_number * B_PAGE_SIZE
5275 + (addr_t)unsafeMemory % B_PAGE_SIZE;
5277 if (copyToUnsafe) {
5278 if (page->Cache() != area->cache)
5279 return B_UNSUPPORTED;
5281 return vm_memcpy_to_physical(physicalAddress, buffer, size, false);
5284 return vm_memcpy_from_physical(buffer, physicalAddress, size, false);
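// Illustrative sketch (assumption, not part of the original source): a
// hypothetical kernel debugger command that uses the cache-chain based copy
// above to read a byte that may not be mapped at the moment. It would be
// registered with add_debugger_command(), like the commands in vm_init().
#if 0
static int
debug_peek_byte(int argc, char** argv)
{
	if (argc != 2) {
		kprintf("usage: peek_byte <address>\n");
		return 0;
	}

	void* address = (void*)(addr_t)parse_expression(argv[1]);
	uint8 value;
	// a single byte can never cross a page boundary, so the size
	// restriction documented above is trivially met
	status_t status = vm_debug_copy_page_memory(B_CURRENT_TEAM, address,
		&value, 1, false);
	if (status == B_OK)
		kprintf("%p: %#02x\n", address, value);
	else
		kprintf("could not read %p: %s\n", address, strerror(status));
	return 0;
}
#endif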
5288 // #pragma mark - kernel public API
5291 status_t
5292 user_memcpy(void* to, const void* from, size_t size)
5294 // don't allow address overflows
5295 if ((addr_t)from + size < (addr_t)from || (addr_t)to + size < (addr_t)to)
5296 return B_BAD_ADDRESS;
5298 if (arch_cpu_user_memcpy(to, from, size) < B_OK)
5299 return B_BAD_ADDRESS;
5301 return B_OK;
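// Illustrative sketch (not part of the original source): the usual pattern
// for a syscall that needs a userland structure -- validate the pointer,
// then copy it into a kernel-side buffer with user_memcpy() and only work
// with the copy. The syscall and the structure are hypothetical.
#if 0
struct example_request {
	int32	op;
	size_t	length;
};

static status_t
_user_example_call(const void* userRequest)
{
	if (userRequest == NULL || !IS_USER_ADDRESS(userRequest))
		return B_BAD_ADDRESS;

	example_request request;
	if (user_memcpy(&request, userRequest, sizeof(request)) != B_OK)
		return B_BAD_ADDRESS;

	// ... act on the kernel-side copy only ...
	return B_OK;
}
#endif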
5305 /*! \brief Copies at most (\a size - 1) characters from the string in \a from to
5306 the string in \a to, NULL-terminating the result.
5308 \param to Pointer to the destination C-string.
5309 \param from Pointer to the source C-string.
5310 \param size Size in bytes of the string buffer pointed to by \a to.
5312 	\return strlen(\a from), or an error code if an address was invalid.
5314 ssize_t
5315 user_strlcpy(char* to, const char* from, size_t size)
5317 if (to == NULL && size != 0)
5318 return B_BAD_VALUE;
5319 if (from == NULL)
5320 return B_BAD_ADDRESS;
5322 // limit size to avoid address overflows
5323 size_t maxSize = std::min((addr_t)size,
5324 ~(addr_t)0 - std::max((addr_t)from, (addr_t)to) + 1);
5325 // NOTE: Since arch_cpu_user_strlcpy() determines the length of \a from,
5326 // the source address might still overflow.
5328 ssize_t result = arch_cpu_user_strlcpy(to, from, maxSize);
5330 // If we hit the address overflow boundary, fail.
5331 if (result < 0 || (result >= 0 && (size_t)result >= maxSize
5332 && maxSize < size)) {
5333 return B_BAD_ADDRESS;
5336 return result;
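// Illustrative sketch (not part of the original source): copying a
// NULL-terminated string from userland and detecting truncation, analogous
// to the usual strlcpy() result check. The helper is hypothetical.
#if 0
static status_t
copy_user_path(char* buffer, size_t bufferSize, const char* userPath)
{
	if (!IS_USER_ADDRESS(userPath))
		return B_BAD_ADDRESS;

	ssize_t length = user_strlcpy(buffer, userPath, bufferSize);
	if (length < 0)
		return (status_t)length;
	if ((size_t)length >= bufferSize) {
		// the source string did not fit into the buffer
		return B_NAME_TOO_LONG;
	}

	return B_OK;
}
#endif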
5340 status_t
5341 user_memset(void* s, char c, size_t count)
5343 // don't allow address overflows
5344 if ((addr_t)s + count < (addr_t)s)
5345 return B_BAD_ADDRESS;
5346 if (arch_cpu_user_memset(s, c, count) < B_OK)
5347 return B_BAD_ADDRESS;
5349 return B_OK;
5353 /*! Wires a single page at the given address.
5355 \param team The team whose address space the address belongs to. Supports
5356 also \c B_CURRENT_TEAM. If the given address is a kernel address, the
5357 parameter is ignored.
5358 	\param address The virtual address to wire down. Does not need to
5359 be page aligned.
5360 \param writable If \c true the page shall be writable.
5361 \param info On success the info is filled in, among other things
5362 containing the physical address the given virtual one translates to.
5363 \return \c B_OK, when the page could be wired, another error code otherwise.
5365 status_t
5366 vm_wire_page(team_id team, addr_t address, bool writable,
5367 VMPageWiringInfo* info)
5369 addr_t pageAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5370 info->range.SetTo(pageAddress, B_PAGE_SIZE, writable, false);
5372 // compute the page protection that is required
5373 bool isUser = IS_USER_ADDRESS(address);
5374 uint32 requiredProtection = PAGE_PRESENT
5375 | B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5376 if (writable)
5377 requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5379 // get and read lock the address space
5380 VMAddressSpace* addressSpace = NULL;
5381 if (isUser) {
5382 if (team == B_CURRENT_TEAM)
5383 addressSpace = VMAddressSpace::GetCurrent();
5384 else
5385 addressSpace = VMAddressSpace::Get(team);
5386 } else
5387 addressSpace = VMAddressSpace::GetKernel();
5388 if (addressSpace == NULL)
5389 return B_ERROR;
5391 AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5393 VMTranslationMap* map = addressSpace->TranslationMap();
5394 status_t error = B_OK;
5396 // get the area
5397 VMArea* area = addressSpace->LookupArea(pageAddress);
5398 if (area == NULL) {
5399 addressSpace->Put();
5400 return B_BAD_ADDRESS;
5403 // Lock the area's top cache. This is a requirement for VMArea::Wire().
5404 VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5406 // mark the area range wired
5407 area->Wire(&info->range);
5409 // Lock the area's cache chain and the translation map. Needed to look
5410 // up the page and play with its wired count.
5411 cacheChainLocker.LockAllSourceCaches();
5412 map->Lock();
5414 phys_addr_t physicalAddress;
5415 uint32 flags;
5416 vm_page* page;
5417 if (map->Query(pageAddress, &physicalAddress, &flags) == B_OK
5418 && (flags & requiredProtection) == requiredProtection
5419 && (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5420 != NULL) {
5421 // Already mapped with the correct permissions -- just increment
5422 // the page's wired count.
5423 increment_page_wired_count(page);
5425 map->Unlock();
5426 cacheChainLocker.Unlock();
5427 addressSpaceLocker.Unlock();
5428 } else {
5429 // Let vm_soft_fault() map the page for us, if possible. We need
5430 // to fully unlock to avoid deadlocks. Since we have already
5431 // wired the area itself, nothing disturbing will happen with it
5432 // in the meantime.
5433 map->Unlock();
5434 cacheChainLocker.Unlock();
5435 addressSpaceLocker.Unlock();
5437 error = vm_soft_fault(addressSpace, pageAddress, writable, false,
5438 isUser, &page);
5440 if (error != B_OK) {
5441 // The page could not be mapped -- clean up.
5442 VMCache* cache = vm_area_get_locked_cache(area);
5443 area->Unwire(&info->range);
5444 cache->ReleaseRefAndUnlock();
5445 addressSpace->Put();
5446 return error;
5450 info->physicalAddress
5451 = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
5452 + address % B_PAGE_SIZE;
5453 info->page = page;
5455 return B_OK;
5459 /*! Unwires a single page previously wired via vm_wire_page().
5461 \param info The same object passed to vm_wire_page() before.
5463 void
5464 vm_unwire_page(VMPageWiringInfo* info)
5466 // lock the address space
5467 VMArea* area = info->range.area;
5468 AddressSpaceReadLocker addressSpaceLocker(area->address_space, false);
5469 // takes over our reference
5471 // lock the top cache
5472 VMCache* cache = vm_area_get_locked_cache(area);
5473 VMCacheChainLocker cacheChainLocker(cache);
5475 if (info->page->Cache() != cache) {
5476 // The page is not in the top cache, so we lock the whole cache chain
5477 // before touching the page's wired count.
5478 cacheChainLocker.LockAllSourceCaches();
5481 decrement_page_wired_count(info->page);
5483 // remove the wired range from the range
5484 area->Unwire(&info->range);
5486 cacheChainLocker.Unlock();
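// Illustrative sketch (not part of the original source): wiring a single
// userland page, using the physical address it translates to, and unwiring
// it again with the same VMPageWiringInfo object. The helper is hypothetical.
#if 0
static status_t
with_wired_user_page(team_id team, addr_t userAddress, bool writable)
{
	VMPageWiringInfo info;
	status_t status = vm_wire_page(team, userAddress, writable, &info);
	if (status != B_OK)
		return status;

	// info.physicalAddress now translates userAddress; the page stays
	// mapped and wired until vm_unwire_page() is called
	dprintf("user address %#" B_PRIxADDR " -> physical %#" B_PRIxPHYSADDR
		"\n", userAddress, info.physicalAddress);

	vm_unwire_page(&info);
	return B_OK;
}
#endif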
5490 /*! Wires down the given address range in the specified team's address space.
5492 If successful the function
5493 - acquires a reference to the specified team's address space,
5494 - adds respective wired ranges to all areas that intersect with the given
5495 address range,
5496 - makes sure all pages in the given address range are mapped with the
5497 requested access permissions and increments their wired count.
5499 	It fails when \a team doesn't specify a valid address space, when any part
5500 of the specified address range is not covered by areas, when the concerned
5501 areas don't allow mapping with the requested permissions, or when mapping
5502 failed for another reason.
5504 	When successful the call must be balanced by an unlock_memory_etc() call with
5505 the exact same parameters.
5507 	\param team Identifies the address space (via team ID). \c B_CURRENT_TEAM is
5508 supported.
5509 \param address The start of the address range to be wired.
5510 \param numBytes The size of the address range to be wired.
5511 \param flags Flags. Currently only \c B_READ_DEVICE is defined, which
5512 requests that the range must be wired writable ("read from device
5513 into memory").
5514 \return \c B_OK on success, another error code otherwise.
5516 status_t
5517 lock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
5519 addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5520 addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
5522 // compute the page protection that is required
5523 bool isUser = IS_USER_ADDRESS(address);
5524 bool writable = (flags & B_READ_DEVICE) == 0;
5525 uint32 requiredProtection = PAGE_PRESENT
5526 | B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5527 if (writable)
5528 requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5530 uint32 mallocFlags = isUser
5531 ? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
5533 // get and read lock the address space
5534 VMAddressSpace* addressSpace = NULL;
5535 if (isUser) {
5536 if (team == B_CURRENT_TEAM)
5537 addressSpace = VMAddressSpace::GetCurrent();
5538 else
5539 addressSpace = VMAddressSpace::Get(team);
5540 } else
5541 addressSpace = VMAddressSpace::GetKernel();
5542 if (addressSpace == NULL)
5543 return B_ERROR;
5545 AddressSpaceReadLocker addressSpaceLocker(addressSpace, true);
5546 // We get a new address space reference here. The one we got above will
5547 // be freed by unlock_memory_etc().
5549 VMTranslationMap* map = addressSpace->TranslationMap();
5550 status_t error = B_OK;
5552 // iterate through all concerned areas
5553 addr_t nextAddress = lockBaseAddress;
5554 while (nextAddress != lockEndAddress) {
5555 // get the next area
5556 VMArea* area = addressSpace->LookupArea(nextAddress);
5557 if (area == NULL) {
5558 error = B_BAD_ADDRESS;
5559 break;
5562 addr_t areaStart = nextAddress;
5563 addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
5565 // allocate the wired range (do that before locking the cache to avoid
5566 // deadlocks)
5567 VMAreaWiredRange* range = new(malloc_flags(mallocFlags))
5568 VMAreaWiredRange(areaStart, areaEnd - areaStart, writable, true);
5569 if (range == NULL) {
5570 error = B_NO_MEMORY;
5571 break;
5574 // Lock the area's top cache. This is a requirement for VMArea::Wire().
5575 VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5577 // mark the area range wired
5578 area->Wire(range);
5580 // Depending on the area cache type and the wiring, we may not need to
5581 // look at the individual pages.
5582 if (area->cache_type == CACHE_TYPE_NULL
5583 || area->cache_type == CACHE_TYPE_DEVICE
5584 || area->wiring == B_FULL_LOCK
5585 || area->wiring == B_CONTIGUOUS) {
5586 nextAddress = areaEnd;
5587 continue;
5590 // Lock the area's cache chain and the translation map. Needed to look
5591 // up pages and play with their wired count.
5592 cacheChainLocker.LockAllSourceCaches();
5593 map->Lock();
5595 // iterate through the pages and wire them
5596 for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
5597 phys_addr_t physicalAddress;
5598 uint32 flags;
5600 vm_page* page;
5601 if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
5602 && (flags & requiredProtection) == requiredProtection
5603 && (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5604 != NULL) {
5605 // Already mapped with the correct permissions -- just increment
5606 // the page's wired count.
5607 increment_page_wired_count(page);
5608 } else {
5609 // Let vm_soft_fault() map the page for us, if possible. We need
5610 // to fully unlock to avoid deadlocks. Since we have already
5611 // wired the area itself, nothing disturbing will happen with it
5612 // in the meantime.
5613 map->Unlock();
5614 cacheChainLocker.Unlock();
5615 addressSpaceLocker.Unlock();
5617 error = vm_soft_fault(addressSpace, nextAddress, writable,
5618 false, isUser, &page);
5620 addressSpaceLocker.Lock();
5621 cacheChainLocker.SetTo(vm_area_get_locked_cache(area));
5622 cacheChainLocker.LockAllSourceCaches();
5623 map->Lock();
5626 if (error != B_OK)
5627 break;
5630 map->Unlock();
5632 if (error == B_OK) {
5633 cacheChainLocker.Unlock();
5634 } else {
5635 // An error occurred, so abort right here. If the current address
5636 // is the first in this area, unwire the area, since we won't get
5637 // to it when reverting what we've done so far.
5638 if (nextAddress == areaStart) {
5639 area->Unwire(range);
5640 cacheChainLocker.Unlock();
5641 range->~VMAreaWiredRange();
5642 free_etc(range, mallocFlags);
5643 } else
5644 cacheChainLocker.Unlock();
5646 break;
5650 if (error != B_OK) {
5651 // An error occurred, so unwire all that we've already wired. Note that
5652 // even if not a single page was wired, unlock_memory_etc() is called
5653 // to put the address space reference.
5654 addressSpaceLocker.Unlock();
5655 unlock_memory_etc(team, (void*)lockBaseAddress,
5656 nextAddress - lockBaseAddress, flags);
5659 return error;
5663 status_t
5664 lock_memory(void* address, size_t numBytes, uint32 flags)
5666 return lock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
5670 /*! Unwires an address range previously wired with lock_memory_etc().
5672 Note that a call to this function must balance a previous lock_memory_etc()
5673 call with exactly the same parameters.
5675 status_t
5676 unlock_memory_etc(team_id team, void* address, size_t numBytes, uint32 flags)
5678 addr_t lockBaseAddress = ROUNDDOWN((addr_t)address, B_PAGE_SIZE);
5679 addr_t lockEndAddress = ROUNDUP((addr_t)address + numBytes, B_PAGE_SIZE);
5681 // compute the page protection that is required
5682 bool isUser = IS_USER_ADDRESS(address);
5683 bool writable = (flags & B_READ_DEVICE) == 0;
5684 uint32 requiredProtection = PAGE_PRESENT
5685 | B_KERNEL_READ_AREA | (isUser ? B_READ_AREA : 0);
5686 if (writable)
5687 requiredProtection |= B_KERNEL_WRITE_AREA | (isUser ? B_WRITE_AREA : 0);
5689 uint32 mallocFlags = isUser
5690 ? 0 : HEAP_DONT_WAIT_FOR_MEMORY | HEAP_DONT_LOCK_KERNEL_SPACE;
5692 // get and read lock the address space
5693 VMAddressSpace* addressSpace = NULL;
5694 if (isUser) {
5695 if (team == B_CURRENT_TEAM)
5696 addressSpace = VMAddressSpace::GetCurrent();
5697 else
5698 addressSpace = VMAddressSpace::Get(team);
5699 } else
5700 addressSpace = VMAddressSpace::GetKernel();
5701 if (addressSpace == NULL)
5702 return B_ERROR;
5704 AddressSpaceReadLocker addressSpaceLocker(addressSpace, false);
5705 // Take over the address space reference. We don't unlock until we're
5706 // done.
5708 VMTranslationMap* map = addressSpace->TranslationMap();
5709 status_t error = B_OK;
5711 // iterate through all concerned areas
5712 addr_t nextAddress = lockBaseAddress;
5713 while (nextAddress != lockEndAddress) {
5714 // get the next area
5715 VMArea* area = addressSpace->LookupArea(nextAddress);
5716 if (area == NULL) {
5717 error = B_BAD_ADDRESS;
5718 break;
5721 addr_t areaStart = nextAddress;
5722 addr_t areaEnd = std::min(lockEndAddress, area->Base() + area->Size());
5724 // Lock the area's top cache. This is a requirement for
5725 // VMArea::Unwire().
5726 VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
5728 // Depending on the area cache type and the wiring, we may not need to
5729 // look at the individual pages.
5730 if (area->cache_type == CACHE_TYPE_NULL
5731 || area->cache_type == CACHE_TYPE_DEVICE
5732 || area->wiring == B_FULL_LOCK
5733 || area->wiring == B_CONTIGUOUS) {
5734 // unwire the range (to avoid deadlocks we delete the range after
5735 // unlocking the cache)
5736 nextAddress = areaEnd;
5737 VMAreaWiredRange* range = area->Unwire(areaStart,
5738 areaEnd - areaStart, writable);
5739 cacheChainLocker.Unlock();
5740 if (range != NULL) {
5741 range->~VMAreaWiredRange();
5742 free_etc(range, mallocFlags);
5744 continue;
5747 // Lock the area's cache chain and the translation map. Needed to look
5748 // up pages and play with their wired count.
5749 cacheChainLocker.LockAllSourceCaches();
5750 map->Lock();
5752 // iterate through the pages and unwire them
5753 for (; nextAddress != areaEnd; nextAddress += B_PAGE_SIZE) {
5754 phys_addr_t physicalAddress;
5755 uint32 flags;
5757 vm_page* page;
5758 if (map->Query(nextAddress, &physicalAddress, &flags) == B_OK
5759 && (flags & PAGE_PRESENT) != 0
5760 && (page = vm_lookup_page(physicalAddress / B_PAGE_SIZE))
5761 != NULL) {
5762 // The page is still mapped -- just decrement the page's
5763 // wired count.
5764 decrement_page_wired_count(page);
5765 } else {
5766 panic("unlock_memory_etc(): Failed to unwire page: address "
5767 "space %p, address: %#" B_PRIxADDR, addressSpace,
5768 nextAddress);
5769 error = B_BAD_VALUE;
5770 break;
5774 map->Unlock();
5776 // All pages are unwired. Remove the area's wired range as well (to
5777 // avoid deadlocks we delete the range after unlocking the cache).
5778 VMAreaWiredRange* range = area->Unwire(areaStart,
5779 areaEnd - areaStart, writable);
5781 cacheChainLocker.Unlock();
5783 if (range != NULL) {
5784 range->~VMAreaWiredRange();
5785 free_etc(range, mallocFlags);
5788 if (error != B_OK)
5789 break;
5792 // get rid of the address space reference that lock_memory_etc() acquired
5793 addressSpace->Put();
5795 return error;
5799 status_t
5800 unlock_memory(void* address, size_t numBytes, uint32 flags)
5802 return unlock_memory_etc(B_CURRENT_TEAM, address, numBytes, flags);
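/*! Usage sketch (illustrative, not from the original source): a driver wiring
    a user buffer around a device transfer. The function name, buffer, and the
    B_DMA_IO | B_READ_DEVICE flags are assumptions for the example; the point
    is that the unlock_memory() call repeats the lock_memory() parameters
    exactly, as the documentation above requires.
    \code
    static status_t
    transfer_to_device(void* buffer, size_t length)
    {
        status_t error = lock_memory(buffer, length,
            B_DMA_IO | B_READ_DEVICE);
        if (error != B_OK)
            return error;

        // ... start the transfer and wait for its completion ...

        // must balance the lock_memory() call with the same parameters
        return unlock_memory(buffer, length, B_DMA_IO | B_READ_DEVICE);
    }
    \endcode
*/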
5806 /*! Similar to get_memory_map(), but also allows specifying the address space
5807 for the memory in question and has saner semantics.
5808 Returns \c B_OK when the complete range could be translated or
5809 \c B_BUFFER_OVERFLOW, if the provided array wasn't big enough. In either
5810 case the actual number of entries is written to \c *_numEntries. Any other
5811 error case indicates complete failure; \c *_numEntries will be set to \c 0
5812 in this case.
5814 status_t
5815 get_memory_map_etc(team_id team, const void* address, size_t numBytes,
5816 physical_entry* table, uint32* _numEntries)
5818 uint32 numEntries = *_numEntries;
5819 *_numEntries = 0;
5821 VMAddressSpace* addressSpace;
5822 addr_t virtualAddress = (addr_t)address;
5823 addr_t pageOffset = virtualAddress & (B_PAGE_SIZE - 1);
5824 phys_addr_t physicalAddress;
5825 status_t status = B_OK;
5826 int32 index = -1;
5827 addr_t offset = 0;
5828 bool interrupts = are_interrupts_enabled();
5830 TRACE(("get_memory_map_etc(%" B_PRId32 ", %p, %lu bytes, %" B_PRIu32 " "
5831 "entries)\n", team, address, numBytes, numEntries));
5833 if (numEntries == 0 || numBytes == 0)
5834 return B_BAD_VALUE;
5836 // in which address space is the address to be found?
5837 if (IS_USER_ADDRESS(virtualAddress)) {
5838 if (team == B_CURRENT_TEAM)
5839 addressSpace = VMAddressSpace::GetCurrent();
5840 else
5841 addressSpace = VMAddressSpace::Get(team);
5842 } else
5843 addressSpace = VMAddressSpace::GetKernel();
5845 if (addressSpace == NULL)
5846 return B_ERROR;
5848 VMTranslationMap* map = addressSpace->TranslationMap();
5850 if (interrupts)
5851 map->Lock();
5853 while (offset < numBytes) {
5854 addr_t bytes = min_c(numBytes - offset, B_PAGE_SIZE);
5855 uint32 flags;
5857 if (interrupts) {
5858 status = map->Query((addr_t)address + offset, &physicalAddress,
5859 &flags);
5860 } else {
5861 status = map->QueryInterrupt((addr_t)address + offset,
5862 &physicalAddress, &flags);
5864 if (status < B_OK)
5865 break;
5866 if ((flags & PAGE_PRESENT) == 0) {
5867 panic("get_memory_map() called on unmapped memory!");
5868 return B_BAD_ADDRESS;
5871 if (index < 0 && pageOffset > 0) {
5872 physicalAddress += pageOffset;
5873 if (bytes > B_PAGE_SIZE - pageOffset)
5874 bytes = B_PAGE_SIZE - pageOffset;
5877 // need to switch to the next physical_entry?
5878 if (index < 0 || table[index].address
5879 != physicalAddress - table[index].size) {
5880 if ((uint32)++index + 1 > numEntries) {
5881 // table too small
5882 break;
5884 table[index].address = physicalAddress;
5885 table[index].size = bytes;
5886 } else {
5887 // the page is contiguous with the current entry -- just extend it
5888 table[index].size += bytes;
5891 offset += bytes;
5894 if (interrupts)
5895 map->Unlock();
5897 if (status != B_OK)
5898 return status;
5900 if ((uint32)index + 1 > numEntries) {
5901 *_numEntries = index;
5902 return B_BUFFER_OVERFLOW;
5905 *_numEntries = index + 1;
5906 return B_OK;
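/*! Usage sketch (illustrative, not from the original source): translating a
    buffer into its physical runs. The function name and the table size of 8
    are assumptions; a caller that gets \c B_BUFFER_OVERFLOW back would retry
    with a larger table or split the request.
    \code
    static status_t
    print_physical_runs(const void* buffer, size_t length)
    {
        physical_entry table[8];
        uint32 count = 8;
        status_t status = get_memory_map_etc(B_CURRENT_TEAM, buffer, length,
            table, &count);
        if (status != B_OK && status != B_BUFFER_OVERFLOW)
            return status;

        for (uint32 i = 0; i < count; i++) {
            dprintf("run %" B_PRIu32 ": %#" B_PRIxPHYSADDR ", %" B_PRIu64
                " bytes\n", i, table[i].address, (uint64)table[i].size);
        }
        return status;
    }
    \endcode
*/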
5910 /*! According to the BeBook, this function should always succeed.
5911 This is no longer the case.
5913 extern "C" int32
5914 __get_memory_map_haiku(const void* address, size_t numBytes,
5915 physical_entry* table, int32 numEntries)
5917 uint32 entriesRead = numEntries;
5918 status_t error = get_memory_map_etc(B_CURRENT_TEAM, address, numBytes,
5919 table, &entriesRead);
5920 if (error != B_OK)
5921 return error;
5923 // close the entry list
5925 // if it's only one entry, we will silently accept the missing ending
5926 if (numEntries == 1)
5927 return B_OK;
5929 if (entriesRead + 1 > (uint32)numEntries)
5930 return B_BUFFER_OVERFLOW;
5932 table[entriesRead].address = 0;
5933 table[entriesRead].size = 0;
5935 return B_OK;
5939 area_id
5940 area_for(void* address)
5942 return vm_area_for((addr_t)address, true);
5946 area_id
5947 find_area(const char* name)
5949 return VMAreaHash::Find(name);
5953 status_t
5954 _get_area_info(area_id id, area_info* info, size_t size)
5956 if (size != sizeof(area_info) || info == NULL)
5957 return B_BAD_VALUE;
5959 AddressSpaceReadLocker locker;
5960 VMArea* area;
5961 status_t status = locker.SetFromArea(id, area);
5962 if (status != B_OK)
5963 return status;
5965 fill_area_info(area, info, size);
5966 return B_OK;
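/*! Usage sketch (illustrative, not from the original source): looking up the
    area that contains an address via area_for() and querying its properties
    with the public get_area_info() wrapper. The function name is an
    assumption for the example.
    \code
    static void
    dump_area_of(void* address)
    {
        area_id id = area_for(address);
        if (id < 0)
            return;

        area_info info;
        if (get_area_info(id, &info) == B_OK) {
            dprintf("area %" B_PRId32 " \"%s\": %p, %" B_PRIu64 " bytes\n",
                info.area, info.name, info.address, (uint64)info.size);
        }
    }
    \endcode
*/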
5970 status_t
5971 _get_next_area_info(team_id team, ssize_t* cookie, area_info* info, size_t size)
5973 addr_t nextBase = *(addr_t*)cookie;
5975 // we're already through the list
5976 if (nextBase == (addr_t)-1)
5977 return B_ENTRY_NOT_FOUND;
5979 if (team == B_CURRENT_TEAM)
5980 team = team_get_current_team_id();
5982 AddressSpaceReadLocker locker(team);
5983 if (!locker.IsLocked())
5984 return B_BAD_TEAM_ID;
5986 VMArea* area;
5987 for (VMAddressSpace::AreaIterator it
5988 = locker.AddressSpace()->GetAreaIterator();
5989 (area = it.Next()) != NULL;) {
5990 if (area->Base() > nextBase)
5991 break;
5994 if (area == NULL) {
5995 nextBase = (addr_t)-1;
5996 return B_ENTRY_NOT_FOUND;
5999 fill_area_info(area, info, size);
6000 *cookie = (ssize_t)(area->Base());
6002 return B_OK;
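/*! Usage sketch (illustrative, not from the original source): walking all
    areas of a team with the cookie-based iteration this function implements,
    via the public get_next_area_info() wrapper. The function name is an
    assumption; \c B_CURRENT_TEAM works for the calling team.
    \code
    static void
    list_areas(team_id team)
    {
        ssize_t cookie = 0;
        area_info info;
        while (get_next_area_info(team, &cookie, &info) == B_OK) {
            dprintf("%p - %p  %s\n", info.address,
                (void*)((addr_t)info.address + info.size), info.name);
        }
    }
    \endcode
*/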
6006 status_t
6007 set_area_protection(area_id area, uint32 newProtection)
6009 return vm_set_area_protection(VMAddressSpace::KernelID(), area,
6010 newProtection, true);
6014 status_t
6015 resize_area(area_id areaID, size_t newSize)
6017 return vm_resize_area(areaID, newSize, true);
6021 /*! Transfers the specified area to a new team. The caller must be the owner
6022 of the area.
6024 area_id
6025 transfer_area(area_id id, void** _address, uint32 addressSpec, team_id target,
6026 bool kernel)
6028 area_info info;
6029 status_t status = get_area_info(id, &info);
6030 if (status != B_OK)
6031 return status;
6033 if (info.team != thread_get_current_thread()->team->id)
6034 return B_PERMISSION_DENIED;
6036 area_id clonedArea = vm_clone_area(target, info.name, _address,
6037 addressSpec, info.protection, REGION_NO_PRIVATE_MAP, id, kernel);
6038 if (clonedArea < 0)
6039 return clonedArea;
6041 status = vm_delete_area(info.team, id, kernel);
6042 if (status != B_OK) {
6043 vm_delete_area(target, clonedArea, kernel);
6044 return status;
6047 // TODO: The clonedArea is B_SHARED_AREA, which is not really desired.
6049 return clonedArea;
6053 extern "C" area_id
6054 __map_physical_memory_haiku(const char* name, phys_addr_t physicalAddress,
6055 size_t numBytes, uint32 addressSpec, uint32 protection,
6056 void** _virtualAddress)
6058 if (!arch_vm_supports_protection(protection))
6059 return B_NOT_SUPPORTED;
6061 fix_protection(&protection);
6063 return vm_map_physical_memory(VMAddressSpace::KernelID(), name,
6064 _virtualAddress, addressSpec, numBytes, protection, physicalAddress,
6065 false);
6069 area_id
6070 clone_area(const char* name, void** _address, uint32 addressSpec,
6071 uint32 protection, area_id source)
6073 if ((protection & B_KERNEL_PROTECTION) == 0)
6074 protection |= B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA;
6076 return vm_clone_area(VMAddressSpace::KernelID(), name, _address,
6077 addressSpec, protection, REGION_NO_PRIVATE_MAP, source, true);
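/*! Usage sketch (illustrative, not from the original source): cloning an
    existing area (e.g. a frame buffer) into the kernel address space. The
    function name, area name, and protection are assumptions for the example.
    \code
    static area_id
    map_frame_buffer(area_id sourceArea, void** _base)
    {
        return clone_area("cloned frame buffer", _base, B_ANY_KERNEL_ADDRESS,
            B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, sourceArea);
    }
    \endcode
*/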
6081 area_id
6082 create_area_etc(team_id team, const char* name, uint32 size, uint32 lock,
6083 uint32 protection, uint32 flags, uint32 guardSize,
6084 const virtual_address_restrictions* virtualAddressRestrictions,
6085 const physical_address_restrictions* physicalAddressRestrictions,
6086 void** _address)
6088 fix_protection(&protection);
6090 return vm_create_anonymous_area(team, name, size, lock, protection, flags,
6091 guardSize, virtualAddressRestrictions, physicalAddressRestrictions,
6092 true, _address);
6096 extern "C" area_id
6097 __create_area_haiku(const char* name, void** _address, uint32 addressSpec,
6098 size_t size, uint32 lock, uint32 protection)
6100 fix_protection(&protection);
6102 virtual_address_restrictions virtualRestrictions = {};
6103 virtualRestrictions.address = *_address;
6104 virtualRestrictions.address_specification = addressSpec;
6105 physical_address_restrictions physicalRestrictions = {};
6106 return vm_create_anonymous_area(VMAddressSpace::KernelID(), name, size,
6107 lock, protection, 0, 0, &virtualRestrictions, &physicalRestrictions,
6108 true, _address);
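/*! Usage sketch (illustrative, not from the original source): creating a
    physically contiguous kernel buffer through create_area(), the public name
    the versioned symbol above is mapped to. The function name, area name, and
    size are assumptions for the example.
    \code
    static void*
    allocate_contiguous_buffer(area_id* _area)
    {
        void* base = NULL;
        *_area = create_area("contiguous buffer", &base, B_ANY_KERNEL_ADDRESS,
            16 * B_PAGE_SIZE, B_CONTIGUOUS,
            B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
        return *_area >= 0 ? base : NULL;
    }
    \endcode
*/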
6112 status_t
6113 delete_area(area_id area)
6115 return vm_delete_area(VMAddressSpace::KernelID(), area, true);
6119 // #pragma mark - Userland syscalls
6122 status_t
6123 _user_reserve_address_range(addr_t* userAddress, uint32 addressSpec,
6124 addr_t size)
6126 // filter out some unavailable values (for userland)
6127 switch (addressSpec) {
6128 case B_ANY_KERNEL_ADDRESS:
6129 case B_ANY_KERNEL_BLOCK_ADDRESS:
6130 return B_BAD_VALUE;
6133 addr_t address;
6135 if (!IS_USER_ADDRESS(userAddress)
6136 || user_memcpy(&address, userAddress, sizeof(address)) != B_OK)
6137 return B_BAD_ADDRESS;
6139 status_t status = vm_reserve_address_range(
6140 VMAddressSpace::CurrentID(), (void**)&address, addressSpec, size,
6141 RESERVED_AVOID_BASE);
6142 if (status != B_OK)
6143 return status;
6145 if (user_memcpy(userAddress, &address, sizeof(address)) != B_OK) {
6146 vm_unreserve_address_range(VMAddressSpace::CurrentID(),
6147 (void*)address, size);
6148 return B_BAD_ADDRESS;
6151 return B_OK;
6155 status_t
6156 _user_unreserve_address_range(addr_t address, addr_t size)
6158 return vm_unreserve_address_range(VMAddressSpace::CurrentID(),
6159 (void*)address, size);
6163 area_id
6164 _user_area_for(void* address)
6166 return vm_area_for((addr_t)address, false);
6170 area_id
6171 _user_find_area(const char* userName)
6173 char name[B_OS_NAME_LENGTH];
6175 if (!IS_USER_ADDRESS(userName)
6176 || user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK)
6177 return B_BAD_ADDRESS;
6179 return find_area(name);
6183 status_t
6184 _user_get_area_info(area_id area, area_info* userInfo)
6186 if (!IS_USER_ADDRESS(userInfo))
6187 return B_BAD_ADDRESS;
6189 area_info info;
6190 status_t status = get_area_info(area, &info);
6191 if (status < B_OK)
6192 return status;
6194 // TODO: do we want to prevent userland from seeing kernel protections?
6195 //info.protection &= B_USER_PROTECTION;
6197 if (user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
6198 return B_BAD_ADDRESS;
6200 return status;
6204 status_t
6205 _user_get_next_area_info(team_id team, ssize_t* userCookie, area_info* userInfo)
6207 ssize_t cookie;
6209 if (!IS_USER_ADDRESS(userCookie)
6210 || !IS_USER_ADDRESS(userInfo)
6211 || user_memcpy(&cookie, userCookie, sizeof(ssize_t)) < B_OK)
6212 return B_BAD_ADDRESS;
6214 area_info info;
6215 status_t status = _get_next_area_info(team, &cookie, &info,
6216 sizeof(area_info));
6217 if (status != B_OK)
6218 return status;
6220 //info.protection &= B_USER_PROTECTION;
6222 if (user_memcpy(userCookie, &cookie, sizeof(ssize_t)) < B_OK
6223 || user_memcpy(userInfo, &info, sizeof(area_info)) < B_OK)
6224 return B_BAD_ADDRESS;
6226 return status;
6230 status_t
6231 _user_set_area_protection(area_id area, uint32 newProtection)
6233 if ((newProtection & ~B_USER_PROTECTION) != 0)
6234 return B_BAD_VALUE;
6236 return vm_set_area_protection(VMAddressSpace::CurrentID(), area,
6237 newProtection, false);
6241 status_t
6242 _user_resize_area(area_id area, size_t newSize)
6244 // TODO: Since we restrict deleting of areas to those owned by the team,
6245 // we should also do that for resizing (check other functions, too).
6246 return vm_resize_area(area, newSize, false);
6250 area_id
6251 _user_transfer_area(area_id area, void** userAddress, uint32 addressSpec,
6252 team_id target)
6254 // filter out some unavailable values (for userland)
6255 switch (addressSpec) {
6256 case B_ANY_KERNEL_ADDRESS:
6257 case B_ANY_KERNEL_BLOCK_ADDRESS:
6258 return B_BAD_VALUE;
6261 void* address;
6262 if (!IS_USER_ADDRESS(userAddress)
6263 || user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6264 return B_BAD_ADDRESS;
6266 area_id newArea = transfer_area(area, &address, addressSpec, target, false);
6267 if (newArea < B_OK)
6268 return newArea;
6270 if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6271 return B_BAD_ADDRESS;
6273 return newArea;
6277 area_id
6278 _user_clone_area(const char* userName, void** userAddress, uint32 addressSpec,
6279 uint32 protection, area_id sourceArea)
6281 char name[B_OS_NAME_LENGTH];
6282 void* address;
6284 // filter out some unavailable values (for userland)
6285 switch (addressSpec) {
6286 case B_ANY_KERNEL_ADDRESS:
6287 case B_ANY_KERNEL_BLOCK_ADDRESS:
6288 return B_BAD_VALUE;
6290 if ((protection & ~B_USER_AREA_FLAGS) != 0)
6291 return B_BAD_VALUE;
6293 if (!IS_USER_ADDRESS(userName)
6294 || !IS_USER_ADDRESS(userAddress)
6295 || user_strlcpy(name, userName, sizeof(name)) < B_OK
6296 || user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6297 return B_BAD_ADDRESS;
6299 fix_protection(&protection);
6301 area_id clonedArea = vm_clone_area(VMAddressSpace::CurrentID(), name,
6302 &address, addressSpec, protection, REGION_NO_PRIVATE_MAP, sourceArea,
6303 false);
6304 if (clonedArea < B_OK)
6305 return clonedArea;
6307 if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6308 delete_area(clonedArea);
6309 return B_BAD_ADDRESS;
6312 return clonedArea;
6316 area_id
6317 _user_create_area(const char* userName, void** userAddress, uint32 addressSpec,
6318 size_t size, uint32 lock, uint32 protection)
6320 char name[B_OS_NAME_LENGTH];
6321 void* address;
6323 // filter out some unavailable values (for userland)
6324 switch (addressSpec) {
6325 case B_ANY_KERNEL_ADDRESS:
6326 case B_ANY_KERNEL_BLOCK_ADDRESS:
6327 return B_BAD_VALUE;
6329 if ((protection & ~B_USER_AREA_FLAGS) != 0)
6330 return B_BAD_VALUE;
6332 if (!IS_USER_ADDRESS(userName)
6333 || !IS_USER_ADDRESS(userAddress)
6334 || user_strlcpy(name, userName, sizeof(name)) < B_OK
6335 || user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6336 return B_BAD_ADDRESS;
6338 if (addressSpec == B_EXACT_ADDRESS
6339 && IS_KERNEL_ADDRESS(address))
6340 return B_BAD_VALUE;
6342 if (addressSpec == B_ANY_ADDRESS)
6343 addressSpec = B_RANDOMIZED_ANY_ADDRESS;
6344 if (addressSpec == B_BASE_ADDRESS)
6345 addressSpec = B_RANDOMIZED_BASE_ADDRESS;
6347 fix_protection(&protection);
6349 virtual_address_restrictions virtualRestrictions = {};
6350 virtualRestrictions.address = address;
6351 virtualRestrictions.address_specification = addressSpec;
6352 physical_address_restrictions physicalRestrictions = {};
6353 area_id area = vm_create_anonymous_area(VMAddressSpace::CurrentID(), name,
6354 size, lock, protection, 0, 0, &virtualRestrictions,
6355 &physicalRestrictions, false, &address);
6357 if (area >= B_OK
6358 && user_memcpy(userAddress, &address, sizeof(address)) < B_OK) {
6359 delete_area(area);
6360 return B_BAD_ADDRESS;
6363 return area;
6367 status_t
6368 _user_delete_area(area_id area)
6370 // Unlike the BeOS implementation, you can now only delete areas
6371 // that you have created yourself from userland.
6372 // The documentation for delete_area() explicitly states that this
6373 // would be restricted in the future, and so it now is.
6374 return vm_delete_area(VMAddressSpace::CurrentID(), area, false);
6378 // TODO: create a BeOS style call for this!
6380 area_id
6381 _user_map_file(const char* userName, void** userAddress, uint32 addressSpec,
6382 size_t size, uint32 protection, uint32 mapping, bool unmapAddressRange,
6383 int fd, off_t offset)
6385 char name[B_OS_NAME_LENGTH];
6386 void* address;
6387 area_id area;
6389 if ((protection & ~B_USER_AREA_FLAGS) != 0)
6390 return B_BAD_VALUE;
6392 fix_protection(&protection);
6394 if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userAddress)
6395 || user_strlcpy(name, userName, B_OS_NAME_LENGTH) < B_OK
6396 || user_memcpy(&address, userAddress, sizeof(address)) < B_OK)
6397 return B_BAD_ADDRESS;
6399 if (addressSpec == B_EXACT_ADDRESS) {
6400 if ((addr_t)address + size < (addr_t)address
6401 || (addr_t)address % B_PAGE_SIZE != 0) {
6402 return B_BAD_VALUE;
6404 if (!IS_USER_ADDRESS(address)
6405 || !IS_USER_ADDRESS((addr_t)address + size)) {
6406 return B_BAD_ADDRESS;
6410 area = _vm_map_file(VMAddressSpace::CurrentID(), name, &address,
6411 addressSpec, size, protection, mapping, unmapAddressRange, fd, offset,
6412 false);
6413 if (area < B_OK)
6414 return area;
6416 if (user_memcpy(userAddress, &address, sizeof(address)) < B_OK)
6417 return B_BAD_ADDRESS;
6419 return area;
6423 status_t
6424 _user_unmap_memory(void* _address, size_t size)
6426 addr_t address = (addr_t)_address;
6428 // check params
6429 if (size == 0 || (addr_t)address + size < (addr_t)address
6430 || (addr_t)address % B_PAGE_SIZE != 0) {
6431 return B_BAD_VALUE;
6434 if (!IS_USER_ADDRESS(address) || !IS_USER_ADDRESS((addr_t)address + size))
6435 return B_BAD_ADDRESS;
6437 // Write lock the address space and ensure the address range is not wired.
6438 AddressSpaceWriteLocker locker;
6439 do {
6440 status_t status = locker.SetTo(team_get_current_team_id());
6441 if (status != B_OK)
6442 return status;
6443 } while (wait_if_address_range_is_wired(locker.AddressSpace(), address,
6444 size, &locker));
6446 // unmap
6447 return unmap_address_range(locker.AddressSpace(), address, size, false);
6451 status_t
6452 _user_set_memory_protection(void* _address, size_t size, uint32 protection)
6454 // check address range
6455 addr_t address = (addr_t)_address;
6456 size = PAGE_ALIGN(size);
6458 if ((address % B_PAGE_SIZE) != 0)
6459 return B_BAD_VALUE;
6460 if ((addr_t)address + size < (addr_t)address || !IS_USER_ADDRESS(address)
6461 || !IS_USER_ADDRESS((addr_t)address + size)) {
6462 // weird error code required by POSIX
6463 return ENOMEM;
6466 // extend and check protection
6467 if ((protection & ~B_USER_PROTECTION) != 0)
6468 return B_BAD_VALUE;
6470 fix_protection(&protection);
6472 // We need to write lock the address space, since we're going to play with
6473 // the areas. Also make sure that none of the areas is wired and that we're
6474 // actually allowed to change the protection.
6475 AddressSpaceWriteLocker locker;
6477 bool restart;
6478 do {
6479 restart = false;
6481 status_t status = locker.SetTo(team_get_current_team_id());
6482 if (status != B_OK)
6483 return status;
6485 // First round: Check whether the whole range is covered by areas and we
6486 // are allowed to modify them.
6487 addr_t currentAddress = address;
6488 size_t sizeLeft = size;
6489 while (sizeLeft > 0) {
6490 VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6491 if (area == NULL)
6492 return B_NO_MEMORY;
6494 if ((area->protection & B_KERNEL_AREA) != 0)
6495 return B_NOT_ALLOWED;
6497 // TODO: For (shared) mapped files we should check whether the new
6498 // protections are compatible with the file permissions. We don't
6499 // have a way to do that yet, though.
6501 addr_t offset = currentAddress - area->Base();
6502 size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6504 AreaCacheLocker cacheLocker(area);
6506 if (wait_if_area_range_is_wired(area, currentAddress, rangeSize,
6507 &locker, &cacheLocker)) {
6508 restart = true;
6509 break;
6512 cacheLocker.Unlock();
6514 currentAddress += rangeSize;
6515 sizeLeft -= rangeSize;
6517 } while (restart);
6519 // Second round: If the protections differ from that of the area, create a
6520 // page protection array and re-map mapped pages.
6521 VMTranslationMap* map = locker.AddressSpace()->TranslationMap();
6522 addr_t currentAddress = address;
6523 size_t sizeLeft = size;
6524 while (sizeLeft > 0) {
6525 VMArea* area = locker.AddressSpace()->LookupArea(currentAddress);
6526 if (area == NULL)
6527 return B_NO_MEMORY;
6529 addr_t offset = currentAddress - area->Base();
6530 size_t rangeSize = min_c(area->Size() - offset, sizeLeft);
6532 currentAddress += rangeSize;
6533 sizeLeft -= rangeSize;
6535 if (area->page_protections == NULL) {
6536 if (area->protection == protection)
6537 continue;
6539 status_t status = allocate_area_page_protections(area);
6540 if (status != B_OK)
6541 return status;
6544 // We need to lock the complete cache chain, since we potentially unmap
6545 // pages of lower caches.
6546 VMCache* topCache = vm_area_get_locked_cache(area);
6547 VMCacheChainLocker cacheChainLocker(topCache);
6548 cacheChainLocker.LockAllSourceCaches();
6550 for (addr_t pageAddress = area->Base() + offset;
6551 pageAddress < currentAddress; pageAddress += B_PAGE_SIZE) {
6552 map->Lock();
6554 set_area_page_protection(area, pageAddress, protection);
6556 phys_addr_t physicalAddress;
6557 uint32 flags;
6559 status_t error = map->Query(pageAddress, &physicalAddress, &flags);
6560 if (error != B_OK || (flags & PAGE_PRESENT) == 0) {
6561 map->Unlock();
6562 continue;
6565 vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
6566 if (page == NULL) {
6567 panic("area %p looking up page failed for pa %#" B_PRIxPHYSADDR
6568 "\n", area, physicalAddress);
6569 map->Unlock();
6570 return B_ERROR;
6573 // If the page is not in the topmost cache and write access is
6574 // requested, we have to unmap it. Otherwise we can re-map it with
6575 // the new protection.
6576 bool unmapPage = page->Cache() != topCache
6577 && (protection & B_WRITE_AREA) != 0;
6579 if (!unmapPage)
6580 map->ProtectPage(area, pageAddress, protection);
6582 map->Unlock();
6584 if (unmapPage) {
6585 DEBUG_PAGE_ACCESS_START(page);
6586 unmap_page(area, pageAddress);
6587 DEBUG_PAGE_ACCESS_END(page);
6592 return B_OK;
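/*! Usage sketch (illustrative, not from the original source): the userland
    side of this syscall, assuming libroot's POSIX mprotect() forwards to it.
    The function name is an assumption; <sys/mman.h> and <errno.h> are
    assumed to be included. Making one page of a mapping read-only:
    \code
    static status_t
    make_page_read_only(void* pageAlignedAddress)
    {
        if (mprotect(pageAlignedAddress, B_PAGE_SIZE, PROT_READ) != 0)
            return errno;
        return B_OK;
    }
    \endcode
*/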
6596 status_t
6597 _user_sync_memory(void* _address, size_t size, uint32 flags)
6599 addr_t address = (addr_t)_address;
6600 size = PAGE_ALIGN(size);
6602 // check params
6603 if ((address % B_PAGE_SIZE) != 0)
6604 return B_BAD_VALUE;
6605 if ((addr_t)address + size < (addr_t)address || !IS_USER_ADDRESS(address)
6606 || !IS_USER_ADDRESS((addr_t)address + size)) {
6607 // weird error code required by POSIX
6608 return ENOMEM;
6611 bool writeSync = (flags & MS_SYNC) != 0;
6612 bool writeAsync = (flags & MS_ASYNC) != 0;
6613 if (writeSync && writeAsync)
6614 return B_BAD_VALUE;
6616 if (size == 0 || (!writeSync && !writeAsync))
6617 return B_OK;
6619 // iterate through the range and sync all concerned areas
6620 while (size > 0) {
6621 // read lock the address space
6622 AddressSpaceReadLocker locker;
6623 status_t error = locker.SetTo(team_get_current_team_id());
6624 if (error != B_OK)
6625 return error;
6627 // get the first area
6628 VMArea* area = locker.AddressSpace()->LookupArea(address);
6629 if (area == NULL)
6630 return B_NO_MEMORY;
6632 uint32 offset = address - area->Base();
6633 size_t rangeSize = min_c(area->Size() - offset, size);
6634 offset += area->cache_offset;
6636 // lock the cache
6637 AreaCacheLocker cacheLocker(area);
6638 if (!cacheLocker)
6639 return B_BAD_VALUE;
6640 VMCache* cache = area->cache;
6642 locker.Unlock();
6644 uint32 firstPage = offset >> PAGE_SHIFT;
6645 uint32 endPage = firstPage + (rangeSize >> PAGE_SHIFT);
6647 // write the pages
6648 if (cache->type == CACHE_TYPE_VNODE) {
6649 if (writeSync) {
6650 // synchronous
6651 error = vm_page_write_modified_page_range(cache, firstPage,
6652 endPage);
6653 if (error != B_OK)
6654 return error;
6655 } else {
6656 // asynchronous
6657 vm_page_schedule_write_page_range(cache, firstPage, endPage);
6658 // TODO: This is probably not quite what is supposed to happen.
6659 // Especially when a lot has to be written, it might take ages
6660 // until it really hits the disk.
6664 address += rangeSize;
6665 size -= rangeSize;
6668 // NOTE: If I understand it correctly, the purpose of MS_INVALIDATE is to
6669 // synchronize multiple mappings of the same file. In our VM they never get
6670 // out of sync, though, so we don't have to do anything.
6672 return B_OK;
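/*! Usage sketch (illustrative, not from the original source): the userland
    view of this syscall, assuming libroot's POSIX msync() forwards to it.
    The function name is an assumption; <sys/mman.h> and <errno.h> are
    assumed to be included. Scheduling an asynchronous flush of a file
    mapping:
    \code
    static status_t
    flush_file_mapping(void* base, size_t length)
    {
        if (msync(base, length, MS_ASYNC) != 0)
            return errno;
        return B_OK;
    }
    \endcode
*/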
6676 status_t
6677 _user_memory_advice(void* address, size_t size, uint32 advice)
6679 // TODO: Implement!
6680 return B_OK;
6684 status_t
6685 _user_get_memory_properties(team_id teamID, const void* address,
6686 uint32* _protected, uint32* _lock)
6688 if (!IS_USER_ADDRESS(_protected) || !IS_USER_ADDRESS(_lock))
6689 return B_BAD_ADDRESS;
6691 AddressSpaceReadLocker locker;
6692 status_t error = locker.SetTo(teamID);
6693 if (error != B_OK)
6694 return error;
6696 VMArea* area = locker.AddressSpace()->LookupArea((addr_t)address);
6697 if (area == NULL)
6698 return B_NO_MEMORY;
6701 uint32 protection = area->protection;
6702 if (area->page_protections != NULL)
6703 protection = get_area_page_protection(area, (addr_t)address);
6705 uint32 wiring = area->wiring;
6707 locker.Unlock();
6709 error = user_memcpy(_protected, &protection, sizeof(protection));
6710 if (error != B_OK)
6711 return error;
6713 error = user_memcpy(_lock, &wiring, sizeof(wiring));
6715 return error;
6719 // #pragma mark -- compatibility
6722 #if defined(__INTEL__) && B_HAIKU_PHYSICAL_BITS > 32
6725 struct physical_entry_beos {
6726 uint32 address;
6727 uint32 size;
6731 /*! The physical_entry structure has changed. We need to translate it to the
6732 old one.
6734 extern "C" int32
6735 __get_memory_map_beos(const void* _address, size_t numBytes,
6736 physical_entry_beos* table, int32 numEntries)
6738 if (numEntries <= 0)
6739 return B_BAD_VALUE;
6741 const uint8* address = (const uint8*)_address;
6743 int32 count = 0;
6744 while (numBytes > 0 && count < numEntries) {
6745 physical_entry entry;
6746 status_t result = __get_memory_map_haiku(address, numBytes, &entry, 1);
6747 if (result < 0) {
6748 if (result != B_BUFFER_OVERFLOW)
6749 return result;
6752 if (entry.address >= (phys_addr_t)1 << 32) {
6753 panic("get_memory_map(): Address is greater 4 GB!");
6754 return B_ERROR;
6757 table[count].address = entry.address;
6758 table[count++].size = entry.size;
6760 address += entry.size;
6761 numBytes -= entry.size;
6764 // null-terminate the table, if possible
6765 if (count < numEntries) {
6766 table[count].address = 0;
6767 table[count].size = 0;
6770 return B_OK;
6774 /*! The type of the \a physicalAddress parameter has changed from void* to
6775 phys_addr_t.
6777 extern "C" area_id
6778 __map_physical_memory_beos(const char* name, void* physicalAddress,
6779 size_t numBytes, uint32 addressSpec, uint32 protection,
6780 void** _virtualAddress)
6782 return __map_physical_memory_haiku(name, (addr_t)physicalAddress, numBytes,
6783 addressSpec, protection, _virtualAddress);
6787 /*! The caller might not be able to deal with physical addresses >= 4 GB, so
6788 we meddle with the \a lock parameter to force a 32-bit allocation.
6790 extern "C" area_id
6791 __create_area_beos(const char* name, void** _address, uint32 addressSpec,
6792 size_t size, uint32 lock, uint32 protection)
6794 switch (lock) {
6795 case B_NO_LOCK:
6796 break;
6797 case B_FULL_LOCK:
6798 case B_LAZY_LOCK:
6799 lock = B_32_BIT_FULL_LOCK;
6800 break;
6801 case B_CONTIGUOUS:
6802 lock = B_32_BIT_CONTIGUOUS;
6803 break;
6806 return __create_area_haiku(name, _address, addressSpec, size, lock,
6807 protection);
6811 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_beos", "get_memory_map@",
6812 "BASE");
6813 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_beos",
6814 "map_physical_memory@", "BASE");
6815 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_beos", "create_area@",
6816 "BASE");
6818 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
6819 "get_memory_map@@", "1_ALPHA3");
6820 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
6821 "map_physical_memory@@", "1_ALPHA3");
6822 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
6823 "1_ALPHA3");
6826 #else
6829 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__get_memory_map_haiku",
6830 "get_memory_map@@", "BASE");
6831 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__map_physical_memory_haiku",
6832 "map_physical_memory@@", "BASE");
6833 DEFINE_LIBROOT_KERNEL_SYMBOL_VERSION("__create_area_haiku", "create_area@@",
6834 "BASE");
6837 #endif // defined(__INTEL__) && B_HAIKU_PHYSICAL_BITS > 32