vfs: check userland buffers before reading them.
haiku.git: src/system/kernel/cache/file_cache.cpp
/*
 * Copyright 2004-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 */


#include "vnode_store.h"

#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#include <KernelExport.h>
#include <fs_cache.h>

#include <condition_variable.h>
#include <file_cache.h>
#include <generic_syscall.h>
#include <low_resource_manager.h>
#include <thread.h>
#include <util/AutoLock.h>
#include <util/kernel_cpp.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/VMCache.h>

#include "IORequest.h"


//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) dprintf x
#else
#	define TRACE(x) ;
#endif

// maximum number of iovecs per request
#define MAX_IO_VECS			32	// 128 kB
#define MAX_FILE_IO_VECS	32

#define BYPASS_IO_SIZE		65536
#define LAST_ACCESSES		3

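// A file_cache_ref keeps the last LAST_ACCESSES end offsets in a small ring
// buffer so sequential access patterns can be detected (see push_access() and
// access_is_sequential() below); writes are stored as negative offsets to
// tell them apart from reads.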
struct file_cache_ref {
	VMCache			*cache;
	struct vnode	*vnode;
	off_t			last_access[LAST_ACCESSES];
		// TODO: it would probably be enough to only store the least
		//	significant 31 bits, and make this uint32 (one bit for
		//	write vs. read)
	int32			last_access_index;
	uint16			disabled_count;

	inline void SetLastAccess(int32 index, off_t access, bool isWrite)
	{
		// we remember writes as negative offsets
		last_access[index] = isWrite ? -access : access;
	}

	inline off_t LastAccess(int32 index, bool isWrite) const
	{
		return isWrite ? -last_access[index] : last_access[index];
	}

	inline uint32 LastAccessPageOffset(int32 index, bool isWrite)
	{
		return LastAccess(index, isWrite) >> PAGE_SHIFT;
	}
};

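// PrecacheIO bundles the pages, I/O vectors, and bookkeeping for one
// asynchronous read-ahead request issued by cache_prefetch_vnode(). The
// object deletes itself in IOFinished() once the request has completed.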
class PrecacheIO : public AsyncIOCallback {
public:
								PrecacheIO(file_cache_ref* ref, off_t offset,
									generic_size_t size);
								~PrecacheIO();

			status_t			Prepare(vm_page_reservation* reservation);
			void				ReadAsync();

	virtual	void				IOFinished(status_t status,
									bool partialTransfer,
									generic_size_t bytesTransferred);

private:
			file_cache_ref*		fRef;
			VMCache*			fCache;
			vm_page**			fPages;
			size_t				fPageCount;
			ConditionVariable*	fBusyConditions;
			generic_io_vec*		fVecs;
			off_t				fOffset;
			uint32				fVecCount;
			generic_size_t		fSize;
#if DEBUG_PAGE_ACCESS
			thread_id			fAllocatingThread;
#endif
};


typedef status_t (*cache_func)(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages);

static void add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size);


static struct cache_module_info* sCacheModule;

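// A single zeroed page, referenced by every entry of sZeroVecs, is used to
// write up to kZeroVecSize bytes of zeroes at a time without allocating a
// dedicated buffer (set up in file_cache_init(), used whenever a NULL buffer
// is passed to the write paths).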
static const uint32 kZeroVecCount = 32;
static const size_t kZeroVecSize = kZeroVecCount * B_PAGE_SIZE;
static phys_addr_t sZeroPage;	// physical address
static generic_io_vec sZeroVecs[kZeroVecCount];


// #pragma mark -


PrecacheIO::PrecacheIO(file_cache_ref* ref, off_t offset, generic_size_t size)
	:
	fRef(ref),
	fCache(ref->cache),
	fPages(NULL),
	fVecs(NULL),
	fOffset(offset),
	fVecCount(0),
	fSize(size)
{
	fPageCount = (size + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
	fCache->AcquireRefLocked();
}


PrecacheIO::~PrecacheIO()
{
	delete[] fPages;
	delete[] fVecs;
	fCache->ReleaseRefLocked();
}


status_t
PrecacheIO::Prepare(vm_page_reservation* reservation)
{
	if (fPageCount == 0)
		return B_BAD_VALUE;

	fPages = new(std::nothrow) vm_page*[fPageCount];
	if (fPages == NULL)
		return B_NO_MEMORY;

	fVecs = new(std::nothrow) generic_io_vec[fPageCount];
	if (fVecs == NULL)
		return B_NO_MEMORY;

	// allocate pages for the cache and mark them busy
	uint32 i = 0;
	for (generic_size_t pos = 0; pos < fSize; pos += B_PAGE_SIZE) {
		vm_page* page = vm_page_allocate_page(reservation,
			PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		fCache->InsertPage(page, fOffset + pos);

		add_to_iovec(fVecs, fVecCount, fPageCount,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
		fPages[i++] = page;
	}

#if DEBUG_PAGE_ACCESS
	fAllocatingThread = find_thread(NULL);
#endif

	return B_OK;
}


void
PrecacheIO::ReadAsync()
{
	// This object is going to be deleted after the I/O request has been
	// fulfilled
	vfs_asynchronous_read_pages(fRef->vnode, NULL, fOffset, fVecs, fVecCount,
		fSize, B_PHYSICAL_IO_REQUEST, this);
}


void
PrecacheIO::IOFinished(status_t status, bool partialTransfer,
	generic_size_t bytesTransferred)
{
	AutoLocker<VMCache> locker(fCache);

	// Make successfully loaded pages accessible again (partially
	// transferred pages are considered failed)
	phys_size_t pagesTransferred
		= (bytesTransferred + B_PAGE_SIZE - 1) / B_PAGE_SIZE;

	if (fOffset + (off_t)bytesTransferred > fCache->virtual_end)
		bytesTransferred = fCache->virtual_end - fOffset;

	for (uint32 i = 0; i < pagesTransferred; i++) {
		if (i == pagesTransferred - 1
			&& (bytesTransferred % B_PAGE_SIZE) != 0) {
			// clear partial page
			size_t bytesTouched = bytesTransferred % B_PAGE_SIZE;
			vm_memset_physical(
				((phys_addr_t)fPages[i]->physical_page_number << PAGE_SHIFT)
					+ bytesTouched,
				0, B_PAGE_SIZE - bytesTouched);
		}

		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);

		fCache->MarkPageUnbusy(fPages[i]);

		DEBUG_PAGE_ACCESS_END(fPages[i]);
	}

	// Free pages after failed I/O
	for (uint32 i = pagesTransferred; i < fPageCount; i++) {
		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);
		fCache->NotifyPageEvents(fPages[i], PAGE_EVENT_NOT_BUSY);
		fCache->RemovePage(fPages[i]);
		vm_page_set_state(fPages[i], PAGE_STATE_FREE);
	}

	delete this;
}


// #pragma mark -

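/*!	Appends the given range to the vector array, merging it with the previous
	vector if the two are physically contiguous. Panics if more than \a max
	vectors would be needed.
*/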
static void
add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size)
{
	if (index > 0 && vecs[index - 1].base + vecs[index - 1].length == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].length += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].base = address;
	vecs[index].length = size;
	index++;
}

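/*!	Returns whether the recorded access history looks sequential: push_access()
	zeroes a history slot whenever an access does not start where a previous
	one ended, so a non-zero entry in the oldest slot indicates a run of
	contiguous accesses.
*/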
static inline bool
access_is_sequential(file_cache_ref* ref)
{
	return ref->last_access[ref->last_access_index] != 0;
}


static inline void
push_access(file_cache_ref* ref, off_t offset, generic_size_t bytes,
	bool isWrite)
{
	TRACE(("%p: push %Ld, %ld, %s\n", ref, offset, bytes,
		isWrite ? "write" : "read"));

	int32 index = ref->last_access_index;
	int32 previous = index - 1;
	if (previous < 0)
		previous = LAST_ACCESSES - 1;

	if (offset != ref->LastAccess(previous, isWrite))
		ref->last_access[previous] = 0;

	ref->SetLastAccess(index, offset + bytes, isWrite);

	if (++index >= LAST_ACCESSES)
		index = 0;
	ref->last_access_index = index;
}

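/*!	Reserves \a reservePages pages for the upcoming I/O. If the system is low
	on pages and this cache is unmapped and accessed sequentially, it first
	tries to relieve the pressure itself: for writes it flushes the most
	recently written range, for reads it frees already consumed cached pages.
*/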
static void
reserve_pages(file_cache_ref* ref, vm_page_reservation* reservation,
	size_t reservePages, bool isWrite)
{
	if (low_resource_state(B_KERNEL_RESOURCE_PAGES) != B_NO_LOW_RESOURCE) {
		VMCache* cache = ref->cache;
		cache->Lock();

		if (cache->consumers.IsEmpty() && cache->areas == NULL
			&& access_is_sequential(ref)) {
			// we are not mapped, and we're accessed sequentially

			if (isWrite) {
				// Just write some pages back, and actually wait until they
				// have been written back in order to relieve the page pressure
				// a bit.
				int32 index = ref->last_access_index;
				int32 previous = index - 1;
				if (previous < 0)
					previous = LAST_ACCESSES - 1;

				vm_page_write_modified_page_range(cache,
					ref->LastAccessPageOffset(previous, true),
					ref->LastAccessPageOffset(index, true));
			} else {
				// free some pages from our cache
				// TODO: start with oldest
				uint32 left = reservePages;
				vm_page* page;
				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
						(page = it.Next()) != NULL && left > 0;) {
					if (page->State() == PAGE_STATE_CACHED && !page->busy) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT(!page->IsMapped());
						ASSERT(!page->modified);
						cache->RemovePage(page);
						vm_page_set_state(page, PAGE_STATE_FREE);
						left--;
					}
				}
			}
		}
		cache->Unlock();
	}

	vm_page_reserve_pages(reservation, reservePages, VM_PRIORITY_USER);
}


static inline status_t
read_pages_and_clear_partial(file_cache_ref* ref, void* cookie, off_t offset,
	const generic_io_vec* vecs, size_t count, uint32 flags,
	generic_size_t* _numBytes)
{
	generic_size_t bytesUntouched = *_numBytes;

	status_t status = vfs_read_pages(ref->vnode, cookie, offset, vecs, count,
		flags, _numBytes);

	generic_size_t bytesEnd = *_numBytes;

	if (offset + (off_t)bytesEnd > ref->cache->virtual_end)
		bytesEnd = ref->cache->virtual_end - offset;

	if (status == B_OK && bytesEnd < bytesUntouched) {
		// Clear out any leftovers that were not touched by the above read.
		// We're doing this here so that not every file system/device has to
		// implement this.
		bytesUntouched -= bytesEnd;

		for (int32 i = count; i-- > 0 && bytesUntouched != 0;) {
			generic_size_t length = min_c(bytesUntouched, vecs[i].length);
			vm_memset_physical(vecs[i].base + vecs[i].length - length, 0,
				length);

			bytesUntouched -= length;
		}
	}

	return status;
}


/*!	Reads the requested amount of data into the cache, and allocates
	pages needed to fulfill that request. This function is called by cache_io().
	It can only handle a certain amount of bytes, and the caller must make
	sure that it matches that criterion.
	The cache_ref lock must be held when calling this function; during
	operation it will unlock the cache, though.
*/
static status_t
read_into_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_into_cache(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	VMCache* cache = ref->cache;

	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;

	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation, PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
			// TODO: check if the array is large enough (currently panics)!
	}

	push_access(ref, offset, bufferSize, false);
	cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// read file into reserved pages
	status_t status = read_pages_and_clear_partial(ref, cookie, offset, vecs,
		vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
	if (status != B_OK) {
		// reading failed, free allocated pages

		dprintf("file_cache: read pages failed: %s\n", strerror(status));

		cache->Lock();

		for (int32 i = 0; i < pageIndex; i++) {
			cache->NotifyPageEvents(pages[i], PAGE_EVENT_NOT_BUSY);
			cache->RemovePage(pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
		}

		return status;
	}

	// copy the pages if needed and unmap them again

	for (int32 i = 0; i < pageIndex; i++) {
		if (useBuffer && bufferSize != 0) {
			size_t bytes = min_c(bufferSize, (size_t)B_PAGE_SIZE - pageOffset);

			vm_memcpy_from_physical((void*)buffer,
				pages[i]->physical_page_number * B_PAGE_SIZE + pageOffset,
				bytes, IS_USER_ADDRESS(buffer));

			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}
	}

	reserve_pages(ref, reservation, reservePages, false);
	cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		DEBUG_PAGE_ACCESS_END(pages[i]);

		cache->MarkPageUnbusy(pages[i]);
	}

	return B_OK;
}

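/*!	Reads data directly from the file into the buffer, bypassing the cache.
	cache_io() selects this hook for large requests in low memory situations.
	As with read_into_cache(), the cache_ref lock must be held and is
	temporarily dropped during the I/O.
*/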
static status_t
read_from_file(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_from_file(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	if (!useBuffer)
		return B_OK;

	generic_io_vec vec;
	vec.base = buffer;
	vec.length = bufferSize;

	push_access(ref, offset, bufferSize, false);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	generic_size_t toRead = bufferSize;
	status_t status = vfs_read_pages(ref->vnode, cookie, offset + pageOffset,
		&vec, 1, 0, &toRead);

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, false);

	ref->cache->Lock();

	return status;
}


/*!	Like read_into_cache() but writes data into the cache.
	To preserve data consistency, it might also read pages into the cache,
	though, if only a partial page gets written.
	The same restrictions apply.
*/
static status_t
write_to_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;
	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;
	status_t status = B_OK;

	// TODO: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		// TODO: if space is becoming tight, and this cache is already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		// TODO: the pages we allocate here should have been reserved upfront
		//	in cache_io()
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation,
			(writeThrough ? PAGE_STATE_CACHED : PAGE_STATE_MODIFIED)
				| VM_PAGE_ALLOC_BUSY);

		page->modified = !writeThrough;

		ref->cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
	}

	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// copy contents (and read in partially written pages first)

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the
		// page from the file to have consistent data in the cache
		generic_io_vec readVec = { vecs[0].base, B_PAGE_SIZE };
		generic_size_t bytesRead = B_PAGE_SIZE;

		status = vfs_read_pages(ref->vnode, cookie, offset, &readVec, 1,
			B_PHYSICAL_IO_REQUEST, &bytesRead);
		// ToDo: handle errors for real!
		if (status < B_OK)
			panic("1. vfs_read_pages() failed: %s!\n", strerror(status));
	}

	size_t lastPageOffset = (pageOffset + bufferSize) % B_PAGE_SIZE;
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		generic_addr_t last = vecs[vecCount - 1].base
			+ vecs[vecCount - 1].length - B_PAGE_SIZE;

		if ((off_t)(offset + pageOffset + bufferSize) == ref->cache->virtual_end) {
			// the space in the page after this write action needs to be cleaned
			vm_memset_physical(last + lastPageOffset, 0,
				B_PAGE_SIZE - lastPageOffset);
		} else {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			generic_io_vec readVec = { last, B_PAGE_SIZE };
			generic_size_t bytesRead = B_PAGE_SIZE;

			status = vfs_read_pages(ref->vnode, cookie,
				PAGE_ALIGN(offset + pageOffset + bufferSize) - B_PAGE_SIZE,
				&readVec, 1, B_PHYSICAL_IO_REQUEST, &bytesRead);
			// ToDo: handle errors for real!
			if (status < B_OK)
				panic("vfs_read_pages() failed: %s!\n", strerror(status));

			if (bytesRead < B_PAGE_SIZE) {
				// the space beyond the file size needs to be cleaned
				vm_memset_physical(last + bytesRead, 0,
					B_PAGE_SIZE - bytesRead);
			}
		}
	}

	for (uint32 i = 0; i < vecCount; i++) {
		generic_addr_t base = vecs[i].base;
		generic_size_t bytes = min_c((generic_size_t)bufferSize,
			generic_size_t(vecs[i].length - pageOffset));

		if (useBuffer) {
			// copy data from user buffer
			vm_memcpy_to_physical(base + pageOffset, (void*)buffer, bytes,
				IS_USER_ADDRESS(buffer));
		} else {
			// clear buffer instead
			vm_memset_physical(base + pageOffset, 0, bytes);
		}

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		status_t status = vfs_write_pages(ref->vnode, cookie, offset, vecs,
			vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
		if (status < B_OK) {
			// ToDo: remove allocated pages, ...?
			panic("file_cache: remove allocated pages! write pages failed: %s\n",
				strerror(status));
		}
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		ref->cache->MarkPageUnbusy(pages[i]);

		DEBUG_PAGE_ACCESS_END(pages[i]);
	}

	return status;
}

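/*!	Writes the buffer contents directly to the file, bypassing the cache.
	A NULL buffer writes zeroes instead, using the shared zero page vectors.
	cache_io() selects this hook for large requests in low memory situations.
*/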
static status_t
write_to_file(file_cache_ref* ref, void* cookie, off_t offset, int32 pageOffset,
	addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	status_t status = B_OK;

	if (!useBuffer) {
		while (bufferSize > 0) {
			generic_size_t written = min_c(bufferSize, kZeroVecSize);
			status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
				sZeroVecs, kZeroVecCount, B_PHYSICAL_IO_REQUEST, &written);
			if (status != B_OK)
				return status;
			if (written == 0)
				return B_ERROR;

			bufferSize -= written;
			pageOffset += written;
		}
	} else {
		generic_io_vec vec;
		vec.base = buffer;
		vec.length = bufferSize;
		generic_size_t toWrite = bufferSize;
		status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
			&vec, 1, 0, &toWrite);
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	return status;
}

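/*!	Flushes the request part accumulated in the "last*" variables through the
	given cache function. cache_io() calls this before it has to unlock the
	cache, or when the accumulated part reaches the maximum chunk size. On
	success the "last*" state is advanced to the current position.
*/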
static inline status_t
satisfy_cache_io(file_cache_ref* ref, void* cookie, cache_func function,
	off_t offset, addr_t buffer, bool useBuffer, int32 &pageOffset,
	size_t bytesLeft, size_t &reservePages, off_t &lastOffset,
	addr_t &lastBuffer, int32 &lastPageOffset, size_t &lastLeft,
	size_t &lastReservedPages, vm_page_reservation* reservation)
{
	if (lastBuffer == buffer)
		return B_OK;

	size_t requestSize = buffer - lastBuffer;
	reservePages = min_c(MAX_IO_VECS, (lastLeft - requestSize
		+ lastPageOffset + B_PAGE_SIZE - 1) >> PAGE_SHIFT);

	status_t status = function(ref, cookie, lastOffset, lastPageOffset,
		lastBuffer, requestSize, useBuffer, reservation, reservePages);
	if (status == B_OK) {
		lastReservedPages = reservePages;
		lastBuffer = buffer;
		lastLeft = bytesLeft;
		lastOffset = offset;
		lastPageOffset = 0;
		pageOffset = 0;
	}

	return status;
}

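/*!	The generic worker behind file_cache_read() and file_cache_write(). It
	walks the request page by page: ranges already present in the cache are
	copied directly, while gaps are collected and handed to one of the
	cache_func hooks (read_into_cache()/write_to_cache(), or the
	cache-bypassing variants under memory pressure) in chunks of at most
	MAX_IO_VECS pages.
*/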
static status_t
cache_io(void* _cacheRef, void* cookie, off_t offset, addr_t buffer,
	size_t* _size, bool doWrite)
{
	if (_cacheRef == NULL)
		panic("cache_io() called with NULL ref!\n");

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	VMCache* cache = ref->cache;
	off_t fileSize = cache->virtual_end;
	bool useBuffer = buffer != 0;

	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void*)buffer, *_size, doWrite ? "write" : "read"));

	// out of bounds access?
	if (offset >= fileSize || offset < 0) {
		*_size = 0;
		return B_OK;
	}

	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size_t size = *_size;
	offset -= pageOffset;

	if ((off_t)(offset + pageOffset + size) > fileSize) {
		// adapt size to be within the file's offsets
		size = fileSize - pageOffset - offset;
		*_size = size;
	}
	if (size == 0)
		return B_OK;

	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part

	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
	size_t bytesLeft = size, lastLeft = size;
	int32 lastPageOffset = pageOffset;
	addr_t lastBuffer = buffer;
	off_t lastOffset = offset;
	size_t lastReservedPages = min_c(MAX_IO_VECS, (pageOffset + bytesLeft
		+ B_PAGE_SIZE - 1) >> PAGE_SHIFT);
	size_t reservePages = 0;
	size_t pagesProcessed = 0;
	cache_func function = NULL;

	vm_page_reservation reservation;
	reserve_pages(ref, &reservation, lastReservedPages, doWrite);

	AutoLocker<VMCache> locker(cache);

	while (bytesLeft > 0) {
		// Periodically reevaluate the low memory situation and select the
		// read/write hook accordingly
		if (pagesProcessed % 32 == 0) {
			if (size >= BYPASS_IO_SIZE
				&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
					!= B_NO_LOW_RESOURCE) {
				// In low memory situations we bypass the cache beyond a
				// certain I/O size.
				function = doWrite ? write_to_file : read_from_file;
			} else
				function = doWrite ? write_to_cache : read_into_cache;
		}

		// check if this page is already in memory
		vm_page* page = cache->LookupPage(offset);
		if (page != NULL) {
			// The page may be busy - since we need to unlock the cache sometime
			// in the near future, we need to satisfy the request of the pages
			// we didn't get yet (to make sure no one else interferes in the
			// meantime).
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;

			// Since satisfy_cache_io() unlocks the cache, we need to look up
			// the page again.
			page = cache->LookupPage(offset);
			if (page != NULL && page->busy) {
				cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, true);
				continue;
			}
		}

		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);

		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset "
			"= %lu\n", offset, page, bytesLeft, pageOffset));

		if (page != NULL) {
			if (doWrite || useBuffer) {
				// Since the following user_mem{cpy,set}() might cause a page
				// fault, which in turn might cause pages to be reserved, we
				// need to unlock the cache temporarily to avoid a potential
				// deadlock. To make sure that our page doesn't go away, we mark
				// it busy for the time.
				page->busy = true;
				locker.Unlock();

				// copy the contents of the page already in memory
				phys_addr_t pageAddress
					= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
						+ pageOffset;
				bool userBuffer = IS_USER_ADDRESS(buffer);
				if (doWrite) {
					if (useBuffer) {
						vm_memcpy_to_physical(pageAddress, (void*)buffer,
							bytesInPage, userBuffer);
					} else {
						vm_memset_physical(pageAddress, 0, bytesInPage);
					}
				} else if (useBuffer) {
					vm_memcpy_from_physical((void*)buffer, pageAddress,
						bytesInPage, userBuffer);
				}

				locker.Lock();

				if (doWrite) {
					DEBUG_PAGE_ACCESS_START(page);

					page->modified = true;

					if (page->State() != PAGE_STATE_MODIFIED)
						vm_page_set_state(page, PAGE_STATE_MODIFIED);

					DEBUG_PAGE_ACCESS_END(page);
				}

				cache->MarkPageUnbusy(page);
			}

			// If it is cached only, requeue the page, so the respective queue
			// roughly remains LRU first sorted.
			if (page->State() == PAGE_STATE_CACHED
					|| page->State() == PAGE_STATE_MODIFIED) {
				DEBUG_PAGE_ACCESS_START(page);
				vm_page_requeue(page, true);
				DEBUG_PAGE_ACCESS_END(page);
			}

			if (bytesLeft <= bytesInPage) {
				// we've read the last page, so we're done!
				locker.Unlock();
				vm_page_unreserve_pages(&reservation);
				return B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
		offset += B_PAGE_SIZE;
		pagesProcessed++;

		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;
		}
	}

	// fill the last remaining bytes of the request (either write or read)

	return function(ref, cookie, lastOffset, lastPageOffset, lastBuffer,
		lastLeft, useBuffer, &reservation, 0);
}

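/*!	Handler for the generic "cache" syscall (registered in file_cache_init()):
	CACHE_CLEAR is not yet implemented, CACHE_SET_MODULE replaces the active
	cache policy module.
*/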
static status_t
file_cache_control(const char* subsystem, uint32 function, void* buffer,
	size_t bufferSize)
{
	switch (function) {
		case CACHE_CLEAR:
			// ToDo: clear the cache
			dprintf("cache_control: clear cache!\n");
			return B_OK;

		case CACHE_SET_MODULE:
		{
			cache_module_info* module = sCacheModule;

			// unset previous module

			if (sCacheModule != NULL) {
				sCacheModule = NULL;
				snooze(100000);	// 0.1 secs
				put_module(module->info.name);
			}

			// get new module, if any

			if (buffer == NULL)
				return B_OK;

			char name[B_FILE_NAME_LENGTH];
			if (!IS_USER_ADDRESS(buffer)
				|| user_strlcpy(name, (char*)buffer,
						B_FILE_NAME_LENGTH) < B_OK)
				return B_BAD_ADDRESS;

			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
				return B_BAD_VALUE;

			dprintf("cache_control: set module %s!\n", name);

			status_t status = get_module(name, (module_info**)&module);
			if (status == B_OK)
				sCacheModule = module;

			return status;
		}
	}

	return B_BAD_HANDLER;
}

// #pragma mark - private kernel API


extern "C" void
cache_prefetch_vnode(struct vnode* vnode, off_t offset, size_t size)
{
	if (size == 0)
		return;

	VMCache* cache;
	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
		return;

	file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
	off_t fileSize = cache->virtual_end;

	if ((off_t)(offset + size) > fileSize)
		size = fileSize - offset;

	// "offset" and "size" are always aligned to B_PAGE_SIZE,
	offset = ROUNDDOWN(offset, B_PAGE_SIZE);
	size = ROUNDUP(size, B_PAGE_SIZE);

	size_t reservePages = size / B_PAGE_SIZE;

	// Don't do anything if we don't have the resources left, or the cache
	// already contains more than 2/3 of its pages
	if (offset >= fileSize || vm_page_num_unused_pages() < 2 * reservePages
		|| 3 * cache->page_count > 2 * fileSize / B_PAGE_SIZE) {
		cache->ReleaseRef();
		return;
	}

	size_t bytesToRead = 0;
	off_t lastOffset = offset;

	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, reservePages, VM_PRIORITY_USER);

	cache->Lock();

	while (true) {
		// check if this page is already in memory
		if (size > 0) {
			vm_page* page = cache->LookupPage(offset);

			offset += B_PAGE_SIZE;
			size -= B_PAGE_SIZE;

			if (page == NULL) {
				bytesToRead += B_PAGE_SIZE;
				continue;
			}
		}
		if (bytesToRead != 0) {
			// read the part before the current page (or the end of the request)
			PrecacheIO* io = new(std::nothrow) PrecacheIO(ref, lastOffset,
				bytesToRead);
			if (io == NULL || io->Prepare(&reservation) != B_OK) {
				delete io;
				break;
			}

			// we must not have the cache locked during I/O
			cache->Unlock();
			io->ReadAsync();
			cache->Lock();

			bytesToRead = 0;
		}

		if (size == 0) {
			// we have reached the end of the request
			break;
		}

		lastOffset = offset;
	}

	cache->ReleaseRefAndUnlock();
	vm_page_unreserve_pages(&reservation);
}

extern "C" void
cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
{
	// ToDo: schedule prefetch

	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));

	// get the vnode for the object, this also grabs a ref to it
	struct vnode* vnode;
	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
		return;

	cache_prefetch_vnode(vnode, offset, size);
	vfs_put_vnode(vnode);
}

extern "C" void
cache_node_opened(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t parentID, ino_t vnodeID, const char* name)
{
	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
		return;

	off_t size = -1;
	if (cache != NULL) {
		file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
		if (ref != NULL)
			size = cache->virtual_end;
	}

	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
		size);
}


extern "C" void
cache_node_closed(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t vnodeID)
{
	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
		return;

	int32 accessType = 0;
	if (cache != NULL) {
		// ToDo: set accessType
	}

	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
}


extern "C" void
cache_node_launched(size_t argCount, char* const* args)
{
	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
		return;

	sCacheModule->node_launched(argCount, args);
}


extern "C" status_t
file_cache_init_post_boot_device(void)
{
	// ToDo: get cache module out of driver settings

	if (get_module("file_cache/launch_speedup/v1",
			(module_info**)&sCacheModule) == B_OK) {
		dprintf("** opened launch speedup: %" B_PRId64 "\n", system_time());
	}

	return B_OK;
}


extern "C" status_t
file_cache_init(void)
{
	// allocate a clean page we can use for writing zeroes
	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, 1, VM_PRIORITY_SYSTEM);
	vm_page* page = vm_page_allocate_page(&reservation,
		PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
	vm_page_unreserve_pages(&reservation);

	sZeroPage = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;

	for (uint32 i = 0; i < kZeroVecCount; i++) {
		sZeroVecs[i].base = sZeroPage;
		sZeroVecs[i].length = B_PAGE_SIZE;
	}

	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
	return B_OK;
}

// #pragma mark - public FS API

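// Typical file system usage of this API (a sketch only; the node/volume
// fields below are hypothetical, not taken from an actual file system):
// create the cache when a node is published and route the read/write hooks
// through it, e.g.:
//
//	node->cache = file_cache_create(volume->id, node->id, node->size);
//	...
//	file_cache_read(node->cache, cookie, pos, buffer, &length);
//	file_cache_set_size(node->cache, newSize);	// on truncate
//	file_cache_delete(node->cache);				// when the node goes away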
extern "C" void*
file_cache_create(dev_t mountID, ino_t vnodeID, off_t size)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld)\n",
		mountID, vnodeID, size));

	file_cache_ref* ref = new file_cache_ref;
	if (ref == NULL)
		return NULL;

	memset(ref->last_access, 0, sizeof(ref->last_access));
	ref->last_access_index = 0;
	ref->disabled_count = 0;

	// TODO: delay VMCache creation until data is
	// requested/written for the first time? Listing lots of
	// files in Tracker (and elsewhere) could be slowed down.
	// Since the file_cache_ref itself doesn't have a lock,
	// we would need to "rent" one during construction, possibly
	// the vnode lock, maybe a dedicated one.
	// As there shouldn't be too much contention, we could also
	// use atomic_test_and_set(), and free the resources again
	// when that fails...

	// Get the vnode for the object
	// (note, this does not grab a reference to the node)
	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
		goto err1;

	// Gets (usually creates) the cache for the node
	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
		goto err1;

	ref->cache->virtual_end = size;
	((VMVnodeCache*)ref->cache)->SetFileCacheRef(ref);
	return ref;

err1:
	delete ref;
	return NULL;
}

extern "C" void
file_cache_delete(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	ref->cache->ReleaseRef();
	delete ref;
}


extern "C" void
file_cache_enable(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	if (ref->disabled_count == 0) {
		panic("Unbalanced file_cache_enable()!");
		return;
	}

	ref->disabled_count--;
}


extern "C" status_t
file_cache_disable(void* _cacheRef)
{
	// TODO: This function only removes all pages from the cache and prevents
	// that the file cache functions add any new ones until re-enabled. The
	// VM (on page fault) can still add pages, if the file is mmap()ed. We
	// should mark the cache to prevent shared mappings of the file and fix
	// the page fault code to deal correctly with private mappings (i.e. only
	// insert pages in consumer caches).

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	// If already disabled, there's nothing to do for us.
	if (ref->disabled_count > 0) {
		ref->disabled_count++;
		return B_OK;
	}

	// The file cache is not yet disabled. We need to evict all cached pages.
	status_t error = ref->cache->FlushAndRemoveAllPages();
	if (error != B_OK)
		return error;

	ref->disabled_count++;
	return B_OK;
}

extern "C" bool
file_cache_is_enabled(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	AutoLocker<VMCache> _(ref->cache);

	return ref->disabled_count == 0;
}


extern "C" status_t
file_cache_set_size(void* _cacheRef, off_t newSize)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, newSize));

	if (ref == NULL)
		return B_OK;

	VMCache* cache = ref->cache;
	AutoLocker<VMCache> _(cache);

	off_t oldSize = cache->virtual_end;
	status_t status = cache->Resize(newSize, VM_PRIORITY_USER);
		// Note, the priority doesn't really matter, since this cache doesn't
		// reserve any memory.
	if (status == B_OK && newSize < oldSize) {
		// We may have a new partial page at the end of the cache that must be
		// cleared.
		uint32 partialBytes = newSize % B_PAGE_SIZE;
		if (partialBytes != 0) {
			vm_page* page = cache->LookupPage(newSize - partialBytes);
			if (page != NULL) {
				vm_memset_physical(page->physical_page_number * B_PAGE_SIZE
					+ partialBytes, 0, B_PAGE_SIZE - partialBytes);
			}
		}
	}

	return status;
}


extern "C" status_t
file_cache_sync(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	if (ref == NULL)
		return B_BAD_VALUE;

	return ref->cache->WriteModified();
}

extern "C" status_t
file_cache_read(void* _cacheRef, void* cookie, off_t offset, void* buffer,
	size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
		ref, offset, buffer, *_size));

	if (ref->disabled_count > 0) {
		// Caching is disabled -- read directly from the file.
		generic_io_vec vec;
		vec.base = (addr_t)buffer;
		generic_size_t size = vec.length = *_size;
		status_t error = vfs_read_pages(ref->vnode, cookie, offset, &vec, 1, 0,
			&size);
		*_size = size;
		return error;
	}

	return cache_io(ref, cookie, offset, (addr_t)buffer, _size, false);
}


extern "C" status_t
file_cache_write(void* _cacheRef, void* cookie, off_t offset,
	const void* buffer, size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	if (ref->disabled_count > 0) {
		// Caching is disabled -- write directly to the file.

		if (buffer != NULL) {
			generic_io_vec vec;
			vec.base = (addr_t)buffer;
			generic_size_t size = vec.length = *_size;

			status_t error = vfs_write_pages(ref->vnode, cookie, offset, &vec,
				1, 0, &size);
			*_size = size;
			return error;
		}

		// NULL buffer -- use a dummy buffer to write zeroes
		size_t size = *_size;
		while (size > 0) {
			size_t toWrite = min_c(size, kZeroVecSize);
			generic_size_t written = toWrite;
			status_t error = vfs_write_pages(ref->vnode, cookie, offset,
				sZeroVecs, kZeroVecCount, B_PHYSICAL_IO_REQUEST, &written);
			if (error != B_OK)
				return error;
			if (written == 0)
				break;

			offset += written;
			size -= written;
		}

		*_size -= size;
		return B_OK;
	}

	status_t status = cache_io(ref, cookie, offset,
		(addr_t)const_cast<void*>(buffer), _size, true);

	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu)"
		" = %ld\n", ref, offset, buffer, *_size, status));

	return status;
}