#define _SYSTEM

#include <assert.h>
#include <string.h>
#include <errno.h>
#include <math.h>
#include <stdlib.h>

#include <machine/vmparam.h>

#include <sys/param.h>
#include <sys/mman.h>

#include <minix/dmap.h>
#include <minix/libminixfs.h>
#include <minix/syslib.h>
#include <minix/sysutil.h>
#include <minix/u64.h>
#include <minix/bdev.h>
#include <minix/bitmap.h>

#include "inc.h"
/* Buffer (block) cache. To acquire a block, a routine calls lmfs_get_block(),
 * telling which block it wants. The block is then regarded as "in use" and
 * has its reference count incremented. All the blocks that are not in use are
 * chained together in an LRU list, with 'front' pointing to the least recently
 * used block, and 'rear' to the most recently used block. A reverse chain is
 * also maintained. Usage for LRU is measured by the time the put_block() is
 * done. The second parameter to put_block() can violate the LRU order and put
 * a block on the front of the list, if it will probably not be needed again.
 * This is used internally only; the lmfs_put_block() API call has no second
 * parameter. If a block is modified, the modifying routine must mark the
 * block as dirty, so the block will eventually be rewritten to the disk.
 */
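/* Illustrative usage only (not part of the original source): a typical caller
 * obtains a block, modifies its contents, marks it dirty, and releases it.
 * 'dev' and 'b' stand for an arbitrary device and block number.
 *
 *    struct buf *bp;
 *    int r;
 *
 *    if ((r = lmfs_get_block(&bp, dev, b, NORMAL)) != OK)
 *        return r;
 *    memset(bp->data, 0, lmfs_fs_block_size());
 *    lmfs_markdirty(bp);
 *    lmfs_put_block(bp);
 */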
/* Flags to put_block(). */
#define ONE_SHOT	0x1	/* set if block will not be needed again */

#define BUFHASH(b)	((unsigned int)((b) % nr_bufs))
#define MARKCLEAN	lmfs_markclean

#define MINBUFS		6	/* minimal no of bufs for sanity check */

static struct buf *front;	/* points to least recently used free block */
static struct buf *rear;	/* points to most recently used free block */
static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/

static void rm_lru(struct buf *bp);
static int read_block(struct buf *bp, size_t size);
static void freeblock(struct buf *bp);
static void cache_heuristic_check(void);
static void put_block(struct buf *bp, int put_flags);

static int vmcache = 0; /* are we using vm's secondary cache? (initially not) */

static struct buf *buf;
static struct buf **buf_hash;	/* the buffer hash table */
static unsigned int nr_bufs;
static int may_use_vmcache;

static size_t fs_block_size = PAGE_SIZE;	/* raw i/o block size */

static fsblkcnt_t fs_btotal = 0, fs_bused = 0;

static int quiet = 0;

typedef struct buf *noxfer_buf_ptr_t; /* annotation for temporary buf ptrs */

void lmfs_setquiet(int q) { quiet = q; }
static int fs_bufs_heuristic(int minbufs, fsblkcnt_t btotal,
    fsblkcnt_t bused, int blocksize)
{
    struct vm_stats_info vsi;
    int bufs;
    u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax;
    u32_t kbytes_remain_mem;

    /* Set a reasonable cache size; cache at most a certain portion of the
     * used FS, and at most a certain percentage of remaining memory.
     */
    if(vm_info_stats(&vsi) != OK) {
        bufs = 1024;
        if(!quiet)
            printf("fslib: heuristic info fail: default to %d bufs\n", bufs);
        return bufs;
    }

    /* Remaining free memory is unused memory plus memory in use for the
     * cache, as the cache can be evicted.
     */
    kbytes_remain_mem = (u64_t)(vsi.vsi_free + vsi.vsi_cached) *
        vsi.vsi_pagesize / 1024;

    /* Check FS usage. */
    kbytes_used_fs = (unsigned long)(((u64_t)bused * blocksize) / 1024);
    kbytes_total_fs = (unsigned long)(((u64_t)btotal * blocksize) / 1024);

    /* Heuristic for a desired cache size based on FS usage;
     * but never bigger than half of the total file system.
     */
    kb_fsmax = sqrt_approx(kbytes_used_fs)*40;
    kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2);

    /* Heuristic for a maximum usage: 10% of remaining memory. */
    kbcache = MIN(kbytes_remain_mem/10, kb_fsmax);
    bufs = kbcache * 1024 / blocksize;

    /* But we simply need MINBUFS no matter what. */
    if(bufs < minbufs)
        bufs = minbufs;

    return bufs;
}
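/* A rough worked example of the heuristic above (illustrative numbers only,
 * not from the original source): with 256 MB of free plus cached memory,
 * 1 GB of a 1.5 GB file system in use, and 4 kB blocks, kbytes_used_fs is
 * 1048576, so kb_fsmax is about 1024 * 40 = 40960 kB (well under half the
 * file system). The memory cap is 262144 / 10 = 26214 kB, which is smaller
 * and therefore wins, giving 26214 * 1024 / 4096 = 6553 buffers.
 */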
void lmfs_change_blockusage(int delta)
{
    /* Change the number of allocated blocks by 'delta.'
     * Also accumulate the delta since the last cache re-evaluation.
     * If it is outside a certain band, ask the cache library to
     * re-evaluate the cache size.
     */
    static int bitdelta = 0, warn_low = TRUE, warn_high = TRUE;

    /* Adjust the file system block usage counter accordingly. Do bounds
     * checking, and report file system misbehavior.
     */
    if (delta > 0 && (fsblkcnt_t)delta > fs_btotal - fs_bused) {
        if (warn_high) {
            printf("libminixfs: block usage overflow\n");
            warn_high = FALSE;
        }
        delta = (int)(fs_btotal - fs_bused);
    } else if (delta < 0 && (fsblkcnt_t)-delta > fs_bused) {
        if (warn_low) {
            printf("libminixfs: block usage underflow\n");
            warn_low = FALSE;
        }
        delta = -(int)fs_bused;
    }
    fs_bused += delta;

    bitdelta += delta;

#define BAND_KB (10*1024)	/* recheck cache every 10MB change */

    /* If the accumulated delta exceeds the configured threshold, resize
     * the cache, but only if the cache isn't in use any more. In order to
     * avoid that the latter case blocks a resize forever, we also call
     * this function from lmfs_flushall(). Since lmfs_buf_pool() may call
     * lmfs_flushall(), reset 'bitdelta' before doing the heuristics check.
     */
    if (bufs_in_use == 0 &&
        (bitdelta*(int)fs_block_size/1024 > BAND_KB ||
        bitdelta*(int)fs_block_size/1024 < -BAND_KB)) {
        bitdelta = 0;
        cache_heuristic_check();
    }
}
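/* For illustration (the block size is an assumption, not from the original
 * source): with 4 kB blocks, BAND_KB corresponds to a net change of
 * 10240 / 4 = 2560 allocated or freed blocks before the cache size is
 * re-evaluated, and then only when no buffers are held at that moment.
 */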
void lmfs_markdirty(struct buf *bp)
{
    bp->lmfs_flags |= VMMC_DIRTY;
}

void lmfs_markclean(struct buf *bp)
{
    bp->lmfs_flags &= ~VMMC_DIRTY;
}

int lmfs_isclean(struct buf *bp)
{
    return !(bp->lmfs_flags & VMMC_DIRTY);
}

static void free_unused_blocks(void)
{
    struct buf *bp;

    int freed = 0, bytes = 0;
    printf("libminixfs: freeing; %d blocks in use\n", bufs_in_use);
    for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
        if(bp->lmfs_bytes > 0 && bp->lmfs_count == 0) {
            freed++;
            bytes += bp->lmfs_bytes;
            freeblock(bp);
        }
    }
    printf("libminixfs: freeing; %d blocks, %d bytes\n", freed, bytes);
}
static void lmfs_alloc_block(struct buf *bp, size_t block_size)
{
    ASSERT(!bp->data);
    ASSERT(bp->lmfs_bytes == 0);

    if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
        MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
        free_unused_blocks();
        if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
            MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
            panic("libminixfs: could not allocate block");
        }
    }
    assert(bp->data);
    bp->lmfs_bytes = block_size;
    bp->lmfs_needsetcache = 1;
}
/*===========================================================================*
 *                              lmfs_get_block                               *
 *===========================================================================*/
int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how)
{
    return lmfs_get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0);
}

static void munmap_t(void *a, int len)
{
    assert(a);
    assert(a != MAP_FAILED);
    assert(!((vir_bytes)a % PAGE_SIZE));
    assert(len > 0);

    len = roundup(len, PAGE_SIZE);

    assert(!(len % PAGE_SIZE));

    if(munmap(a, len) < 0)
        panic("libminixfs cache: munmap failed");
}
static void raisecount(struct buf *bp)
{
    ASSERT(bp->lmfs_count < CHAR_MAX);
    bp->lmfs_count++;
    if(bp->lmfs_count == 1) bufs_in_use++;
    assert(bufs_in_use > 0);
}

static void lowercount(struct buf *bp)
{
    assert(bufs_in_use > 0);
    ASSERT(bp->lmfs_count > 0);
    bp->lmfs_count--;
    if(bp->lmfs_count == 0) bufs_in_use--;
}

static void freeblock(struct buf *bp)
{
    ASSERT(bp->lmfs_count == 0);
    /* If the block taken is dirty, make it clean by writing it to the disk.
     * Avoid hysteresis by flushing all other dirty blocks for the same device.
     */
    if (bp->lmfs_dev != NO_DEV) {
        if (!lmfs_isclean(bp)) lmfs_flushdev(bp->lmfs_dev);
        assert(bp->lmfs_bytes > 0);
        bp->lmfs_dev = NO_DEV;
    }

    /* Fill in block's parameters and add it to the hash chain where it goes. */
    MARKCLEAN(bp);		/* NO_DEV blocks may be marked dirty */
    if(bp->lmfs_bytes > 0) {
        assert(bp->data);
        munmap_t(bp->data, bp->lmfs_bytes);
        bp->lmfs_bytes = 0;
        bp->data = NULL;
    } else assert(!bp->data);
}
/*===========================================================================*
 *                              find_block                                   *
 *===========================================================================*/
static struct buf *find_block(dev_t dev, block64_t block)
{
/* Search the hash chain for (dev, block). Return the buffer structure if
 * found, or NULL otherwise.
 */
    struct buf *bp;
    int b;

    assert(dev != NO_DEV);

    b = BUFHASH(block);
    for (bp = buf_hash[b]; bp != NULL; bp = bp->lmfs_hash)
        if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev)
            return bp;

    return NULL;
}
/*===========================================================================*
 *                              get_block_ino                                *
 *===========================================================================*/
static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
    ino_t ino, u64_t ino_off, size_t block_size)
{
/* Check to see if the requested block is in the block cache. The requested
 * block is identified by the block number in 'block' on device 'dev', counted
 * in the file system block size. The amount of data requested for this block
 * is given in 'block_size', which may be less than the file system block size
 * iff the requested block is the last (partial) block on a device. Note that
 * the given block size does *not* affect the conversion of 'block' to a byte
 * offset! Either way, if the block could be obtained, either from the cache
 * or by reading from the device, return OK, with a pointer to the buffer
 * structure stored in 'bpp'. If not, return a negative error code (and no
 * buffer). If necessary, evict some other block and fetch the contents from
 * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to
 * overwrite the requested block in its entirety, so it is only necessary to
 * see if it is in the cache; if it is not, any free buffer will do. If 'how'
 * is PEEK, the function returns the block if it is in the cache or the VM
 * cache, and an ENOENT error code otherwise.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
    int b, r;
    static struct buf *bp;
    uint64_t dev_off;
    struct buf *prev_ptr;

    assert(buf_hash);
    assert(buf);
    assert(nr_bufs > 0);

    ASSERT(fs_block_size > 0);

    assert(dev != NO_DEV);

    assert(block <= UINT64_MAX / fs_block_size);

    dev_off = block * fs_block_size;

    if((ino_off % fs_block_size)) {
        printf("cache: unaligned lmfs_get_block_ino ino_off %llu\n",
            ino_off);
        util_stacktrace();
    }

    /* See if the block is in the cache. If so, we can return it right away. */
    bp = find_block(dev, block);
    if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) {
        ASSERT(bp->lmfs_dev == dev);
        ASSERT(bp->lmfs_dev != NO_DEV);

        /* The block must have exactly the requested number of bytes. */
        if (bp->lmfs_bytes != block_size)
            return EIO;

        /* Block needed has been found. */
        if (bp->lmfs_count == 0) {
            rm_lru(bp);
            ASSERT(bp->lmfs_needsetcache == 0);
            ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
            /* FIXME: race condition against the VMMC_EVICTED check */
            bp->lmfs_flags |= VMMC_BLOCK_LOCKED;
        }
        raisecount(bp);
        ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
        ASSERT(bp->data);

        if(ino != VMC_NO_INODE) {
            if(bp->lmfs_inode == VMC_NO_INODE
                || bp->lmfs_inode != ino
                || bp->lmfs_inode_offset != ino_off) {
                bp->lmfs_inode = ino;
                bp->lmfs_inode_offset = ino_off;
                bp->lmfs_needsetcache = 1;
            }
        }

        *bpp = bp;
        return OK;
    }
    /* We had the block in the cache but VM evicted it; invalidate it. */
    if (bp != NULL) {
        assert(bp->lmfs_flags & VMMC_EVICTED);
        ASSERT(bp->lmfs_count == 0);
        ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
        ASSERT(!(bp->lmfs_flags & VMMC_DIRTY));
        bp->lmfs_dev = NO_DEV;
        bp->lmfs_bytes = 0;
        bp->data = NULL;
    }

    /* Desired block is not on available chain. Find a free block to use. */
    if(bp) {
        ASSERT(bp->lmfs_flags & VMMC_EVICTED);
    } else {
        if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);
    }
    assert(bp);

    rm_lru(bp);

    /* Remove the block that was just taken from its hash chain. */
    b = BUFHASH(bp->lmfs_blocknr);
    prev_ptr = buf_hash[b];
    if (prev_ptr == bp) {
        buf_hash[b] = bp->lmfs_hash;
    } else {
        /* The block just taken is not on the front of its hash chain. */
        while (prev_ptr->lmfs_hash != NULL)
            if (prev_ptr->lmfs_hash == bp) {
                prev_ptr->lmfs_hash = bp->lmfs_hash;	/* found it */
                break;
            } else {
                prev_ptr = prev_ptr->lmfs_hash;	/* keep looking */
            }
    }

    freeblock(bp);

    bp->lmfs_inode = ino;
    bp->lmfs_inode_offset = ino_off;

    bp->lmfs_flags = VMMC_BLOCK_LOCKED;
    bp->lmfs_needsetcache = 0;
    bp->lmfs_dev = dev;		/* fill in device number */
    bp->lmfs_blocknr = block;	/* fill in block number */
    ASSERT(bp->lmfs_count == 0);
    raisecount(bp);
    b = BUFHASH(bp->lmfs_blocknr);
    bp->lmfs_hash = buf_hash[b];

    buf_hash[b] = bp;		/* add to hash list */
    assert(dev != NO_DEV);

    /* The block is not found in our cache, but we do want it if it's in the VM
     * cache. The exception is NO_READ, purely for context switching performance
     * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being
     * prefetched, and 3) blocks about to be fully overwritten. In the first two
     * cases, VM will not have the block in its cache anyway, and for the third
     * we save on one VM call only if the block is in the VM cache.
     */
    assert(!bp->data);
    assert(!bp->lmfs_bytes);
    if (how != NO_READ && vmcache) {
        if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off,
            &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) {
            bp->lmfs_bytes = block_size;
            ASSERT(!bp->lmfs_needsetcache);
            *bpp = bp;
            return OK;
        }
    }
    bp->data = NULL;

    /* The block is not in the cache, and VM does not know about it. If we were
     * requested to search for the block only, we can now return failure to the
     * caller. Return the block to the pool without allocating data pages, since
     * these would be freed upon recycling the block anyway.
     */
    if (how == PEEK) {
        bp->lmfs_dev = NO_DEV;

        put_block(bp, ONE_SHOT);

        return ENOENT;
    }

    /* Not in the cache; reserve memory for its contents. */
    lmfs_alloc_block(bp, block_size);

    assert(bp->data);

    if (how == NORMAL) {
        /* Try to read the block. Return an error code on failure. */
        if ((r = read_block(bp, block_size)) != OK) {
            put_block(bp, 0);

            return r;
        }
    } else if(how == NO_READ) {
        /* This block will be overwritten by new contents. */
    } else
        panic("unexpected 'how' value: %d", how);

    assert(bp->data);

    *bpp = bp;			/* return the newly acquired block */
    return OK;
}
/*===========================================================================*
 *                              lmfs_get_block_ino                           *
 *===========================================================================*/
int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
    ino_t ino, u64_t ino_off)
{
    return get_block_ino(bpp, dev, block, how, ino, ino_off, fs_block_size);
}
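/* Illustrative only (not part of the original source): a file system with
 * memory-mapping support would pass the owning inode number and the block's
 * byte offset within that file, so that VM can associate the cached page with
 * the file. 'ino' and 'ino_off' are hypothetical values taken from the FS
 * request; ino_off must be a multiple of the block size (see the unaligned
 * check in get_block_ino()).
 *
 *    struct buf *bp;
 *    int r;
 *
 *    if ((r = lmfs_get_block_ino(&bp, dev, b, NORMAL, ino, ino_off)) != OK)
 *        return r;
 *    ... read or modify bp->data, calling lmfs_markdirty(bp) if modified ...
 *    lmfs_put_block(bp);
 */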
/*===========================================================================*
 *                              lmfs_get_partial_block                       *
 *===========================================================================*/
int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block,
    int how, size_t block_size)
{
    return get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0, block_size);
}
/*===========================================================================*
 *                              put_block                                    *
 *===========================================================================*/
static void put_block(struct buf *bp, int put_flags)
{
/* Return a block to the list of available blocks. Depending on 'put_flags'
 * it may be put on the front or rear of the LRU chain. Blocks that are
 * expected to be needed again at some point go on the rear; blocks that are
 * unlikely to be needed again at all go on the front.
 */
    dev_t dev;
    uint64_t dev_off;
    int r, setflags;

    assert(bp != NULL);

    dev = bp->lmfs_dev;

    dev_off = bp->lmfs_blocknr * fs_block_size;

    lowercount(bp);
    if (bp->lmfs_count != 0) return;	/* block is still in use */

    /* Put this block back on the LRU chain. */
    if (dev == NO_DEV || dev == DEV_RAM || (put_flags & ONE_SHOT)) {
        /* Block will not be needed again. Put it on front of chain.
         * It will be the next block to be evicted from the cache.
         */
        bp->lmfs_prev = NULL;
        bp->lmfs_next = front;
        if (front == NULL)
            rear = bp;		/* LRU chain was empty */
        else
            front->lmfs_prev = bp;
        front = bp;
    }
    else {
        /* Block may be needed again. Put it on rear of chain.
         * It will not be evicted from the cache for a long time.
         */
        bp->lmfs_prev = rear;
        bp->lmfs_next = NULL;
        if (rear == NULL)
            front = bp;
        else
            rear->lmfs_next = bp;
        rear = bp;
    }

    assert(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
    bp->lmfs_flags &= ~VMMC_BLOCK_LOCKED;

    /* Block has sensible content - if necessary, identify it to VM. */
    if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
        assert(bp->data);

        setflags = (put_flags & ONE_SHOT) ? VMSF_ONCE : 0;

        if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
            bp->lmfs_inode_offset, &bp->lmfs_flags,
            roundup(bp->lmfs_bytes, PAGE_SIZE), setflags)) != OK) {
            if(r == ENOSYS) {
                printf("libminixfs: ENOSYS, disabling VM calls\n");
                vmcache = 0;
            } else if (r == ENOMEM) {
                /* Do not panic in this case. Running out of memory is
                 * bad, especially since it may lead to applications
                 * crashing when trying to access memory-mapped pages
                 * we haven't been able to pass off to the VM cache,
                 * but the entire file system crashing is always worse.
                 */
                printf("libminixfs: no memory for cache block!\n");
            } else {
                panic("libminixfs: setblock of %p dev 0x%llx off "
                    "0x%llx failed\n", bp->data, dev, dev_off);
            }
        }
    }
    bp->lmfs_needsetcache = 0;

    /* Now that we (may) have given the block to VM, invalidate the block if it
     * is a one-shot block. Otherwise, it may still be reobtained immediately
     * after, which could be a problem if VM already forgot the block and we are
     * expected to pass it to VM again, which then wouldn't happen.
     */
    if (put_flags & ONE_SHOT)
        bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
 *                              lmfs_put_block                               *
 *===========================================================================*/
void lmfs_put_block(struct buf *bp)
{
/* User interface to put_block(). */

    if (bp == NULL) return;	/* for poorly written file systems */

    put_block(bp, 0);
}
/*===========================================================================*
 *                              lmfs_free_block                              *
 *===========================================================================*/
void lmfs_free_block(dev_t dev, block64_t block)
{
/* The file system has just freed the given block. The block may previously
 * have been in use as data block for an inode. Therefore, we now need to tell
 * VM that the block is no longer associated with an inode. If we fail to do so
 * and the inode now has a hole at this location, mapping in the hole would
 * yield the old block contents rather than a zeroed page. In addition, if the
 * block is in the cache, it will be removed, even if it was dirty.
 */
    struct buf *bp;
    int r;

    /* Tell VM to forget about the block. The primary purpose of this call is to
     * break the inode association, but since the block is part of a mounted file
     * system, it is not expected to be accessed directly anyway. So, save some
     * cache memory by throwing it out of the VM cache altogether.
     */
    if (vmcache) {
        if ((r = vm_forget_cacheblock(dev, block * fs_block_size,
            fs_block_size)) != OK)
            printf("libminixfs: vm_forget_cacheblock failed (%d)\n", r);
    }

    if ((bp = find_block(dev, block)) != NULL) {
        lmfs_markclean(bp);

        /* Invalidate the block. The block may or may not be in use right now,
         * so don't be smart about freeing memory or repositioning in the LRU.
         */
        bp->lmfs_dev = NO_DEV;
    }

    /* Note that this is *not* the right place to implement TRIM support. Even
     * though the block is freed, on the device it may still be part of a
     * previous checkpoint or snapshot of some sort. Only the file system can
     * be trusted to decide which blocks can be reused on the device!
     */
}
/*===========================================================================*
 *                              lmfs_zero_block_ino                          *
 *===========================================================================*/
void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
{
/* Files may have holes. From an application perspective, these are just file
 * regions filled with zeroes. From a file system perspective however, holes
 * may represent unallocated regions on disk. Thus, these holes do not have
 * corresponding blocks on the disk, and therefore also no block number.
 * Therefore, we cannot simply use lmfs_get_block_ino() for them. For reads,
 * this is not a problem, since the file system can just zero out the target
 * application buffer instead. For mapped pages however, this *is* a problem,
 * since the VM cache needs to be told about the corresponding block, and VM
 * does not accept blocks without a device offset. The role of this function is
 * therefore to tell VM about the hole using a fake device offset. The device
 * offsets are picked so that the VM cache will see a block memory-mapped for
 * the hole in the file, while the same block is not visible when
 * memory-mapping the block device.
 */
    struct buf *bp;
    static block64_t fake_block = 0;
    int r;

    if (!vmcache)
        return;

    assert(fs_block_size > 0);

    /* Pick a block number which is above the threshold of what can possibly be
     * mapped in by mmap'ing the device, since off_t is signed, and it is safe to
     * say that it will take a while before we have 8-exabyte devices. Pick a
     * different block number each time to avoid possible concurrency issues.
     * FIXME: it does not seem like VM actually verifies mmap offsets though..
     */
    if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size)
        fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size;

    /* Obtain a block. */
    if ((r = lmfs_get_block_ino(&bp, dev, fake_block, NO_READ, ino,
        ino_off)) != OK)
        panic("libminixfs: getting a NO_READ block failed: %d", r);
    assert(bp != NULL);
    assert(bp->lmfs_dev != NO_DEV);

    /* The block is already zeroed, as it has just been allocated with mmap. File
     * systems do not rely on this assumption yet, so if VM ever gets changed to
     * not clear the blocks we allocate (e.g., by recycling pages in the VM cache
     * for the same process, which would be safe), we need to add a memset here.
     */

    /* Release the block. We don't expect it to be accessed ever again. Moreover,
     * if we keep the block around in the VM cache, it may erroneously be mapped
     * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM.
     * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
     * all such pages in its cache.
     */
    put_block(bp, ONE_SHOT);
}
void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused)
{
    assert(bused <= btotal);
    fs_btotal = btotal;
    fs_bused = bused;

    /* If the cache isn't in use, we could resize it. */
    if (bufs_in_use == 0)
        cache_heuristic_check();
}
/*===========================================================================*
 *                              read_block                                   *
 *===========================================================================*/
static int read_block(struct buf *bp, size_t block_size)
{
/* Read a disk block of 'block_size' bytes. The given size is always the FS
 * block size, except for the last block of a device. If an I/O error occurs,
 * invalidate the block and return an error code.
 */
    ssize_t r;
    off_t pos;
    dev_t dev = bp->lmfs_dev;

    assert(dev != NO_DEV);

    ASSERT(bp->lmfs_bytes == block_size);
    ASSERT(fs_block_size > 0);

    pos = (off_t)bp->lmfs_blocknr * fs_block_size;
    if (block_size > PAGE_SIZE) {
#define MAXPAGES 20
        vir_bytes blockrem, vaddr = (vir_bytes) bp->data;
        int p = 0;
        static iovec_t iovec[MAXPAGES];
        blockrem = block_size;
        while(blockrem > 0) {
            vir_bytes chunk = blockrem >= PAGE_SIZE ? PAGE_SIZE : blockrem;
            iovec[p].iov_addr = vaddr;
            iovec[p].iov_size = chunk;
            vaddr += chunk;
            blockrem -= chunk;
            p++;
        }
        r = bdev_gather(dev, pos, iovec, p, BDEV_NOFLAGS);
    } else {
        r = bdev_read(dev, pos, bp->data, block_size, BDEV_NOFLAGS);
    }
    if (r != (ssize_t)block_size) {
        /* Aesthetics: do not report EOF errors on superblock reads, because
         * this is a fairly common occurrence, e.g. during system installation.
         */
        if (bp->lmfs_blocknr != 0 /*first block*/ || r != 0 /*EOF*/)
            printf("fs cache: I/O error on device %d/%d, block %"PRIu64
                " (%zd)\n", major(dev), minor(dev), bp->lmfs_blocknr, r);

        if (r >= 0)
            r = EIO; /* TODO: retry retrieving (just) the remaining part */

        bp->lmfs_dev = NO_DEV;	/* invalidate block */

        return r;
    }

    return OK;
}
/*===========================================================================*
 *                              lmfs_invalidate                              *
 *===========================================================================*/
void lmfs_invalidate(
    dev_t device		/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

    register struct buf *bp;

    assert(device != NO_DEV);

    for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
        if (bp->lmfs_dev == device) {
            assert(bp->data);
            assert(bp->lmfs_bytes > 0);
            munmap_t(bp->data, bp->lmfs_bytes);
            bp->lmfs_dev = NO_DEV;
            bp->lmfs_bytes = 0;
            bp->data = NULL;
        }
    }

    /* Clear the cache even if VM caching is disabled for the file system:
     * caching may be disabled as side effect of an error, leaving blocks behind
     * in the actual VM cache.
     */
    vm_clear_cache(device);
}
/*===========================================================================*
 *                              sort_blocks                                  *
 *===========================================================================*/
static void sort_blocks(struct buf **bufq, unsigned int bufqsize)
{
    struct buf *bp;
    int i, j, gap;

    gap = 1;
    do
        gap = 3 * gap + 1;
    while ((unsigned int)gap <= bufqsize);

    while (gap != 1) {
        gap /= 3;
        for (j = gap; (unsigned int)j < bufqsize; j++) {
            for (i = j - gap; i >= 0 &&
                bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr;
                i -= gap) {
                bp = bufq[i];
                bufq[i] = bufq[i + gap];
                bufq[i + gap] = bp;
            }
        }
    }
}
/*===========================================================================*
 *                              rw_scattered                                 *
 *===========================================================================*/
static void rw_scattered(
    dev_t dev,			/* major-minor device number */
    struct buf **bufq,		/* pointer to array of buffers */
    unsigned int bufqsize,	/* number of buffers */
    int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

    register struct buf *bp;
    register iovec_t *iop;
    static iovec_t iovec[NR_IOREQS];
    off_t pos;
    unsigned int i, iov_per_block;
#if !defined(NDEBUG)
    unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize;
#endif /* !defined(NDEBUG) */

    if(bufqsize == 0) return;

#if !defined(NDEBUG)
    /* For READING, check that all buffers on the list are obtained and held
     * (count > 0).
     */
    if (rw_flag == READING) {
        assert(bufqsize <= LMFS_MAX_PREFETCH);

        for(i = 0; i < bufqsize; i++) {
            assert(bufq[i] != NULL);
            assert(bufq[i]->lmfs_count > 0);
        }

        /* Therefore they are all 'in use' and must be at least this many. */
        assert(start_in_use >= start_bufqsize);
    }

    assert(dev != NO_DEV);
    assert(fs_block_size > 0);
    assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS);
#endif /* !defined(NDEBUG) */

    /* For WRITING, (Shell) sort buffers on lmfs_blocknr.
     * For READING, the buffers are already sorted.
     */
    if (rw_flag == WRITING)
        sort_blocks(bufq, bufqsize);
    /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
     * went fine, otherwise the error code for the first failed transfer.
     */
    while (bufqsize > 0) {
        unsigned int p, nblocks = 0, niovecs = 0;
        int r;
        for (iop = iovec; nblocks < bufqsize; nblocks++) {
            vir_bytes vdata, blockrem;
            bp = bufq[nblocks];
            if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks)
                break;
            blockrem = bp->lmfs_bytes;
            iov_per_block = howmany(blockrem, PAGE_SIZE);
            if (niovecs > NR_IOREQS - iov_per_block) break;
            vdata = (vir_bytes) bp->data;
            for(p = 0; p < iov_per_block; p++) {
                vir_bytes chunk =
                    blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE;
                iop->iov_addr = vdata;
                iop->iov_size = chunk;
                vdata += PAGE_SIZE;
                blockrem -= chunk;
                iop++;
                niovecs++;
            }
            assert(p == iov_per_block);
            assert(blockrem == 0);
        }

        assert(nblocks > 0);
        assert(niovecs > 0 && niovecs <= NR_IOREQS);

        pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size;
        if (rw_flag == READING)
            r = bdev_gather(dev, pos, iovec, niovecs, BDEV_NOFLAGS);
        else
            r = bdev_scatter(dev, pos, iovec, niovecs, BDEV_NOFLAGS);

        /* Harvest the results. The driver may have returned an error, or it
         * may have done less than what we asked for.
         */
        if (r < 0) {
            printf("fs cache: I/O error %d on device %d/%d, "
                "block %"PRIu64"\n",
                r, major(dev), minor(dev), bufq[0]->lmfs_blocknr);
        }
        for (i = 0; i < nblocks; i++) {
            bp = bufq[i];
            if (r < (ssize_t)bp->lmfs_bytes) {
                /* Transfer failed. */
                if (i == 0) {
                    bp->lmfs_dev = NO_DEV;	/* Invalidate block */
                }
                break;
            }
            if (rw_flag == READING) {
                lmfs_put_block(bp);
            } else {
                MARKCLEAN(bp);
            }
            r -= bp->lmfs_bytes;
        }

        bufq += i;
        bufqsize -= i;

        if (rw_flag == READING) {
            /* Don't bother reading more than the device is willing to
             * give at this time. Don't forget to release those extras.
             */
            while (bufqsize > 0) {
                bp = *bufq++;
                bp->lmfs_dev = NO_DEV;	/* invalidate block */
                lmfs_put_block(bp);
                bufqsize--;
            }
        }
        if (rw_flag == WRITING && i == 0) {
            /* We're not making progress, this means we might keep
             * looping. Buffers remain dirty if un-written. Buffers are
             * lost if invalidate()d or LRU-removed while dirty. This
             * is better than keeping unwritable blocks around forever..
             */
            break;
        }
    }

#if !defined(NDEBUG)
    if(rw_flag == READING) {
        assert(start_in_use >= start_bufqsize);

        /* READING callers assume all bufs are released. */
        assert(start_in_use - start_bufqsize == bufs_in_use);
    }
#endif /* !defined(NDEBUG) */
}
/*===========================================================================*
 *                              lmfs_readahead                               *
 *===========================================================================*/
void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks,
    size_t last_size)
{
/* Read ahead 'nblocks' blocks starting from the block 'base_block' on device
 * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH,
 * inclusive. All blocks have the file system's block size, possibly except the
 * last block in the range, which is of size 'last_size'. The caller must
 * ensure that none of the blocks in the range are already in the cache.
 * However, the caller must also not rely on all or even any of the blocks to
 * be present in the cache afterwards--failures are (deliberately!) ignored.
 */
    static noxfer_buf_ptr_t bufq[LMFS_MAX_PREFETCH]; /* static for size only */
    struct buf *bp;
    unsigned int count;
    int r;

    assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH);

    for (count = 0; count < nblocks; count++) {
        if (count == nblocks - 1)
            r = lmfs_get_partial_block(&bp, dev, base_block + count,
                NO_READ, last_size);
        else
            r = lmfs_get_block(&bp, dev, base_block + count, NO_READ);

        if (r != OK)
            break;

        /* We could add a flag that makes the get_block() calls fail if the
         * block is already in the cache, but it is not a major concern if it
         * is: we just perform a useless read in that case. However, if the
         * block is cached *and* dirty, we are about to lose its new contents.
         */
        assert(lmfs_isclean(bp));

        bufq[count] = bp;
    }

    rw_scattered(dev, bufq, count, READING);
}
/*===========================================================================*
 *                              lmfs_readahead_limit                         *
 *===========================================================================*/
unsigned int lmfs_readahead_limit(void)
{
/* Return the maximum number of blocks that should be read ahead at once. The
 * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive.
 */
    unsigned int max_transfer, max_bufs;

    /* The returned value is the minimum of two factors: the maximum number of
     * blocks that can be transferred in a single I/O gather request (see how
     * rw_scattered() generates I/O requests), and a policy limit on the number
     * of buffers that any read-ahead operation may use (that is, thrash).
     */
    max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1);

    /* The constants have been imported from MFS as is, and may need tuning. */
    if (nr_bufs < 50)
        max_bufs = 18;
    else
        max_bufs = nr_bufs - 4;

    return MIN(max_transfer, max_bufs);
}
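/* Worked example (illustrative; it assumes NR_IOREQS is 64, which may differ
 * on a given build): with 4 kB pages and an 8 kB file system block size,
 * max_transfer is 64 / 2 = 32 blocks. With a pool of 1024 buffers, max_bufs
 * is 1020, so the read-ahead limit would be 32 blocks per operation.
 */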
/*===========================================================================*
 *                              lmfs_prefetch                                *
 *===========================================================================*/
void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks)
{
/* The given set of blocks is expected to be needed soon, so prefetch a
 * convenient subset. The blocks are expected to be sorted by likelihood of
 * being accessed soon, making the first block of the set the most important
 * block to prefetch right now. The caller must have made sure that the blocks
 * are not in the cache already. The array may have duplicate block numbers.
 */
    bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
    bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
    block64_t block, base_block;
    unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks;

    if (nblocks == 0)
        return;

    /* Here is the deal. We are going to prefetch one range only, because seeking
     * is too expensive for just prefetching. The range we select should at least
     * include the first ("base") block of the given set, since that is the block
     * the caller is primarily interested in. Thus, the rest of the range is
     * going to have to be directly around this base block. We first check which
     * blocks from the set fall just before and after the base block, which then
     * allows us to construct a contiguous range of desired blocks directly
     * around the base block, in O(n) time. As a natural part of this, we ignore
     * duplicate blocks in the given set. We then read from the beginning of this
     * range, in order to maximize the chance that a next prefetch request will
     * continue from the last disk position without requiring a seek. However, we
     * do correct for the maximum number of blocks we can (or should) read in at
     * once, such that we will still end up reading the base block.
     */
    base_block = blockset[0];

    memset(blocks_before, 0, sizeof(blocks_before));
    memset(blocks_after, 0, sizeof(blocks_after));

    for (i = 1; i < nblocks; i++) {
        block = blockset[i];

        if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) {
            bit = base_block - block - 1;
            assert(bit < LMFS_MAX_PREFETCH);
            SET_BIT(blocks_before, bit);
        } else if (block > base_block &&
            block - LMFS_MAX_PREFETCH <= base_block) {
            bit = block - base_block - 1;
            assert(bit < LMFS_MAX_PREFETCH);
            SET_BIT(blocks_after, bit);
        }
    }

    for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++)
        if (!GET_BIT(blocks_before, nr_before))
            break;

    for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++)
        if (!GET_BIT(blocks_after, nr_after))
            break;

    /* The number of blocks to prefetch is the minimum of two factors: the number
     * of blocks in the range around the base block, and the maximum number of
     * blocks that should be read ahead at once at all.
     */
    span = nr_before + 1 + nr_after;
    limit = lmfs_readahead_limit();

    nr_blocks = MIN(span, limit);
    assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH);

    /* Start prefetching from the lowest block within the contiguous range, but
     * make sure that we read at least the original base block itself, too.
     */
    base_block -= MIN(nr_before, nr_blocks - 1);

    lmfs_readahead(dev, base_block, nr_blocks, fs_block_size);
}
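/* Worked example (illustrative only): for blockset {100, 98, 99, 103}, the
 * base block is 100. Blocks 98 and 99 set bits 1 and 0 of blocks_before, and
 * block 103 sets bit 2 of blocks_after, so nr_before = 2 and nr_after = 0
 * (block 101 is absent). The span is therefore 3, and with a sufficiently
 * large read-ahead limit, the prefetch reads blocks 98, 99 and 100, which
 * includes the base block.
 */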
/*===========================================================================*
 *                              lmfs_flushdev                                *
 *===========================================================================*/
void lmfs_flushdev(dev_t dev)
{
/* Flush all dirty blocks for one device. */

    register struct buf *bp;
    static noxfer_buf_ptr_t *dirty;
    static unsigned int dirtylistsize = 0;
    unsigned int ndirty;

    if(dirtylistsize != nr_bufs) {
        if(dirtylistsize > 0) {
            assert(dirty != NULL);
            free(dirty);
        }
        if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
            panic("couldn't allocate dirty buf list");
        dirtylistsize = nr_bufs;
    }

    for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
        /* Do not flush dirty blocks that are in use (lmfs_count>0): the file
         * system may mark the block as dirty before changing its contents, in
         * which case the new contents could end up being lost.
         */
        if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) {
            dirty[ndirty++] = bp;
        }
    }

    rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *                              rm_lru                                       *
 *===========================================================================*/
static void rm_lru(struct buf *bp)
{
/* Remove a block from its LRU chain. */
    struct buf *next_ptr, *prev_ptr;

    next_ptr = bp->lmfs_next;	/* successor on LRU chain */
    prev_ptr = bp->lmfs_prev;	/* predecessor on LRU chain */
    if (prev_ptr != NULL)
        prev_ptr->lmfs_next = next_ptr;
    else
        front = next_ptr;	/* this block was at front of chain */

    if (next_ptr != NULL)
        next_ptr->lmfs_prev = prev_ptr;
    else
        rear = prev_ptr;	/* this block was at rear of chain */
}
/*===========================================================================*
 *                              cache_resize                                 *
 *===========================================================================*/
static void cache_resize(size_t blocksize, unsigned int bufs)
{
    struct buf *bp;

    assert(blocksize > 0);
    assert(bufs >= MINBUFS);

    for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
        if(bp->lmfs_count != 0) panic("change blocksize with buffer in use");

    lmfs_buf_pool(bufs);

    fs_block_size = blocksize;
}

static void cache_heuristic_check(void)
{
    int bufs, d;

    bufs = fs_bufs_heuristic(MINBUFS, fs_btotal, fs_bused, fs_block_size);

    /* Set the cache to the new heuristic size if the new one
     * is more than 10% off from the current one.
     */
    d = bufs-nr_bufs;
    if(d < 0) d = -d;
    if(d*100/nr_bufs > 10) {
        cache_resize(fs_block_size, bufs);
    }
}
/*===========================================================================*
 *                              lmfs_set_blocksize                           *
 *===========================================================================*/
void lmfs_set_blocksize(size_t new_block_size)
{
    cache_resize(new_block_size, MINBUFS);
    cache_heuristic_check();

    /* Decide whether to use the secondary cache or not.
     * Only do this if the block size is a multiple of the page size, and using
     * the VM cache has been enabled for this FS.
     */
    vmcache = 0;

    if(may_use_vmcache && !(new_block_size % PAGE_SIZE))
        vmcache = 1;
}
/*===========================================================================*
 *                              lmfs_buf_pool                                *
 *===========================================================================*/
void lmfs_buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
    register struct buf *bp;

    assert(new_nr_bufs >= MINBUFS);

    if(nr_bufs > 0) {
        assert(buf);
        lmfs_flushall();
        for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
            if(bp->data) {
                assert(bp->lmfs_bytes > 0);
                munmap_t(bp->data, bp->lmfs_bytes);
            }
        }
    }

    if(buf)
        free(buf);

    if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
        panic("couldn't allocate buf list (%d)", new_nr_bufs);

    if(buf_hash)
        free(buf_hash);
    if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
        panic("couldn't allocate buf hash list (%d)", new_nr_bufs);

    nr_bufs = new_nr_bufs;

    bufs_in_use = 0;
    front = &buf[0];
    rear = &buf[nr_bufs - 1];

    for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
        bp->lmfs_blocknr = NO_BLOCK;
        bp->lmfs_dev = NO_DEV;
        bp->lmfs_next = bp + 1;
        bp->lmfs_prev = bp - 1;
        bp->data = NULL;
        bp->lmfs_bytes = 0;
    }
    front->lmfs_prev = NULL;
    rear->lmfs_next = NULL;

    for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next;
    buf_hash[0] = front;
}
void lmfs_flushall(void)
{
    struct buf *bp;
    for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
        if(bp->lmfs_dev != NO_DEV && !lmfs_isclean(bp))
            lmfs_flushdev(bp->lmfs_dev);

    /* This is the moment where it is least likely (although certainly not
     * impossible!) that there are buffers in use, since buffers should not
     * be held across file system syncs. See if we already intended to
     * resize the buffer cache, but couldn't. Be aware that we may be
     * called indirectly from within lmfs_change_blockusage(), so care must
     * be taken not to recurse infinitely. TODO: see if it is better to
     * resize the cache from here *only*, thus guaranteeing a clean cache.
     */
    lmfs_change_blockusage(0);
}
size_t lmfs_fs_block_size(void)
{
    return fs_block_size;
}

void lmfs_may_use_vmcache(int ok)
{
    may_use_vmcache = ok;
}