#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include <machine/vmparam.h>

#include <sys/param.h>
#include <sys/mman.h>

#include <minix/dmap.h>
#include <minix/libminixfs.h>
#include <minix/syslib.h>
#include <minix/sysutil.h>
#include <minix/u64.h>
#include <minix/bdev.h>
#include <minix/bitmap.h>
/* Buffer (block) cache.  To acquire a block, a routine calls lmfs_get_block(),
 * telling which block it wants.  The block is then regarded as "in use" and
 * has its reference count incremented.  All the blocks that are not in use are
 * chained together in an LRU list, with 'front' pointing to the least recently
 * used block, and 'rear' to the most recently used block.  A reverse chain is
 * also maintained.  Usage for LRU is measured by the time the put_block() is
 * done.  The second parameter to put_block() can violate the LRU order and put
 * a block on the front of the list, if it will probably not be needed again.
 * This is used internally only; the lmfs_put_block() API call has no second
 * parameter.  If a block is modified, the modifying routine must mark the
 * block as dirty, so the block will eventually be rewritten to the disk.
 */

/* Flags to put_block(). */
#define ONE_SHOT	0x1	/* set if block will not be needed again */

#define BUFHASH(b)	((unsigned int)((b) % nr_bufs))
#define MARKCLEAN	lmfs_markclean

#define MINBUFS		6	/* minimum number of bufs for sanity check */
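/* Illustrative sketch (not part of the cache itself): this is roughly how a
 * file server using libminixfs is expected to drive the calls described
 * above for a read-modify-write of one block.  The helper name and the way
 * the block is modified are made up for the example.
 */
#if 0
static int example_update_block(dev_t dev, block64_t blocknr)
{
  struct buf *bp;
  int r;

  /* Acquire the block; NORMAL means "read it from disk if not cached". */
  if ((r = lmfs_get_block(&bp, dev, blocknr, NORMAL)) != OK)
	return r;

  memset(bp->data, 0, lmfs_fs_block_size());	/* modify the contents */

  lmfs_markdirty(bp);	/* must be marked dirty to be written back later */
  lmfs_put_block(bp);	/* release; the block goes to the rear of the LRU */

  return OK;
}
#endif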
static struct buf *front;	/* points to least recently used free block */
static struct buf *rear;	/* points to most recently used free block */
static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/

static void rm_lru(struct buf *bp);
static int read_block(struct buf *bp, size_t size);
static void freeblock(struct buf *bp);
static void cache_heuristic_check(void);
static void put_block(struct buf *bp, int put_flags);

static int vmcache = 0; /* are we using vm's secondary cache? (initially not) */

static struct buf *buf;
static struct buf **buf_hash;	/* the buffer hash table */
static unsigned int nr_bufs;
static int may_use_vmcache;

static size_t fs_block_size = PAGE_SIZE;	/* raw i/o block size */

static fsblkcnt_t fs_btotal = 0, fs_bused = 0;

static int quiet = 0;	/* suppress diagnostics (see lmfs_setquiet) */

typedef struct buf *noxfer_buf_ptr_t; /* annotation for temporary buf ptrs */
void lmfs_setquiet(int q) { quiet = q; }

static int fs_bufs_heuristic(int minbufs, fsblkcnt_t btotal,
	fsblkcnt_t bused, int blocksize)
{
  struct vm_stats_info vsi;
  int bufs;
  u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax;
  u32_t kbytes_remain_mem;

  /* set a reasonable cache size; cache at most a certain
   * portion of the used FS, and at most a certain %age of remaining
   * memory
   */
  if(vm_info_stats(&vsi) != OK) {
	bufs = 1024;
	if(!quiet)
		printf("fslib: heuristic info fail: default to %d bufs\n", bufs);
	return bufs;
  }
  /* remaining free memory is unused memory plus memory in use for cache,
   * as the cache can be evicted
   */
  kbytes_remain_mem = (u64_t)(vsi.vsi_free + vsi.vsi_cached) *
	vsi.vsi_pagesize / 1024;

  kbytes_used_fs  = (unsigned long)(((u64_t)bused * blocksize) / 1024);
  kbytes_total_fs = (unsigned long)(((u64_t)btotal * blocksize) / 1024);

  /* heuristic for a desired cache size based on FS usage;
   * but never bigger than half of the total filesystem
   */
  kb_fsmax = sqrt_approx(kbytes_used_fs)*40;
  kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2);

  /* heuristic for a maximum usage - 10% of remaining memory */
  kbcache = MIN(kbytes_remain_mem/10, kb_fsmax);
  bufs = kbcache * 1024 / blocksize;

  /* but we simply need MINBUFS no matter what */
  if(bufs < minbufs)
	bufs = minbufs;

  return bufs;
}
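/* A rough worked example of the heuristic above (the numbers are illustrative
 * only, and sqrt_approx() is rounded here): with a 2 GiB file system of which
 * 512 MiB is in use, 256 MiB of free+cached memory, and 4 kB blocks:
 *
 *   kbytes_used_fs       = 524288 kB, kbytes_total_fs = 2097152 kB
 *   kb_fsmax             = sqrt_approx(524288) * 40 ~= 724 * 40 = 28960 kB,
 *                          well under kbytes_total_fs/2
 *   kbytes_remain_mem/10 = 26214 kB, so kbcache = 26214 kB
 *   bufs                 = 26214 * 1024 / 4096 ~= 6553 buffers
 */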
void lmfs_change_blockusage(int delta)
{
  /* Change the number of allocated blocks by 'delta.'
   * Also accumulate the delta since the last cache re-evaluation.
   * If it is outside a certain band, ask the cache library to
   * re-evaluate the cache size.
   */
  static int bitdelta = 0, warn_low = TRUE, warn_high = TRUE;
  /* Adjust the file system block usage counter accordingly. Do bounds
   * checking, and report file system misbehavior.
   */
  if (delta > 0 && (fsblkcnt_t)delta > fs_btotal - fs_bused) {
	if (warn_high) {
		printf("libminixfs: block usage overflow\n");
		warn_high = FALSE;
	}
	delta = (int)(fs_btotal - fs_bused);
  } else if (delta < 0 && (fsblkcnt_t)-delta > fs_bused) {
	if (warn_low) {
		printf("libminixfs: block usage underflow\n");
		warn_low = FALSE;
	}
	delta = -(int)fs_bused;
  }
  fs_bused += delta;

  bitdelta += delta;

#define BAND_KB (10*1024)	/* recheck cache every 10MB change */
  /* If the accumulated delta exceeds the configured threshold, resize
   * the cache, but only if the cache isn't in use any more. In order to
   * avoid that the latter case blocks a resize forever, we also call
   * this function from lmfs_flushall(). Since lmfs_buf_pool() may call
   * lmfs_flushall(), reset 'bitdelta' before doing the heuristics check.
   */
  if (bufs_in_use == 0 &&
	(bitdelta*(int)fs_block_size/1024 > BAND_KB ||
	bitdelta*(int)fs_block_size/1024 < -BAND_KB)) {
	bitdelta = 0;
	cache_heuristic_check();
  }
}
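/* Illustrative sketch of how a file system is expected to feed these usage
 * counters (the helper names are made up): report the totals once at mount
 * time, and report every allocation and free as it happens.
 */
#if 0
void example_mount(fsblkcnt_t zones_total, fsblkcnt_t zones_used)
{
  lmfs_set_blockusage(zones_total, zones_used);
}

void example_alloc_zone(void)
{
  lmfs_change_blockusage(+1);	/* one more block in use */
}

void example_free_zone(void)
{
  lmfs_change_blockusage(-1);	/* one block returned */
}
#endif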
void lmfs_markdirty(struct buf *bp)
{
  bp->lmfs_flags |= VMMC_DIRTY;
}

void lmfs_markclean(struct buf *bp)
{
  bp->lmfs_flags &= ~VMMC_DIRTY;
}

int lmfs_isclean(struct buf *bp)
{
  return !(bp->lmfs_flags & VMMC_DIRTY);
}
static void free_unused_blocks(void)
{
  struct buf *bp;
  int freed = 0, bytes = 0;

  printf("libminixfs: freeing; %d blocks in use\n", bufs_in_use);
  for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	if(bp->lmfs_bytes > 0 && bp->lmfs_count == 0) {
		freed++;
		bytes += bp->lmfs_bytes;
		freeblock(bp);
	}
  }
  printf("libminixfs: freeing; %d blocks, %d bytes\n", freed, bytes);
}
static void lmfs_alloc_block(struct buf *bp, size_t block_size)
{
  ASSERT(bp->lmfs_bytes == 0);

  if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
	MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
	free_unused_blocks();
	if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
		MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
		panic("libminixfs: could not allocate block");
	}
  }

  bp->lmfs_bytes = block_size;
  bp->lmfs_needsetcache = 1;
}
/*===========================================================================*
 *				lmfs_get_block				     *
 *===========================================================================*/
int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how)
{
  return lmfs_get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0);
}
static void munmap_t(void *a, int len)
{
  assert(a != MAP_FAILED);
  assert(!((vir_bytes)a % PAGE_SIZE));

  len = roundup(len, PAGE_SIZE);

  assert(!(len % PAGE_SIZE));

  if(munmap(a, len) < 0)
	panic("libminixfs cache: munmap failed");
}
static void raisecount(struct buf *bp)
{
  ASSERT(bp->lmfs_count < CHAR_MAX);
  bp->lmfs_count++;
  if(bp->lmfs_count == 1) bufs_in_use++;
  assert(bufs_in_use > 0);
}

static void lowercount(struct buf *bp)
{
  assert(bufs_in_use > 0);
  ASSERT(bp->lmfs_count > 0);
  bp->lmfs_count--;
  if(bp->lmfs_count == 0) bufs_in_use--;
}
static void freeblock(struct buf *bp)
{
  ASSERT(bp->lmfs_count == 0);
  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->lmfs_dev != NO_DEV) {
	if (!lmfs_isclean(bp)) lmfs_flushdev(bp->lmfs_dev);
	assert(bp->lmfs_bytes > 0);
	bp->lmfs_dev = NO_DEV;
  }

  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);		/* NO_DEV blocks may be marked dirty */
  if(bp->lmfs_bytes > 0) {
	munmap_t(bp->data, bp->lmfs_bytes);
	bp->data = NULL;
	bp->lmfs_bytes = 0;
  } else assert(!bp->data);
}
/*===========================================================================*
 *				find_block				     *
 *===========================================================================*/
static struct buf *find_block(dev_t dev, block64_t block)
{
/* Search the hash chain for (dev, block). Return the buffer structure if
 * found, or NULL otherwise.
 */
  struct buf *bp;
  unsigned int b;

  assert(dev != NO_DEV);

  b = BUFHASH(block);
  for (bp = buf_hash[b]; bp != NULL; bp = bp->lmfs_hash)
	if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev)
		return bp;

  return NULL;
}
/*===========================================================================*
 *				get_block_ino				     *
 *===========================================================================*/
static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
	ino_t ino, u64_t ino_off, size_t block_size)
{
/* Check to see if the requested block is in the block cache. The requested
 * block is identified by the block number in 'block' on device 'dev', counted
 * in the file system block size. The amount of data requested for this block
 * is given in 'block_size', which may be less than the file system block size
 * iff the requested block is the last (partial) block on a device. Note that
 * the given block size does *not* affect the conversion of 'block' to a byte
 * offset! Either way, if the block could be obtained, either from the cache
 * or by reading from the device, return OK, with a pointer to the buffer
 * structure stored in 'bpp'. If not, return a negative error code (and no
 * buffer). If necessary, evict some other block and fetch the contents from
 * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to
 * overwrite the requested block in its entirety, so it is only necessary to
 * see if it is in the cache; if it is not, any free buffer will do. If 'how'
 * is PEEK, the function returns the block if it is in the cache or the VM
 * cache, and an ENOENT error code otherwise.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
  int b, r;
  static struct buf *bp;
  u64_t dev_off;
  struct buf *prev_ptr;

  ASSERT(fs_block_size > 0);

  assert(dev != NO_DEV);

  assert(block <= UINT64_MAX / fs_block_size);

  dev_off = block * fs_block_size;

  if((ino_off % fs_block_size)) {
	printf("cache: unaligned lmfs_get_block_ino ino_off %llu\n",
		ino_off);
  }
  /* See if the block is in the cache. If so, we can return it right away. */
  bp = find_block(dev, block);
  if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) {
	ASSERT(bp->lmfs_dev == dev);
	ASSERT(bp->lmfs_dev != NO_DEV);

	/* The block must have exactly the requested number of bytes. */
	if (bp->lmfs_bytes != block_size)
		return EIO;

	/* Block needed has been found. */
	if (bp->lmfs_count == 0) {
		rm_lru(bp);
		ASSERT(bp->lmfs_needsetcache == 0);
		ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
		/* FIXME: race condition against the VMMC_EVICTED check */
		bp->lmfs_flags |= VMMC_BLOCK_LOCKED;
	}
	raisecount(bp);
	ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED);

	if(ino != VMC_NO_INODE) {
		if(bp->lmfs_inode == VMC_NO_INODE
		|| bp->lmfs_inode != ino
		|| bp->lmfs_inode_offset != ino_off) {
			bp->lmfs_inode = ino;
			bp->lmfs_inode_offset = ino_off;
			bp->lmfs_needsetcache = 1;
		}
	}

	*bpp = bp;
	return OK;
  } else if (bp != NULL) {
	/* We had the block in the cache but VM evicted it; invalidate it. */
	assert(bp->lmfs_flags & VMMC_EVICTED);
	ASSERT(bp->lmfs_count == 0);
	ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
	ASSERT(!(bp->lmfs_flags & VMMC_DIRTY));
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_bytes = 0;
	bp->data = NULL;
  }
  /* Desired block is not on available chain. Find a free block to use. */
  if(bp) {
	ASSERT(bp->lmfs_flags & VMMC_EVICTED);
  } else {
	if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);
  }
  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->lmfs_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
	buf_hash[b] = bp->lmfs_hash;
  } else {
	/* The block just taken is not on the front of its hash chain. */
	while (prev_ptr->lmfs_hash != NULL)
		if (prev_ptr->lmfs_hash == bp) {
			prev_ptr->lmfs_hash = bp->lmfs_hash;	/* found it */
			break;
		} else {
			prev_ptr = prev_ptr->lmfs_hash;	/* keep looking */
		}
  }
  bp->lmfs_inode = ino;
  bp->lmfs_inode_offset = ino_off;

  bp->lmfs_flags = VMMC_BLOCK_LOCKED;
  bp->lmfs_needsetcache = 0;
  bp->lmfs_dev = dev;		/* fill in device number */
  bp->lmfs_blocknr = block;	/* fill in block number */
  ASSERT(bp->lmfs_count == 0);

  b = BUFHASH(bp->lmfs_blocknr);
  bp->lmfs_hash = buf_hash[b];

  buf_hash[b] = bp;	/* add to hash list */

  assert(dev != NO_DEV);
  /* The block is not found in our cache, but we do want it if it's in the VM
   * cache. The exception is NO_READ, purely for context switching performance
   * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being
   * prefetched, and 3) blocks about to be fully overwritten. In the first two
   * cases, VM will not have the block in its cache anyway, and for the third
   * we save on one VM call only if the block is in the VM cache.
   */
  assert(!bp->lmfs_bytes);
  if (how != NO_READ && vmcache) {
	if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off,
	    &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) {
		bp->lmfs_bytes = block_size;
		ASSERT(!bp->lmfs_needsetcache);
		*bpp = bp;
		return OK;
	}
  }
  /* The block is not in the cache, and VM does not know about it. If we were
   * requested to search for the block only, we can now return failure to the
   * caller. Return the block to the pool without allocating data pages, since
   * these would be freed upon recycling the block anyway.
   */
  if (how == PEEK) {
	bp->lmfs_dev = NO_DEV;

	put_block(bp, ONE_SHOT);

	return ENOENT;
  }

  /* Not in the cache; reserve memory for its contents. */
  lmfs_alloc_block(bp, block_size);

  if (how == NORMAL) {
	/* Try to read the block. Return an error code on failure. */
	if ((r = read_block(bp, block_size)) != OK) {
		put_block(bp, 0);

		return r;
	}
  } else if(how == NO_READ) {
	/* This block will be overwritten by new contents. */
  } else
	panic("unexpected 'how' value: %d", how);

  *bpp = bp;		/* return the newly acquired block */

  return OK;
}
/*===========================================================================*
 *				lmfs_get_block_ino			     *
 *===========================================================================*/
int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
	ino_t ino, u64_t ino_off)
{
  return get_block_ino(bpp, dev, block, how, ino, ino_off, fs_block_size);
}
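/* Illustrative sketch (hypothetical caller, not part of this library): a file
 * system that maps file pages typically first PEEKs for a block, so that a
 * cached or VM-cached copy is reused, and only falls back to a NORMAL get,
 * which reads from the device, when the peek returns ENOENT.
 */
#if 0
static int example_get_file_block(dev_t dev, block64_t blocknr,
	ino_t ino, u64_t file_off, struct buf **bpp)
{
  int r;

  r = lmfs_get_block_ino(bpp, dev, blocknr, PEEK, ino, file_off);
  if (r == ENOENT)
	r = lmfs_get_block_ino(bpp, dev, blocknr, NORMAL, ino, file_off);
  return r;
}
#endif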
/*===========================================================================*
 *				lmfs_get_partial_block			     *
 *===========================================================================*/
int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block,
	int how, size_t block_size)
{
  return get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0, block_size);
}
/*===========================================================================*
 *				put_block				     *
 *===========================================================================*/
static void put_block(struct buf *bp, int put_flags)
{
/* Return a block to the list of available blocks.   Depending on 'put_flags'
 * it may be put on the front or rear of the LRU chain.  Blocks that are
 * expected to be needed again at some point go on the rear; blocks that are
 * unlikely to be needed again at all go on the front.
 */
  dev_t dev;
  u64_t dev_off;
  int r, setflags;

  assert(bp != NULL);

  dev = bp->lmfs_dev;

  dev_off = bp->lmfs_blocknr * fs_block_size;

  lowercount(bp);
  if (bp->lmfs_count != 0) return;	/* block is still in use */

  /* Put this block back on the LRU chain.  */
  if (dev == NO_DEV || dev == DEV_RAM || (put_flags & ONE_SHOT)) {
	/* Block will not be needed again. Put it on front of chain.
	 * It will be the next block to be evicted from the cache.
	 */
	bp->lmfs_prev = NULL;
	bp->lmfs_next = front;
	if (front == NULL)
		rear = bp;	/* LRU chain was empty */
	else
		front->lmfs_prev = bp;
	front = bp;
  } else {
	/* Block may be needed again.  Put it on rear of chain.
	 * It will not be evicted from the cache for a long time.
	 */
	bp->lmfs_prev = rear;
	bp->lmfs_next = NULL;
	if (rear == NULL)
		front = bp;
	else
		rear->lmfs_next = bp;
	rear = bp;
  }

  assert(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
  bp->lmfs_flags &= ~VMMC_BLOCK_LOCKED;
  /* block has sensible content - if necessary, identify it to VM */
  if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
	setflags = (put_flags & ONE_SHOT) ? VMSF_ONCE : 0;

	if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
	    bp->lmfs_inode_offset, &bp->lmfs_flags,
	    roundup(bp->lmfs_bytes, PAGE_SIZE), setflags)) != OK) {
		if (r == ENOSYS) {
			printf("libminixfs: ENOSYS, disabling VM calls\n");
			vmcache = 0;
		} else if (r == ENOMEM) {
			/* Do not panic in this case. Running out of memory is
			 * bad, especially since it may lead to applications
			 * crashing when trying to access memory-mapped pages
			 * we haven't been able to pass off to the VM cache,
			 * but the entire file system crashing is always worse.
			 */
			printf("libminixfs: no memory for cache block!\n");
		} else {
			panic("libminixfs: setblock of %p dev 0x%llx off "
				"0x%llx failed\n", bp->data, dev, dev_off);
		}
	}
  }
  bp->lmfs_needsetcache = 0;

  /* Now that we (may) have given the block to VM, invalidate the block if it
   * is a one-shot block. Otherwise, it may still be reobtained immediately
   * after, which could be a problem if VM already forgot the block and we are
   * expected to pass it to VM again, which then wouldn't happen.
   */
  if (put_flags & ONE_SHOT)
	bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
 *				lmfs_put_block				     *
 *===========================================================================*/
void lmfs_put_block(struct buf *bp)
{
/* User interface to put_block(). */

  if (bp == NULL) return;	/* for poorly written file systems */

  put_block(bp, 0);
}
/*===========================================================================*
 *				lmfs_free_block				     *
 *===========================================================================*/
void lmfs_free_block(dev_t dev, block64_t block)
{
/* The file system has just freed the given block. The block may previously
 * have been in use as data block for an inode. Therefore, we now need to tell
 * VM that the block is no longer associated with an inode. If we fail to do so
 * and the inode now has a hole at this location, mapping in the hole would
 * yield the old block contents rather than a zeroed page. In addition, if the
 * block is in the cache, it will be removed, even if it was dirty.
 */
  struct buf *bp;
  int r;

  /* Tell VM to forget about the block. The primary purpose of this call is to
   * break the inode association, but since the block is part of a mounted file
   * system, it is not expected to be accessed directly anyway. So, save some
   * cache memory by throwing it out of the VM cache altogether.
   */
  if (vmcache) {
	if ((r = vm_forget_cacheblock(dev, block * fs_block_size,
	    fs_block_size)) != OK)
		printf("libminixfs: vm_forget_cacheblock failed (%d)\n", r);
  }

  if ((bp = find_block(dev, block)) != NULL) {
	/* Invalidate the block. The block may or may not be in use right now,
	 * so don't be smart about freeing memory or repositioning in the LRU.
	 */
	bp->lmfs_dev = NO_DEV;
  }

  /* Note that this is *not* the right place to implement TRIM support. Even
   * though the block is freed, on the device it may still be part of a
   * previous checkpoint or snapshot of some sort. Only the file system can
   * be trusted to decide which blocks can be reused on the device!
   */
}
/*===========================================================================*
 *				lmfs_zero_block_ino			     *
 *===========================================================================*/
void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
{
/* Files may have holes. From an application perspective, these are just file
 * regions filled with zeroes. From a file system perspective however, holes
 * may represent unallocated regions on disk. Thus, these holes do not have
 * corresponding blocks on the disk, and therefore also no block number.
 * Therefore, we cannot simply use lmfs_get_block_ino() for them. For reads,
 * this is not a problem, since the file system can just zero out the target
 * application buffer instead. For mapped pages however, this *is* a problem,
 * since the VM cache needs to be told about the corresponding block, and VM
 * does not accept blocks without a device offset. The role of this function is
 * therefore to tell VM about the hole using a fake device offset. The device
 * offsets are picked so that the VM cache will see a block memory-mapped for
 * the hole in the file, while the same block is not visible when
 * memory-mapping the block device.
 */
  struct buf *bp;
  static block64_t fake_block = 0;
  int r;

  assert(fs_block_size > 0);

  /* Pick a block number which is above the threshold of what can possibly be
   * mapped in by mmap'ing the device, since off_t is signed, and it is safe to
   * say that it will take a while before we have 8-exabyte devices. Pick a
   * different block number each time to avoid possible concurrency issues.
   * FIXME: it does not seem like VM actually verifies mmap offsets though..
   */
  if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size)
	fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size;

  /* Obtain a block. */
  if ((r = lmfs_get_block_ino(&bp, dev, fake_block, NO_READ, ino,
	ino_off)) != OK)
	panic("libminixfs: getting a NO_READ block failed: %d", r);

  assert(bp->lmfs_dev != NO_DEV);

  /* The block is already zeroed, as it has just been allocated with mmap. File
   * systems do not rely on this assumption yet, so if VM ever gets changed to
   * not clear the blocks we allocate (e.g., by recycling pages in the VM cache
   * for the same process, which would be safe), we need to add a memset here.
   */

  /* Release the block. We don't expect it to be accessed ever again. Moreover,
   * if we keep the block around in the VM cache, it may erroneously be mapped
   * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM.
   * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
   * all such pages in its cache.
   */
  put_block(bp, ONE_SHOT);
}
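/* Worked example of the fake block numbers above, assuming 4 kB blocks: the
 * first fake block is ((uint64_t)INT64_MAX + 1) / 4096 = 2^63 / 2^12 = 2^51
 * = 2251799813685248. Multiplying back by the block size gives a device
 * offset of 2^63 bytes, just past the largest positive off_t, so no mmap of
 * the block device itself can ever reach it.
 */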
void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused)
{
  assert(bused <= btotal);
  fs_btotal = btotal;
  fs_bused = bused;

  /* if the cache isn't in use, we could resize it. */
  if (bufs_in_use == 0)
	cache_heuristic_check();
}
/*===========================================================================*
 *				read_block				     *
 *===========================================================================*/
static int read_block(struct buf *bp, size_t block_size)
{
/* Read a disk block of 'block_size' bytes. The given size is always the FS
 * block size, except for the last block of a device. If an I/O error occurs,
 * invalidate the block and return an error code.
 */
  ssize_t r;
  off_t pos;
  dev_t dev = bp->lmfs_dev;

  assert(dev != NO_DEV);

  ASSERT(bp->lmfs_bytes == block_size);
  ASSERT(fs_block_size > 0);

  pos = (off_t)bp->lmfs_blocknr * fs_block_size;
  if (block_size > PAGE_SIZE) {
#define MAXPAGES 20
	vir_bytes blockrem, vaddr = (vir_bytes) bp->data;
	int p = 0;
	static iovec_t iovec[MAXPAGES];
	blockrem = block_size;
	while(blockrem > 0) {
		vir_bytes chunk = blockrem >= PAGE_SIZE ? PAGE_SIZE : blockrem;
		iovec[p].iov_addr = vaddr;
		iovec[p].iov_size = chunk;
		vaddr += chunk;
		blockrem -= chunk;
		p++;
	}
	r = bdev_gather(dev, pos, iovec, p, BDEV_NOFLAGS);
  } else {
	r = bdev_read(dev, pos, bp->data, block_size, BDEV_NOFLAGS);
  }
  if (r != (ssize_t)block_size) {
	/* Aesthetics: do not report EOF errors on superblock reads, because
	 * this is a fairly common occurrence, e.g. during system installation.
	 */
	if (bp->lmfs_blocknr != 0 /*first block*/ || r != 0 /*EOF*/)
		printf("fs cache: I/O error on device %d/%d, block %"PRIu64
		    " (%zd)\n", major(dev), minor(dev), bp->lmfs_blocknr, r);

	r = EIO; /* TODO: retry retrieving (just) the remaining part */

	bp->lmfs_dev = NO_DEV;	/* invalidate block */

	return r;
  }

  return OK;
}
/*===========================================================================*
 *				lmfs_invalidate				     *
 *===========================================================================*/
void lmfs_invalidate(
  dev_t device			/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;

  assert(device != NO_DEV);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	if (bp->lmfs_dev == device) {
		assert(bp->lmfs_bytes > 0);
		munmap_t(bp->data, bp->lmfs_bytes);
		bp->lmfs_dev = NO_DEV;
		bp->data = NULL;
		bp->lmfs_bytes = 0;
	}
  }

  /* Clear the cache even if VM caching is disabled for the file system:
   * caching may be disabled as side effect of an error, leaving blocks behind
   * in the actual VM cache.
   */
  vm_clear_cache(device);
}
/*===========================================================================*
 *				sort_blocks				     *
 *===========================================================================*/
static void sort_blocks(struct buf **bufq, unsigned int bufqsize)
{
/* (Shell) sort the buffer array on block number, ascending. */
  struct buf *bp;
  int i, j, gap;

  gap = 1;
  do
	gap = 3 * gap + 1;
  while ((unsigned int)gap <= bufqsize);

  while (gap != 1) {
	gap /= 3;
	for (j = gap; (unsigned int)j < bufqsize; j++) {
		for (i = j - gap; i >= 0 &&
		    bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr;
		    i -= gap) {
			bp = bufq[i];
			bufq[i] = bufq[i + gap];
			bufq[i + gap] = bp;
		}
	}
  }
}
/*===========================================================================*
 *				rw_scattered				     *
 *===========================================================================*/
static void rw_scattered(
  dev_t dev,			/* major-minor device number */
  struct buf **bufq,		/* pointer to array of buffers */
  unsigned int bufqsize,	/* number of buffers */
  int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  register iovec_t *iop;
  static iovec_t iovec[NR_IOREQS];
  off_t pos;
  unsigned int i, iov_per_block;
  int r;
#if !defined(NDEBUG)
  unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize;
#endif /* !defined(NDEBUG) */
  if(bufqsize == 0) return;

#if !defined(NDEBUG)
  /* for READING, check all buffers on the list are obtained and held
   * (count > 0)
   */
  if (rw_flag == READING) {
	assert(bufqsize <= LMFS_MAX_PREFETCH);

	for(i = 0; i < bufqsize; i++) {
		assert(bufq[i] != NULL);
		assert(bufq[i]->lmfs_count > 0);
	}

	/* therefore they are all 'in use' and must be at least this many */
	assert(start_in_use >= start_bufqsize);
  }

  assert(dev != NO_DEV);
  assert(fs_block_size > 0);
  assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS);
#endif /* !defined(NDEBUG) */
  /* For WRITING, (Shell) sort buffers on lmfs_blocknr.
   * For READING, the buffers are already sorted.
   */
  if (rw_flag == WRITING)
	sort_blocks(bufq, bufqsize);

  /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
	unsigned int p, nblocks = 0, niovecs = 0;

	for (iop = iovec; nblocks < bufqsize; nblocks++) {
		vir_bytes vdata, blockrem;

		bp = bufq[nblocks];
		if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks)
			break;
		blockrem = bp->lmfs_bytes;
		iov_per_block = howmany(blockrem, PAGE_SIZE);
		if (niovecs > NR_IOREQS - iov_per_block) break;
		vdata = (vir_bytes) bp->data;
		for(p = 0; p < iov_per_block; p++) {
			vir_bytes chunk =
			    blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE;
			iop->iov_addr = vdata;
			iop->iov_size = chunk;
			vdata += chunk;
			blockrem -= chunk;
			iop++;
			niovecs++;
		}
		assert(p == iov_per_block);
		assert(blockrem == 0);
	}

	assert(niovecs > 0 && niovecs <= NR_IOREQS);
	pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size;
	if (rw_flag == READING)
		r = bdev_gather(dev, pos, iovec, niovecs, BDEV_NOFLAGS);
	else
		r = bdev_scatter(dev, pos, iovec, niovecs, BDEV_NOFLAGS);

	/* Harvest the results. The driver may have returned an error, or it
	 * may have done less than what we asked for.
	 */
	if (r < 0) {
		printf("fs cache: I/O error %d on device %d/%d, "
		    "block %"PRIu64"\n",
		    r, major(dev), minor(dev), bufq[0]->lmfs_blocknr);
	}
	for (i = 0; i < nblocks; i++) {
		bp = bufq[i];
		if (r < (ssize_t)bp->lmfs_bytes) {
			/* Transfer failed. */
			if (i == 0) {
				bp->lmfs_dev = NO_DEV;	/* Invalidate block */
			}
			break;
		}
		if (rw_flag == READING) {
			bp->lmfs_dev = dev;	/* validate block */
			lmfs_put_block(bp);
		} else {
			MARKCLEAN(bp);
		}
		r -= bp->lmfs_bytes;
	}

	bufq += i;
	bufqsize -= i;

	if (rw_flag == READING) {
		/* Don't bother reading more than the device is willing to
		 * give at this time. Don't forget to release those extras.
		 */
		while (bufqsize > 0) {
			bp = *bufq++;
			bp->lmfs_dev = NO_DEV;	/* invalidate block */
			lmfs_put_block(bp);
			bufqsize--;
		}
	}
	if (rw_flag == WRITING && i == 0) {
		/* We're not making progress, this means we might keep
		 * looping. Buffers remain dirty if un-written. Buffers are
		 * lost if invalidate()d or LRU-removed while dirty. This
		 * is better than keeping unwritable blocks around forever..
		 */
		break;
	}
  }

#if !defined(NDEBUG)
  if(rw_flag == READING) {
	assert(start_in_use >= start_bufqsize);

	/* READING callers assume all bufs are released. */
	assert(start_in_use - start_bufqsize == bufs_in_use);
  }
#endif /* !defined(NDEBUG) */
/*===========================================================================*
 *				lmfs_readahead				     *
 *===========================================================================*/
void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks,
	size_t last_size)
{
/* Read ahead 'nblocks' blocks starting from the block 'base_block' on device
 * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH,
 * inclusive. All blocks have the file system's block size, possibly except the
 * last block in the range, which is of size 'last_size'. The caller must
 * ensure that none of the blocks in the range are already in the cache.
 * However, the caller must also not rely on all or even any of the blocks to
 * be present in the cache afterwards--failures are (deliberately!) ignored.
 */
  static noxfer_buf_ptr_t bufq[LMFS_MAX_PREFETCH]; /* static for size only */
  struct buf *bp;
  unsigned int count;
  int r;

  assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH);

  for (count = 0; count < nblocks; count++) {
	if (count == nblocks - 1)
		r = lmfs_get_partial_block(&bp, dev, base_block + count,
		    NO_READ, last_size);
	else
		r = lmfs_get_block(&bp, dev, base_block + count, NO_READ);

	if (r != OK)
		break;

	/* We could add a flag that makes the get_block() calls fail if the
	 * block is already in the cache, but it is not a major concern if it
	 * is: we just perform a useless read in that case. However, if the
	 * block is cached *and* dirty, we are about to lose its new contents.
	 */
	assert(lmfs_isclean(bp));

	bufq[count] = bp;
  }

  rw_scattered(dev, bufq, count, READING);
}
/*===========================================================================*
 *				lmfs_readahead_limit			     *
 *===========================================================================*/
unsigned int lmfs_readahead_limit(void)
{
/* Return the maximum number of blocks that should be read ahead at once. The
 * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive.
 */
  unsigned int max_transfer, max_bufs;

  /* The returned value is the minimum of two factors: the maximum number of
   * blocks that can be transferred in a single I/O gather request (see how
   * rw_scattered() generates I/O requests), and a policy limit on the number
   * of buffers that any read-ahead operation may use (that is, thrash).
   */
  max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1);

  /* The constants have been imported from MFS as is, and may need tuning. */
  max_bufs = nr_bufs - 4;

  return MIN(max_transfer, max_bufs);
}
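/* Worked example of the limit above (the values are assumptions for
 * illustration only): with 4 kB pages, a 4 kB file system block size and an
 * NR_IOREQS of 64, max_transfer = 64 / MAX(1, 1) = 64 blocks per gather
 * request. With an 8 kB block size the same request budget covers only
 * 64 / 2 = 32 blocks. The buffer-pool term then caps the result further on
 * very small caches.
 */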
/*===========================================================================*
 *				lmfs_prefetch				     *
 *===========================================================================*/
void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks)
{
/* The given set of blocks is expected to be needed soon, so prefetch a
 * convenient subset. The blocks are expected to be sorted by likelihood of
 * being accessed soon, making the first block of the set the most important
 * block to prefetch right now. The caller must have made sure that the blocks
 * are not in the cache already. The array may have duplicate block numbers.
 */
  bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
  bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
  block64_t block, base_block;
  unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks;

  /* Here is the deal. We are going to prefetch one range only, because seeking
   * is too expensive for just prefetching. The range we select should at least
   * include the first ("base") block of the given set, since that is the block
   * the caller is primarily interested in. Thus, the rest of the range is
   * going to have to be directly around this base block. We first check which
   * blocks from the set fall just before and after the base block, which then
   * allows us to construct a contiguous range of desired blocks directly
   * around the base block, in O(n) time. As a natural part of this, we ignore
   * duplicate blocks in the given set. We then read from the beginning of this
   * range, in order to maximize the chance that a next prefetch request will
   * continue from the last disk position without requiring a seek. However, we
   * do correct for the maximum number of blocks we can (or should) read in at
   * once, such that we will still end up reading the base block.
   */
  base_block = blockset[0];

  memset(blocks_before, 0, sizeof(blocks_before));
  memset(blocks_after, 0, sizeof(blocks_after));

  for (i = 1; i < nblocks; i++) {
	block = blockset[i];

	if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) {
		bit = base_block - block - 1;
		assert(bit < LMFS_MAX_PREFETCH);
		SET_BIT(blocks_before, bit);
	} else if (block > base_block &&
	    block - LMFS_MAX_PREFETCH <= base_block) {
		bit = block - base_block - 1;
		assert(bit < LMFS_MAX_PREFETCH);
		SET_BIT(blocks_after, bit);
	}
  }

  for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++)
	if (!GET_BIT(blocks_before, nr_before))
		break;

  for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++)
	if (!GET_BIT(blocks_after, nr_after))
		break;

  /* The number of blocks to prefetch is the minimum of two factors: the number
   * of blocks in the range around the base block, and the maximum number of
   * blocks that should be read ahead at once at all.
   */
  span = nr_before + 1 + nr_after;
  limit = lmfs_readahead_limit();

  nr_blocks = MIN(span, limit);
  assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH);

  /* Start prefetching from the lowest block within the contiguous range, but
   * make sure that we read at least the original base block itself, too.
   */
  base_block -= MIN(nr_before, nr_blocks - 1);

  lmfs_readahead(dev, base_block, nr_blocks, fs_block_size);
}
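/* Worked example of the range construction above: for the (hypothetical)
 * block set { 100, 98, 99, 103, 250 }, the base block is 100. Blocks 98 and
 * 99 set bits in blocks_before, block 103 sets a bit in blocks_after (block
 * 250 is too far away), so nr_before = 2, but nr_after = 0 because block 101
 * is absent. The span is therefore 3; with a sufficiently large read-ahead
 * limit the call becomes lmfs_readahead(dev, 98, 3, fs_block_size), covering
 * blocks 98, 99 and the base block 100.
 */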
/*===========================================================================*
 *				lmfs_flushdev				     *
 *===========================================================================*/
void lmfs_flushdev(dev_t dev)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static noxfer_buf_ptr_t *dirty;
  static unsigned int dirtylistsize = 0;
  unsigned int ndirty;

  if(dirtylistsize != nr_bufs) {
	if(dirtylistsize > 0) {
		assert(dirty != NULL);
		free(dirty);
	}
	if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
		panic("couldn't allocate dirty buf list");
	dirtylistsize = nr_bufs;
  }

  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
	/* Do not flush dirty blocks that are in use (lmfs_count>0): the file
	 * system may mark the block as dirty before changing its contents, in
	 * which case the new contents could end up being lost.
	 */
	if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) {
		dirty[ndirty++] = bp;
	}
  }

  rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *				rm_lru					     *
 *===========================================================================*/
static void rm_lru(struct buf *bp)
{
/* Remove a block from its LRU chain. */
  struct buf *next_ptr, *prev_ptr;

  next_ptr = bp->lmfs_next;	/* successor on LRU chain */
  prev_ptr = bp->lmfs_prev;	/* predecessor on LRU chain */
  if (prev_ptr != NULL)
	prev_ptr->lmfs_next = next_ptr;
  else
	front = next_ptr;	/* this block was at front of chain */

  if (next_ptr != NULL)
	next_ptr->lmfs_prev = prev_ptr;
  else
	rear = prev_ptr;	/* this block was at rear of chain */
}
/*===========================================================================*
 *				cache_resize				     *
 *===========================================================================*/
static void cache_resize(size_t blocksize, unsigned int bufs)
{
  struct buf *bp;

  assert(blocksize > 0);
  assert(bufs >= MINBUFS);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_count != 0) panic("change blocksize with buffer in use");

  lmfs_buf_pool(bufs);

  fs_block_size = blocksize;
}
static void cache_heuristic_check(void)
{
  int bufs, d;

  bufs = fs_bufs_heuristic(MINBUFS, fs_btotal, fs_bused, fs_block_size);

  /* set the cache to the new heuristic size if the new one
   * is more than 10% off from the current one.
   */
  d = bufs - nr_bufs;
  if(d < 0) d = -d;
  if(d*100/nr_bufs > 10) {
	cache_resize(fs_block_size, bufs);
  }
}
/*===========================================================================*
 *				lmfs_set_blocksize			     *
 *===========================================================================*/
void lmfs_set_blocksize(size_t new_block_size)
{
  cache_resize(new_block_size, MINBUFS);
  cache_heuristic_check();

  /* Decide whether to use secondary cache or not.
   * Only do this if the block size is a multiple of the page size, and using
   * the VM cache has been enabled for this FS.
   */
  vmcache = 0;

  if(may_use_vmcache && !(new_block_size % PAGE_SIZE))
	vmcache = 1;
}
/*===========================================================================*
 *				lmfs_buf_pool				     *
 *===========================================================================*/
void lmfs_buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);

  /* Release the old pool, if any. */
  if (buf != NULL) {
	for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
		if (bp->data != NULL) {
			assert(bp->lmfs_bytes > 0);
			munmap_t(bp->data, bp->lmfs_bytes);
		}
	}
	free(buf);
  }

  if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
	panic("couldn't allocate buf list (%d)", new_nr_bufs);

  if (buf_hash != NULL)
	free(buf_hash);
  if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
	panic("couldn't allocate buf hash list (%d)", new_nr_bufs);

  nr_bufs = new_nr_bufs;

  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	bp->lmfs_blocknr = NO_BLOCK;
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_next = bp + 1;
	bp->lmfs_prev = bp - 1;
  }
  front->lmfs_prev = NULL;
  rear->lmfs_next = NULL;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next;
  buf_hash[0] = front;
}
void lmfs_flushall(void)
{
  struct buf *bp;

  for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_dev != NO_DEV && !lmfs_isclean(bp))
		lmfs_flushdev(bp->lmfs_dev);

  /* This is the moment where it is least likely (although certainly not
   * impossible!) that there are buffers in use, since buffers should not
   * be held across file system syncs. See if we already intended to
   * resize the buffer cache, but couldn't. Be aware that we may be
   * called indirectly from within lmfs_change_blockusage(), so care must
   * be taken not to recurse infinitely. TODO: see if it is better to
   * resize the cache from here *only*, thus guaranteeing a clean cache.
   */
  lmfs_change_blockusage(0);
}
size_t lmfs_fs_block_size(void)
{
  return fs_block_size;
}

void lmfs_may_use_vmcache(int ok)
{
  may_use_vmcache = ok;
}