#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/param.h>

#include <minix/dmap.h>
#include <minix/libminixfs.h>
#include <minix/syslib.h>
#include <minix/sysutil.h>
#include <minix/u64.h>
#include <minix/bdev.h>
#define BP_CLEAN	0	/* on-disk block and memory copies identical */
#define BP_DIRTY	1	/* on-disk block and memory copies differ */
#define BUFHASH(b)	((b) % nr_bufs)
#define MARKCLEAN	lmfs_markclean

#define MINBUFS		6	/* minimal no of bufs for sanity check */
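/* Example of the hash mapping (illustrative numbers): with nr_bufs = 100,
 * block 1234 lands on chain BUFHASH(1234) = 1234 % 100 = 34, together with
 * every other block number that ends in the same two decimal digits.
 */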
static struct buf *front;	/* points to least recently used free block */
static struct buf *rear;	/* points to most recently used free block */
static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/
static void rm_lru(struct buf *bp);
static void read_block(struct buf *);
static void flushall(dev_t dev);
static int vmcache = 0;	/* are we using vm's secondary cache? (initially not) */
static struct buf *buf;
static struct buf **buf_hash;	/* the buffer hash table */
static unsigned int nr_bufs;
static int may_use_vmcache;

static int fs_block_size = 1024;	/* raw i/o block size */

static int rdwt_err;	/* status of last disk i/o request */
u32_t fs_bufs_heuristic(int minbufs, u32_t btotal, u32_t bfree,
	int blocksize, dev_t majordev)
{
  struct vm_stats_info vsi;
  int bufs;
  u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax;
  u32_t kbytes_remain_mem, bused;

  bused = btotal - bfree;
  /* but we simply need MINBUFS no matter what, and we don't
   * want more than that if we're a memory device
   */
  if(majordev == MEMORY_MAJOR) {
	return MINBUFS;
  }
  /* set a reasonable cache size; cache at most a certain
   * portion of the used FS, and at most a certain %age of remaining
   * memory
   */
  if((vm_info_stats(&vsi) != OK)) {
	bufs = 1024;
	printf("fslib: heuristic info fail: default to %d bufs\n", bufs);
	return bufs;
  }
  kbytes_remain_mem = div64u(mul64u(vsi.vsi_free, vsi.vsi_pagesize), 1024);
  kbytes_used_fs = div64u(mul64u(bused, blocksize), 1024);
  kbytes_total_fs = div64u(mul64u(btotal, blocksize), 1024);
  /* heuristic for a desired cache size based on FS usage;
   * but never bigger than half of the total filesystem
   */
  kb_fsmax = sqrt_approx(kbytes_used_fs)*40;
  kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2);
  /* heuristic for a maximum usage - 10% of remaining memory */
  kbcache = MIN(kbytes_remain_mem/10, kb_fsmax);
  bufs = kbcache * 1024 / blocksize;
  /* but we simply need MINBUFS no matter what */
  if(bufs < minbufs)
	bufs = minbufs;

  return bufs;
}
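/* Worked example of the heuristic above (illustrative numbers, all assumed):
 * take a 4096-byte block size, a 1 GB file system of which 400 MB (409600 kb)
 * is used, and 512 MB (524288 kb) of free memory. Then:
 *
 *	kb_fsmax = sqrt_approx(409600) * 40 = 640 * 40	= 25600 kb
 *	kb_fsmax = MIN(25600, 1048576/2)		= 25600 kb
 *	kbcache  = MIN(524288/10, 25600)		= 25600 kb
 *	bufs	 = 25600 * 1024 / 4096			= 6400 buffers
 */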
void lmfs_markdirty(struct buf *bp)
{
	bp->lmfs_dirt = BP_DIRTY;
}
void lmfs_markclean(struct buf *bp)
{
	bp->lmfs_dirt = BP_CLEAN;
}
int lmfs_isclean(struct buf *bp)
{
	return bp->lmfs_dirt == BP_CLEAN;
}
dev_t lmfs_dev(struct buf *bp)
{
	return bp->lmfs_dev;
}
int lmfs_bytes(struct buf *bp)
{
	return bp->lmfs_bytes;
}
/*===========================================================================*
 *				lmfs_get_block				     *
 *===========================================================================*/
struct buf *lmfs_get_block(
  register dev_t dev,		/* on which device is the block? */
  register block_t block,	/* which block is wanted? */
  int only_search		/* if NO_READ, don't read, else act normal */
)
{
/* Check to see if the requested block is in the block cache. If so, return
 * a pointer to it. If not, evict some other block and fetch it (unless
 * 'only_search' is 1). All the blocks in the cache that are not in use
 * are linked together in a chain, with 'front' pointing to the least recently
 * used block and 'rear' to the most recently used block. If 'only_search' is
 * 1, the block being requested will be overwritten in its entirety, so it is
 * only necessary to see if it is in the cache; if it is not, any free buffer
 * will do. It is not necessary to actually read the block in from disk.
 * If 'only_search' is PREFETCH, the block need not be read from the disk,
 * and the device is not to be marked on the block, so callers can tell if
 * the block returned is valid.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
  int b;
  static struct buf *bp, *prev_ptr;
  u64_t yieldid = VM_BLOCKID_NONE, getid = make64(dev, block);
  ASSERT(fs_block_size > 0);
  /* Search the hash chain for (dev, block). Do_read() can use
   * lmfs_get_block(NO_DEV ...) to get an unnamed block to fill with zeros when
   * someone wants to read from a hole in a file, in which case this search
   * is skipped.
   */
  if (dev != NO_DEV) {
	b = BUFHASH(block);
	bp = buf_hash[b];
	while (bp != NULL) {
		if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev) {
			/* Block needed has been found. */
			if (bp->lmfs_count == 0) rm_lru(bp);
			bp->lmfs_count++;	/* record that block is in use */
			ASSERT(bp->lmfs_bytes == fs_block_size);
			ASSERT(bp->lmfs_dev == dev);
			ASSERT(bp->lmfs_dev != NO_DEV);
			return(bp);
		} else {
			/* This block is not the one sought. */
			bp = bp->lmfs_hash;	/* move to next block on hash chain */
		}
	}
  }
  /* Desired block is not on available chain. Take oldest block ('front'). */
  if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);
  rm_lru(bp);
  if(bp->lmfs_bytes < fs_block_size) {
	ASSERT(bp->lmfs_bytes == 0);
	if(!(bp->data = alloc_contig( (size_t) fs_block_size, 0, NULL))) {
		printf("fs cache: couldn't allocate a new block.\n");
		for(bp = front;
			bp && bp->lmfs_bytes < fs_block_size; bp = bp->lmfs_next)
			;
		if(!bp) {
			panic("no buffer available");
		}
	} else {
		bp->lmfs_bytes = fs_block_size;
	}
  }
  ASSERT(bp->lmfs_bytes == fs_block_size);
  ASSERT(bp->lmfs_count == 0);
  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->lmfs_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
	buf_hash[b] = bp->lmfs_hash;
  } else {
	/* The block just taken is not on the front of its hash chain. */
	while (prev_ptr->lmfs_hash != NULL)
		if (prev_ptr->lmfs_hash == bp) {
			prev_ptr->lmfs_hash = bp->lmfs_hash;	/* found it */
			break;
		} else {
			prev_ptr = prev_ptr->lmfs_hash;	/* keep looking */
		}
  }
  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->lmfs_dev != NO_DEV) {
	if (bp->lmfs_dirt == BP_DIRTY) flushall(bp->lmfs_dev);
	/* Are we throwing out a block that contained something?
	 * Give it to VM for the second-layer cache.
	 */
	yieldid = make64(bp->lmfs_dev, bp->lmfs_blocknr);
	assert(bp->lmfs_bytes == fs_block_size);
	bp->lmfs_dev = NO_DEV;
  }
  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);		/* NO_DEV blocks may be marked dirty */
  bp->lmfs_dev = dev;		/* fill in device number */
  bp->lmfs_blocknr = block;	/* fill in block number */
  bp->lmfs_count++;		/* record that block is being used */
  b = BUFHASH(bp->lmfs_blocknr);
  bp->lmfs_hash = buf_hash[b];

  buf_hash[b] = bp;		/* add to hash list */
  if(dev == NO_DEV) {
	if(vmcache && cmp64(yieldid, VM_BLOCKID_NONE) != 0) {
		vm_yield_block_get_block(yieldid, VM_BLOCKID_NONE,
			bp->data, fs_block_size);
	}
	return(bp);	/* If the caller wanted a NO_DEV block, work is done. */
  }
  /* Go get the requested block unless searching or prefetching. */
  if(only_search == PREFETCH || only_search == NORMAL) {
	/* Block is not found in our cache, but we do want it
	 * if it's in the vm cache.
	 */
	if(vmcache) {
		/* If we can satisfy the PREFETCH or NORMAL request
		 * from the vm cache, work is done.
		 */
		if(vm_yield_block_get_block(yieldid, getid,
			bp->data, fs_block_size) == OK) {
			return bp;
		}
	}
  }
  if(only_search == PREFETCH) {
	/* PREFETCH: don't do i/o. */
	bp->lmfs_dev = NO_DEV;
  } else if (only_search == NORMAL) {
	read_block(bp);
  } else if(only_search == NO_READ) {
	/* we want this block, but its contents
	 * will be overwritten. VM has to forget
	 * about it.
	 */
	if(vmcache) {
		vm_forgetblock(getid);
	}
  } else
	panic("unexpected only_search value: %d", only_search);
  return(bp);	/* return the newly acquired block */
}
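/* Illustrative caller pattern (a sketch; 'dev', 'b', 'chunk' and 'chunk_size'
 * are assumed names, and FULL_DATA_BLOCK is one of the standard block types
 * accepted by lmfs_put_block):
 *
 *	struct buf *bp;
 *
 *	bp = lmfs_get_block(dev, b, NORMAL);	// fetch, reading if needed
 *	memcpy(bp->data, chunk, chunk_size);	// modify the cached copy
 *	lmfs_markdirty(bp);			// schedule it for write-back
 *	lmfs_put_block(bp, FULL_DATA_BLOCK);	// release to the LRU chain
 */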
/*===========================================================================*
 *				lmfs_put_block				     *
 *===========================================================================*/
void lmfs_put_block(bp, block_type)
register struct buf *bp;	/* pointer to the buffer to be released */
int block_type;			/* INODE_BLOCK, DIRECTORY_BLOCK, or whatever */
{
/* Return a block to the list of available blocks. Depending on 'block_type'
 * it may be put on the front or rear of the LRU chain. Blocks that are
 * expected to be needed again shortly (e.g., partially full data blocks)
 * go on the rear; blocks that are unlikely to be needed again shortly
 * (e.g., full data blocks) go on the front. Blocks whose loss can hurt
 * the integrity of the file system (e.g., inode blocks) are written to
 * disk immediately if they are dirty.
 */
  if (bp == NULL) return;	/* it is easier to check here than in caller */

  bp->lmfs_count--;		/* there is one use fewer now */
  if (bp->lmfs_count != 0) return;	/* block is still in use */

  bufs_in_use--;		/* one fewer block buffers in use */
  /* Put this block back on the LRU chain. */
  if (bp->lmfs_dev == DEV_RAM || (block_type & ONE_SHOT)) {
	/* Block probably won't be needed quickly. Put it on front of chain.
	 * It will be the next block to be evicted from the cache.
	 */
	bp->lmfs_prev = NULL;
	bp->lmfs_next = front;
	if (front == NULL)
		rear = bp;	/* LRU chain was empty */
	else
		front->lmfs_prev = bp;
	front = bp;
  }
  else {
	/* Block probably will be needed quickly. Put it on rear of chain.
	 * It will not be evicted from the cache for a long time.
	 */
	bp->lmfs_prev = rear;
	bp->lmfs_next = NULL;
	if (rear == NULL)
		front = bp;
	else
		rear->lmfs_next = bp;
	rear = bp;
  }
}
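/* Illustrative effect of the two cases above (assumed chain contents): with
 * the LRU chain front -> A -> B <- rear, releasing a ONE_SHOT or RAM-disk
 * block C gives front -> C -> A -> B <- rear, so C is the next eviction
 * victim; releasing any other block C gives front -> A -> B -> C <- rear,
 * so C survives in the cache the longest.
 */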
/*===========================================================================*
 *				read_block				     *
 *===========================================================================*/
static void read_block(bp)
register struct buf *bp;	/* buffer pointer */
{
/* Read or write a disk block. This is the only routine in which actual disk
 * I/O is invoked. If an error occurs, a message is printed here, but the error
 * is not reported to the caller. If the error occurred while purging a block
 * from the cache, it is not clear what the caller could do about it anyway.
 */
  int r, op_failed = 0;
  u64_t pos;
  dev_t dev;
  if ( (dev = bp->lmfs_dev) != NO_DEV) {
	pos = mul64u(bp->lmfs_blocknr, fs_block_size);
	r = bdev_read(dev, pos, bp->data, fs_block_size, BDEV_NOFLAGS);
	if (r < 0) {
		printf("fs cache: I/O error on device %d/%d, block %u\n",
			major(dev), minor(dev), bp->lmfs_blocknr);
		op_failed = 1;
	} else if (r != (ssize_t) fs_block_size) {
		r = END_OF_FILE;
		op_failed = 1;
	}

	if (op_failed) {
		bp->lmfs_dev = NO_DEV;	/* invalidate block */

		/* Report read errors to interested parties. */
		rdwt_err = r;
	}
  }
}
/*===========================================================================*
 *				lmfs_invalidate				     *
 *===========================================================================*/
void lmfs_invalidate(
  dev_t device			/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if (bp->lmfs_dev == device) bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
 *				flushall				     *
 *===========================================================================*/
static void flushall(dev_t dev)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static struct buf **dirty;	/* static so it isn't on stack */
  static unsigned int dirtylistsize = 0;
  int ndirty;
  if(dirtylistsize != nr_bufs) {
	if(dirtylistsize > 0) {
		assert(dirty != NULL);
		free(dirty);
	}
	if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
		panic("couldn't allocate dirty buf list");
	dirtylistsize = nr_bufs;
  }
  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
	if (bp->lmfs_dirt == BP_DIRTY && bp->lmfs_dev == dev) {
		dirty[ndirty++] = bp;
	}
  }
  lmfs_rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *				lmfs_rw_scattered			     *
 *===========================================================================*/
void lmfs_rw_scattered(
  dev_t dev,			/* major-minor device number */
  struct buf **bufq,		/* pointer to array of buffers */
  int bufqsize,			/* number of buffers */
  int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  int gap;
  register int i;
  register iovec_t *iop;
  static iovec_t *iovec = NULL;
  u64_t pos;
  int j, r;

  STATICINIT(iovec, NR_IOREQS);
  /* (Shell) sort buffers on lmfs_blocknr. */
  gap = 1;
  do
	gap = 3 * gap + 1;
  while (gap <= bufqsize);
  while (gap != 1) {
	gap /= 3;
	for (j = gap; j < bufqsize; j++) {
		for (i = j - gap;
		     i >= 0 && bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr;
		     i -= gap) {
			bp = bufq[i];
			bufq[i] = bufq[i + gap];
			bufq[i + gap] = bp;
		}
	}
  }
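/* Illustrative gap sequence for the sort above (assumed bufqsize = 50):
 * the do-while grows gap as 1 -> 4 -> 13 -> 40 -> 121 and stops once
 * 121 > 50; the sorting passes then run with gaps 40, 13, 4 and finally 1,
 * the last pass being a plain insertion sort over a nearly sorted array.
 */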
  /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
	for (j = 0, iop = iovec; j < NR_IOREQS && j < bufqsize; j++, iop++) {
		bp = bufq[j];
		if (bp->lmfs_blocknr != (block_t) bufq[0]->lmfs_blocknr + j)
			break;
		iop->iov_addr = (vir_bytes) bp->data;
		iop->iov_size = (vir_bytes) fs_block_size;
	}
	pos = mul64u(bufq[0]->lmfs_blocknr, fs_block_size);
	if (rw_flag == READING)
		r = bdev_gather(dev, pos, iovec, j, BDEV_NOFLAGS);
	else
		r = bdev_scatter(dev, pos, iovec, j, BDEV_NOFLAGS);
	/* Harvest the results. The driver may have returned an error, or it
	 * may have done less than what we asked for.
	 */
	if (r < 0) {
		printf("fs cache: I/O error %d on device %d/%d, block %u\n",
			r, major(dev), minor(dev), bufq[0]->lmfs_blocknr);
	}
	for (i = 0; i < j; i++) {
		bp = bufq[i];
		if (r < (ssize_t) fs_block_size) {
			/* Transfer failed. */
			if (i == 0) {
				bp->lmfs_dev = NO_DEV;	/* Invalidate block */
			}
			break;
		}
		if (rw_flag == READING) {
			bp->lmfs_dev = dev;	/* validate block */
			lmfs_put_block(bp, PARTIAL_DATA_BLOCK);
		} else {
			MARKCLEAN(bp);
		}
		r -= fs_block_size;
	}

	bufq += i;
	bufqsize -= i;
	if (rw_flag == READING) {
		/* Don't bother reading more than the device is willing to
		 * give at this time. Don't forget to release those extras.
		 */
		while (bufqsize > 0) {
			lmfs_put_block(*bufq++, PARTIAL_DATA_BLOCK);
			bufqsize--;
		}
	}
	if (rw_flag == WRITING && i == 0) {
		/* We're not making progress, this means we might keep
		 * looping. Buffers remain dirty if un-written. Buffers are
		 * lost if invalidate()d or LRU-removed while dirty. This
		 * is better than keeping unwritable blocks around forever..
		 */
		break;
	}
  }
}
/*===========================================================================*
 *				rm_lru					     *
 *===========================================================================*/
static void rm_lru(bp)
struct buf *bp;
{
/* Remove a block from its LRU chain. */

  struct buf *next_ptr, *prev_ptr;

  bufs_in_use++;
  next_ptr = bp->lmfs_next;	/* successor on LRU chain */
  prev_ptr = bp->lmfs_prev;	/* predecessor on LRU chain */
  if (prev_ptr != NULL)
	prev_ptr->lmfs_next = next_ptr;
  else
	front = next_ptr;	/* this block was at front of chain */
  if (next_ptr != NULL)
	next_ptr->lmfs_prev = prev_ptr;
  else
	rear = prev_ptr;	/* this block was at rear of chain */
}
/*===========================================================================*
 *				cache_resize				     *
 *===========================================================================*/
static void cache_resize(unsigned int blocksize, unsigned int bufs)
{
  struct buf *bp;

  assert(blocksize > 0);
  assert(bufs >= MINBUFS);
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_count != 0) panic("change blocksize with buffer in use");
  lmfs_buf_pool(bufs);

  fs_block_size = blocksize;
}
/*===========================================================================*
 *				lmfs_set_blocksize			     *
 *===========================================================================*/
void lmfs_set_blocksize(int new_block_size, int major)
{
  int bufs;
  u32_t btotal, bfree, bused;
  cache_resize(new_block_size, MINBUFS);
  fs_blockstats(&btotal, &bfree, &bused);
  bufs = fs_bufs_heuristic(10, btotal, bfree, new_block_size, major);
  cache_resize(new_block_size, bufs);
  /* Decide whether to use the secondary cache or not.
   * Only do this if
   *	- it's available, and
   *	- use of it hasn't been disabled for this fs, and
   *	- our main FS device isn't a memory device
   */
  vmcache = 0;
  if(vm_forgetblock(VM_BLOCKID_NONE) != ENOSYS &&
	may_use_vmcache && major != MEMORY_MAJOR) {
	vmcache = 1;
  }
}
/*===========================================================================*
 *				lmfs_buf_pool				     *
 *===========================================================================*/
void lmfs_buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */

  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	if(bp->data) {
		assert(bp->lmfs_bytes > 0);
		free_contig(bp->data, bp->lmfs_bytes);
	}
  }
  if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
	panic("couldn't allocate buf list (%d)", new_nr_bufs);
  if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
	panic("couldn't allocate buf hash list (%d)", new_nr_bufs);
  nr_bufs = new_nr_bufs;
  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	bp->lmfs_blocknr = NO_BLOCK;
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_next = bp + 1;
	bp->lmfs_prev = bp - 1;
  }
  front->lmfs_prev = NULL;
  rear->lmfs_next = NULL;
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next;
  buf_hash[0] = front;
}
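/* Illustrative initial state after the pool setup above (sketch, assuming
 * nr_bufs = 4):
 *
 *	LRU:  front -> buf[0] <-> buf[1] <-> buf[2] <-> buf[3] <- rear
 *	hash: buf_hash[0] -> buf[0] -> buf[1] -> buf[2] -> buf[3]
 *
 * i.e. every still-unnamed buffer starts on the LRU chain and on the single
 * hash chain of bucket 0, ready to be claimed by lmfs_get_block().
 */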
int lmfs_bufs_in_use(void)
{
	return bufs_in_use;
}
int lmfs_nr_bufs(void)
{
	return nr_bufs;
}
void lmfs_flushall(void)
{
	struct buf *bp;

	for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
		if(bp->lmfs_dev != NO_DEV && bp->lmfs_dirt == BP_DIRTY)
			flushall(bp->lmfs_dev);
}
int lmfs_fs_block_size(void)
{
	return fs_block_size;
}
void lmfs_may_use_vmcache(int ok)
{
	may_use_vmcache = ok;
}
void lmfs_reset_rdwt_err(void)
{
	rdwt_err = OK;
}
int lmfs_rdwt_err(void)
{
	return rdwt_err;
}