#include <sys/param.h>
#include <minix/dmap.h>
#include <minix/libminixfs.h>
#include <minix/syslib.h>
#include <minix/sysutil.h>
#include <minix/u64.h>
#include <minix/bdev.h>
#define BP_CLEAN 0	/* on-disk block and memory copies identical */
#define BP_DIRTY 1	/* on-disk block and memory copies differ */

#define BUFHASH(b) ((b) % nr_bufs)
#define MARKCLEAN lmfs_markclean

#define MINBUFS 6	/* minimal number of bufs for sanity check */
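
/* Example: BUFHASH takes the block number modulo the pool size, so with
 * nr_bufs = 512, block 1234 lands on chain buf_hash[1234 % 512], i.e.
 * buf_hash[210]. A power-of-two nr_bufs would reduce this to masking off
 * the low-order bits of the block number.
 */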
static struct buf *front;	/* points to least recently used free block */
static struct buf *rear;	/* points to most recently used free block */
static unsigned int bufs_in_use; /* # bufs currently in use (not on free list) */

static void rm_lru(struct buf *bp);
static void read_block(struct buf *);
static void flushall(dev_t dev);

static int vmcache = 0;	/* are we using vm's secondary cache? (initially not) */

static struct buf *buf;
static struct buf **buf_hash;	/* the buffer hash table */
static unsigned int nr_bufs;
static int may_use_vmcache;
static int rdwt_err;		/* status of last disk i/o request */

static int fs_block_size = 1024;	/* raw i/o block size */
u32_t fs_bufs_heuristic(int minbufs, u32_t btotal, u32_t bfree,
	int blocksize, dev_t majordev)
{
  struct vm_stats_info vsi;
  int bufs;
  u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax;
  u32_t kbytes_remain_mem, bused;

  bused = btotal - bfree;

  /* We simply need minbufs no matter what, and we don't
   * want more than that if we're a memory device.
   */
  if(majordev == MEMORY_MAJOR) {
	return MINBUFS;
  }

  /* Set a reasonable cache size; cache at most a certain
   * portion of the used FS, and at most a certain %age of remaining
   * memory.
   */
  if(vm_info_stats(&vsi) != OK) {
	bufs = 1024;
	printf("fslib: heuristic info fail: default to %d bufs\n", bufs);
	return bufs;
  }

  kbytes_remain_mem = div64u(mul64u(vsi.vsi_free, vsi.vsi_pagesize), 1024);

  kbytes_used_fs = div64u(mul64u(bused, blocksize), 1024);
  kbytes_total_fs = div64u(mul64u(btotal, blocksize), 1024);

  /* Heuristic for a desired cache size based on FS usage;
   * but never bigger than half of the total filesystem.
   */
  kb_fsmax = sqrt_approx(kbytes_used_fs)*40;
  kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2);

  /* Heuristic for a maximum usage - 10% of remaining memory. */
  kbcache = MIN(kbytes_remain_mem/10, kb_fsmax);
  bufs = kbcache * 1024 / blocksize;

  /* But we simply need MINBUFS no matter what. */
  if(bufs < minbufs)
	bufs = minbufs;

  return bufs;
}
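
/* Worked example (illustrative numbers): with 512 MB of free memory
 * (kbytes_remain_mem = 524288), 4096-byte blocks, and 2 GB of an 8 GB
 * filesystem in use:
 *   kbytes_used_fs  = 2097152, kbytes_total_fs = 8388608
 *   kb_fsmax = sqrt_approx(2097152)*40 = about 1448*40 = 57920 KB,
 *              below the 8388608/2 cap, so it stands
 *   kbcache  = MIN(524288/10, 57920) = 52428 KB
 *   bufs     = 52428 * 1024 / 4096 = 13107 buffers
 */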
void lmfs_markdirty(struct buf *bp)
{
	bp->lmfs_dirt = BP_DIRTY;
}

void lmfs_markclean(struct buf *bp)
{
	bp->lmfs_dirt = BP_CLEAN;
}

int lmfs_isclean(struct buf *bp)
{
	return bp->lmfs_dirt == BP_CLEAN;
}

dev_t lmfs_dev(struct buf *bp)
{
	return bp->lmfs_dev;
}

int lmfs_bytes(struct buf *bp)
{
	return bp->lmfs_bytes;
}
/*===========================================================================*
 *				lmfs_get_block				     *
 *===========================================================================*/
struct buf *lmfs_get_block(
  register dev_t dev,		/* on which device is the block? */
  register block_t block,	/* which block is wanted? */
  int only_search		/* if NO_READ, don't read, else act normal */
)
{
/* Check to see if the requested block is in the block cache. If so, return
 * a pointer to it. If not, evict some other block and fetch it (unless
 * 'only_search' is 1). All the blocks in the cache that are not in use
 * are linked together in a chain, with 'front' pointing to the least recently
 * used block and 'rear' to the most recently used block. If 'only_search' is
 * 1, the block being requested will be overwritten in its entirety, so it is
 * only necessary to see if it is in the cache; if it is not, any free buffer
 * will do. It is not necessary to actually read the block in from disk.
 * If 'only_search' is PREFETCH, the block need not be read from the disk,
 * and the device is not to be marked on the block, so callers can tell if
 * the block returned is valid.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */

  int b;
  static struct buf *bp, *prev_ptr;
  u64_t yieldid = VM_BLOCKID_NONE, getid = make64(dev, block);

  ASSERT(fs_block_size > 0);

  assert(dev != NO_DEV);

  /* Search the hash chain for (dev, block). Do_read() can use
   * lmfs_get_block(NO_DEV ...) to get an unnamed block to fill with zeros when
   * someone wants to read from a hole in a file, in which case this search
   * is skipped.
   */
  b = BUFHASH(block);
  bp = buf_hash[b];
  while (bp != NULL) {
	if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev) {
		/* Block needed has been found. */
		if (bp->lmfs_count == 0) rm_lru(bp);
		bp->lmfs_count++;	/* record that block is in use */
		ASSERT(bp->lmfs_bytes == fs_block_size);
		ASSERT(bp->lmfs_dev == dev);
		ASSERT(bp->lmfs_dev != NO_DEV);
		return(bp);
	} else {
		/* This block is not the one sought. */
		bp = bp->lmfs_hash;	/* move to next block on hash chain */
	}
  }

  /* Desired block is not on available chain. Take oldest block ('front'). */
  if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);

  if(bp->lmfs_bytes < fs_block_size) {
	ASSERT(bp->lmfs_bytes == 0);
	if(!(bp->data = alloc_contig((size_t) fs_block_size, 0, NULL))) {
		printf("fs cache: couldn't allocate a new block.\n");
		for(bp = front;
		    bp && bp->lmfs_bytes < fs_block_size; bp = bp->lmfs_next)
			;
		if(!bp)
			panic("no buffer available");
	} else {
		bp->lmfs_bytes = fs_block_size;
	}
  }

  ASSERT(bp->lmfs_bytes == fs_block_size);
  ASSERT(bp->lmfs_count == 0);

  rm_lru(bp);

  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->lmfs_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
	buf_hash[b] = bp->lmfs_hash;
  } else {
	/* The block just taken is not on the front of its hash chain. */
	while (prev_ptr->lmfs_hash != NULL)
		if (prev_ptr->lmfs_hash == bp) {
			prev_ptr->lmfs_hash = bp->lmfs_hash;	/* found it */
			break;
		} else {
			prev_ptr = prev_ptr->lmfs_hash;	/* keep looking */
		}
  }

  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->lmfs_dev != NO_DEV) {
	if (bp->lmfs_dirt == BP_DIRTY) flushall(bp->lmfs_dev);

	/* Are we throwing out a block that contained something?
	 * Give it to VM for the second-layer cache.
	 */
	yieldid = make64(bp->lmfs_dev, bp->lmfs_blocknr);
	assert(bp->lmfs_bytes == fs_block_size);
	bp->lmfs_dev = NO_DEV;
  }

  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);		/* NO_DEV blocks may be marked dirty */
  bp->lmfs_dev = dev;		/* fill in device number */
  bp->lmfs_blocknr = block;	/* fill in block number */
  bp->lmfs_count++;		/* record that block is being used */
  b = BUFHASH(bp->lmfs_blocknr);
  bp->lmfs_hash = buf_hash[b];

  buf_hash[b] = bp;		/* add to hash list */

  assert(dev != NO_DEV);

  /* Go get the requested block unless searching or prefetching. */
  if(only_search == PREFETCH || only_search == NORMAL) {
	/* Block is not found in our cache, but we do want it
	 * if it's in the vm cache.
	 */
	if(vmcache) {
		/* If we can satisfy the PREFETCH or NORMAL request
		 * from the vm cache, work is done.
		 */
		if(vm_yield_block_get_block(yieldid, getid,
			bp->data, fs_block_size) == OK) {
			return bp;
		}
	}
  }

  if(only_search == PREFETCH) {
	/* PREFETCH: don't do i/o. */
	bp->lmfs_dev = NO_DEV;
  } else if (only_search == NORMAL) {
	read_block(bp);
  } else if(only_search == NO_READ) {
	/* We want this block, but its contents
	 * will be overwritten. VM has to forget
	 * about it.
	 */
	if(vmcache)
		vm_forgetblock(getid);
  } else
	panic("unexpected only_search value: %d", only_search);

  return(bp);			/* return the newly acquired block */
}
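
/* Usage sketch (illustrative, not part of this file): a file server built on
 * libminixfs brackets each access in a get/put pair, marking the buffer dirty
 * before release if it changed the data. FULL_DATA_BLOCK is assumed to be one
 * of the block_type values from the FS headers, and chunk a hypothetical
 * source buffer.
 *
 *	struct buf *bp = lmfs_get_block(dev, block, NORMAL);
 *	memcpy(bp->data, chunk, fs_block_size);
 *	lmfs_markdirty(bp);
 *	lmfs_put_block(bp, FULL_DATA_BLOCK);
 */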
/*===========================================================================*
 *				lmfs_put_block				     *
 *===========================================================================*/
void lmfs_put_block(bp, block_type)
register struct buf *bp;	/* pointer to the buffer to be released */
int block_type;			/* INODE_BLOCK, DIRECTORY_BLOCK, or whatever */
{
/* Return a block to the list of available blocks. Depending on 'block_type'
 * it may be put on the front or rear of the LRU chain. Blocks that are
 * expected to be needed again shortly (e.g., partially full data blocks)
 * go on the rear; blocks that are unlikely to be needed again shortly
 * (e.g., full data blocks) go on the front. Blocks whose loss can hurt
 * the integrity of the file system (e.g., inode blocks) are written to
 * disk immediately if they are dirty.
 */
  if (bp == NULL) return;	/* it is easier to check here than in caller */

  bp->lmfs_count--;		/* there is one use fewer now */
  if (bp->lmfs_count != 0) return;	/* block is still in use */

  bufs_in_use--;		/* one fewer block buffer in use */

  /* Put this block back on the LRU chain. */
  if (bp->lmfs_dev == DEV_RAM || (block_type & ONE_SHOT)) {
	/* Block probably won't be needed quickly. Put it on front of chain.
	 * It will be the next block to be evicted from the cache.
	 */
	bp->lmfs_prev = NULL;
	bp->lmfs_next = front;
	if (front == NULL)
		rear = bp;	/* LRU chain was empty */
	else
		front->lmfs_prev = bp;
	front = bp;
  } else {
	/* Block probably will be needed quickly. Put it on rear of chain.
	 * It will not be evicted from the cache for a long time.
	 */
	bp->lmfs_prev = rear;
	bp->lmfs_next = NULL;
	if (rear == NULL)
		front = bp;
	else
		rear->lmfs_next = bp;
	rear = bp;
  }
}
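
/* Example (illustrative): with the free chain front -> A -> B -> rear,
 * releasing a ONE_SHOT block C yields front -> C -> A -> B -> rear, so C is
 * the next eviction victim; releasing it as a PARTIAL_DATA_BLOCK instead
 * yields front -> A -> B -> C -> rear, keeping C cached as long as possible.
 */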
/*===========================================================================*
 *				read_block				     *
 *===========================================================================*/
static void read_block(bp)
register struct buf *bp;	/* buffer pointer */
{
/* Read a disk block. This is the only routine in which actual disk
 * I/O is invoked. If an error occurs, a message is printed here, but the error
 * is not reported to the caller. If the error occurred while purging a block
 * from the cache, it is not clear what the caller could do about it anyway.
 */
  int r, op_failed = 0;
  u64_t pos;
  dev_t dev = bp->lmfs_dev;

  assert(dev != NO_DEV);

  pos = mul64u(bp->lmfs_blocknr, fs_block_size);
  r = bdev_read(dev, pos, bp->data, fs_block_size, BDEV_NOFLAGS);
  if (r < 0) {
	printf("fs cache: I/O error on device %d/%d, block %u\n",
		major(dev), minor(dev), bp->lmfs_blocknr);
	op_failed = 1;
  } else if (r != (ssize_t) fs_block_size) {
	/* A short read counts as failure. */
	op_failed = 1;
  }

  if (op_failed) {
	bp->lmfs_dev = NO_DEV;	/* invalidate block */

	/* Report read errors to interested parties. */
	rdwt_err = r;
  }
}
/*===========================================================================*
 *				lmfs_invalidate				     *
 *===========================================================================*/
void lmfs_invalidate(
  dev_t device			/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if (bp->lmfs_dev == device) bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
 *				flushall				     *
 *===========================================================================*/
static void flushall(dev_t dev)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static struct buf **dirty;	/* static so it isn't on stack */
  static unsigned int dirtylistsize = 0;
  int ndirty;

  if(dirtylistsize != nr_bufs) {
	if(dirtylistsize > 0) {
		assert(dirty != NULL);
		free(dirty);
	}
	if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
		panic("couldn't allocate dirty buf list");
	dirtylistsize = nr_bufs;
  }

  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
	if (bp->lmfs_dirt == BP_DIRTY && bp->lmfs_dev == dev) {
		dirty[ndirty++] = bp;
	}
  }

  lmfs_rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *				lmfs_rw_scattered			     *
 *===========================================================================*/
void lmfs_rw_scattered(
  dev_t dev,			/* major-minor device number */
  struct buf **bufq,		/* pointer to array of buffers */
  int bufqsize,			/* number of buffers */
  int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  int gap;
  register int i;
  register iovec_t *iop;
  static iovec_t *iovec = NULL;
  u64_t pos;
  int j, r;

  STATICINIT(iovec, NR_IOREQS);

  /* (Shell) sort buffers on lmfs_blocknr. */
  gap = 1;
  do
	gap = 3 * gap + 1;
  while (gap <= bufqsize);
  while (gap != 1) {
	gap /= 3;
	for (j = gap; j < bufqsize; j++) {
		for (i = j - gap;
		     i >= 0 && bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr;
		     i -= gap) {
			bp = bufq[i];
			bufq[i] = bufq[i + gap];
			bufq[i + gap] = bp;
		}
	}
  }
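
  /* Example (illustrative): for bufqsize = 20 the loop above generates the
   * gap sequence 1, 4, 13, 40; 40 exceeds bufqsize, so sorting passes run
   * with gaps 13, 4 and 1. Sorting lets the loop below batch runs of
   * consecutive block numbers into one scatter/gather request of at most
   * NR_IOREQS blocks.
   */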
  /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
	for (j = 0, iop = iovec; j < NR_IOREQS && j < bufqsize; j++, iop++) {
		bp = bufq[j];
		if (bp->lmfs_blocknr != (block_t) bufq[0]->lmfs_blocknr + j) break;
		iop->iov_addr = (vir_bytes) bp->data;
		iop->iov_size = (vir_bytes) fs_block_size;
	}
	pos = mul64u(bufq[0]->lmfs_blocknr, fs_block_size);
	if (rw_flag == READING)
		r = bdev_gather(dev, pos, iovec, j, BDEV_NOFLAGS);
	else
		r = bdev_scatter(dev, pos, iovec, j, BDEV_NOFLAGS);

	/* Harvest the results. The driver may have returned an error, or it
	 * may have done less than what we asked for.
	 */
	if (r < 0) {
		printf("fs cache: I/O error %d on device %d/%d, block %u\n",
			r, major(dev), minor(dev), bufq[0]->lmfs_blocknr);
	}
	for (i = 0; i < j; i++) {
		bp = bufq[i];
		if (r < (ssize_t) fs_block_size) {
			/* Transfer failed. */
			if (i == 0)
				bp->lmfs_dev = NO_DEV;	/* Invalidate block */
			break;
		}
		if (rw_flag == READING) {
			bp->lmfs_dev = dev;	/* validate block */
			lmfs_put_block(bp, PARTIAL_DATA_BLOCK);
		} else {
			MARKCLEAN(bp);
		}
		r -= fs_block_size;
	}
	bufq += i;
	bufqsize -= i;
	if (rw_flag == READING) {
		/* Don't bother reading more than the device is willing to
		 * give at this time. Don't forget to release those extras.
		 */
		while (bufqsize > 0) {
			lmfs_put_block(*bufq++, PARTIAL_DATA_BLOCK);
			bufqsize--;
		}
	}
	if (rw_flag == WRITING && i == 0) {
		/* We're not making progress, this means we might keep
		 * looping. Buffers remain dirty if un-written. Buffers are
		 * lost if invalidate()d or LRU-removed while dirty. This
		 * is better than keeping unwritable blocks around forever..
		 */
		break;
	}
  }
}
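
/* Example (illustrative): a READING call for 8 consecutive blocks where the
 * driver transfers only 5 * fs_block_size bytes validates and releases
 * blocks 0..4; the remaining buffers are then released unvalidated rather
 * than retried, on the theory that the device will do no better right now.
 */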
/*===========================================================================*
 *				rm_lru					     *
 *===========================================================================*/
static void rm_lru(bp)
struct buf *bp;
{
/* Remove a block from its LRU chain. */
  struct buf *next_ptr, *prev_ptr;

  bufs_in_use++;
  next_ptr = bp->lmfs_next;	/* successor on LRU chain */
  prev_ptr = bp->lmfs_prev;	/* predecessor on LRU chain */
  if (prev_ptr != NULL)
	prev_ptr->lmfs_next = next_ptr;
  else
	front = next_ptr;	/* this block was at front of chain */

  if (next_ptr != NULL)
	next_ptr->lmfs_prev = prev_ptr;
  else
	rear = prev_ptr;	/* this block was at rear of chain */
}
/*===========================================================================*
 *				cache_resize				     *
 *===========================================================================*/
static void cache_resize(unsigned int blocksize, unsigned int bufs)
{
  struct buf *bp;

  assert(blocksize > 0);
  assert(bufs >= MINBUFS);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_count != 0) panic("change blocksize with buffer in use");

  lmfs_buf_pool(bufs);

  fs_block_size = blocksize;
}
/*===========================================================================*
 *				lmfs_set_blocksize			     *
 *===========================================================================*/
void lmfs_set_blocksize(int new_block_size, int major)
{
  int bufs;
  u32_t btotal, bfree, bused;

  cache_resize(new_block_size, MINBUFS);

  fs_blockstats(&btotal, &bfree, &bused);

  bufs = fs_bufs_heuristic(10, btotal, bfree,
	new_block_size, major);

  cache_resize(new_block_size, bufs);

  /* Decide whether to use the secondary cache or not, namely if
   * - it's available, and
   * - use of it hasn't been disabled for this fs, and
   * - our main FS device isn't a memory device
   */
  vmcache = 0;
  if(vm_forgetblock(VM_BLOCKID_NONE) != ENOSYS &&
	may_use_vmcache && major != MEMORY_MAJOR) {
	vmcache = 1;
  }
}
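
/* Usage sketch (illustrative, not part of this file): a file server would
 * typically call this once per mount, after reading the superblock; sp and
 * fs_dev are hypothetical names here.
 *
 *	lmfs_may_use_vmcache(1);
 *	lmfs_set_blocksize(sp->s_block_size, major(fs_dev));
 */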
/*===========================================================================*
 *				lmfs_buf_pool				     *
 *===========================================================================*/
void lmfs_buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);

  if(nr_bufs > 0) {
	assert(buf);
	for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
		if(bp->data) {
			assert(bp->lmfs_bytes > 0);
			free_contig(bp->data, bp->lmfs_bytes);
		}
	}
  }

  if(buf)
	free(buf);
  if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
	panic("couldn't allocate buf list (%d)", new_nr_bufs);

  if(buf_hash)
	free(buf_hash);
  if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
	panic("couldn't allocate buf hash list (%d)", new_nr_bufs);

  nr_bufs = new_nr_bufs;

  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	bp->lmfs_blocknr = NO_BLOCK;
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_next = bp + 1;
	bp->lmfs_prev = bp - 1;
	bp->data = NULL;
	bp->lmfs_bytes = 0;
  }
  front->lmfs_prev = NULL;
  rear->lmfs_next = NULL;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next;
  buf_hash[0] = front;
}
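
/* Example (illustrative): after lmfs_buf_pool(4), the LRU chain runs
 * buf[0] <-> buf[1] <-> buf[2] <-> buf[3] with front = &buf[0] and
 * rear = &buf[3], every block is marked NO_DEV/NO_BLOCK with no data page
 * attached, and all blocks sit on hash chain 0 via buf_hash[0] = front.
 */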
int lmfs_bufs_in_use(void)
{
	return bufs_in_use;
}

int lmfs_nr_bufs(void)
{
	return nr_bufs;
}

void lmfs_flushall(void)
{
	struct buf *bp;
	for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
		if(bp->lmfs_dev != NO_DEV && bp->lmfs_dirt == BP_DIRTY)
			flushall(bp->lmfs_dev);
}

int lmfs_fs_block_size(void)
{
	return fs_block_size;
}

void lmfs_may_use_vmcache(int ok)
{
	may_use_vmcache = ok;
}

void lmfs_reset_rdwt_err(void)
{
	rdwt_err = OK;
}

int lmfs_rdwt_err(void)
{
	return rdwt_err;
}