/* The file system maintains a buffer cache to reduce the number of disk
 * accesses needed. Whenever a read or write to the disk is done, a check is
 * first made to see if the block is in the cache. This file manages the
 * cache.
 *
 * The entry points into this file are:
 *   get_block:   request to fetch a block for reading or writing from cache
 *   put_block:   return a block previously requested with get_block
 *   alloc_zone:  allocate a new zone (to increase the length of a file)
 *   free_zone:   release a zone (when a file is removed)
 *   invalidate:  remove all the cache blocks on some device
 *
 * Private functions:
 *   read_block:  read or write a block from the disk itself
 */
#include "fs.h"
#include <minix/u64.h>
#include <minix/bdev.h>
#include <sys/param.h>
#include <stdlib.h>
#include <assert.h>
#include <minix/libminixfs.h>
#include "buf.h"
#include "super.h"
#include "inode.h"
static void rm_lru(struct buf *bp);
static void read_block(struct buf *);
static int vmcache = 0;		/* are we using vm's secondary cache? (initially not) */

static block_t super_start = 0, super_end = 0;
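/* super_start..super_end delimit the range of cache blocks that overlaps the
 * on-disk superblock of the mounted file system; block_write_ok() below
 * refuses writes to that range. Both values are computed in cache_resize().
 */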
/*===========================================================================*
 *                              get_block                                    *
 *===========================================================================*/
struct buf *get_block(
  register dev_t dev,		/* on which device is the block? */
  register block_t block,	/* which block is wanted? */
  int only_search		/* if NO_READ, don't read, else act normal */
)
{
/* Check to see if the requested block is in the block cache. If so, return
 * a pointer to it. If not, evict some other block and fetch it (unless
 * 'only_search' is 1). All the blocks in the cache that are not in use
 * are linked together in a chain, with 'front' pointing to the least recently
 * used block and 'rear' to the most recently used block. If 'only_search' is
 * 1, the block being requested will be overwritten in its entirety, so it is
 * only necessary to see if it is in the cache; if it is not, any free buffer
 * will do. It is not necessary to actually read the block in from disk.
 * If 'only_search' is PREFETCH, the block need not be read from the disk,
 * and the device is not to be marked on the block, so callers can tell if
 * the block returned is valid.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
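/* Illustrative usage sketch (not part of the original file): a typical
 * caller pairs get_block() with put_block(), roughly as follows.
 *
 *      bp = get_block(dev, b, NORMAL);         fetch block b, reading it in
 *      ...examine or modify bp->b_data...
 *      put_block(bp, FULL_DATA_BLOCK);         release it to the LRU chain
 *
 * FULL_DATA_BLOCK here is one of the block-type constants assumed to be
 * defined elsewhere in MFS.
 */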
  int b;
  static struct buf *bp, *prev_ptr;
  u64_t yieldid = VM_BLOCKID_NONE, getid = make64(dev, block);

  ASSERT(fs_block_size > 0);
  /* Search the hash chain for (dev, block). Do_read() can use
   * get_block(NO_DEV ...) to get an unnamed block to fill with zeros when
   * someone wants to read from a hole in a file, in which case this search
   * is skipped.
   */
  if (dev != NO_DEV) {
        b = BUFHASH(block);
        bp = buf_hash[b];
        while (bp != NULL) {
                if (bp->b_blocknr == block && bp->b_dev == dev) {
                        /* Block needed has been found. */
                        if (bp->b_count == 0) rm_lru(bp);
                        bp->b_count++;  /* record that block is in use */
                        ASSERT(bp->b_bytes == fs_block_size);
                        ASSERT(bp->b_dev == dev);
                        ASSERT(bp->b_dev != NO_DEV);
                        return(bp);
                } else {
                        /* This block is not the one sought. */
                        bp = bp->b_hash; /* move to next block on hash chain */
                }
        }
  }
  /* Desired block is not on available chain. Take oldest block ('front'). */
  if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);
  if(bp->b_bytes < fs_block_size) {
        ASSERT(bp->b_bytes == 0);
        if(!(bp->bp = alloc_contig( (size_t) fs_block_size, 0, NULL))) {
                printf("MFS: couldn't allocate a new block.\n");
                for(bp = front;
                        bp && bp->b_bytes < fs_block_size; bp = bp->b_next)
                        ;
                if(!bp) {
                        panic("no buffer available");
                }
        } else {
                bp->b_bytes = fs_block_size;
        }
  }
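  /* Note that buffer data space is attached lazily: buf_pool() below leaves
   * bp->bp NULL and b_bytes 0, and the alloc_contig() call above allocates
   * a data area of the current block size on first use.
   */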
  ASSERT(bp->b_bytes == fs_block_size);
  ASSERT(bp->b_count == 0);

  rm_lru(bp);
  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->b_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
        buf_hash[b] = bp->b_hash;
  } else {
        /* The block just taken is not on the front of its hash chain. */
        while (prev_ptr->b_hash != NULL)
                if (prev_ptr->b_hash == bp) {
                        prev_ptr->b_hash = bp->b_hash;  /* found it */
                        break;
                } else {
                        prev_ptr = prev_ptr->b_hash;    /* keep looking */
                }
  }
  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->b_dev != NO_DEV) {
        if (ISDIRTY(bp)) flushall(bp->b_dev);

        /* Are we throwing out a block that contained something?
         * Give it to VM for the second-layer cache.
         */
        yieldid = make64(bp->b_dev, bp->b_blocknr);
        assert(bp->b_bytes == fs_block_size);
        bp->b_dev = NO_DEV;
  }
  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);                /* NO_DEV blocks may be marked dirty */
  bp->b_dev = dev;              /* fill in device number */
  bp->b_blocknr = block;        /* fill in block number */
  bp->b_count++;                /* record that block is being used */
  b = BUFHASH(bp->b_blocknr);
  bp->b_hash = buf_hash[b];

  buf_hash[b] = bp;             /* add to hash list */
  if(dev == NO_DEV) {
        if(vmcache && cmp64(yieldid, VM_BLOCKID_NONE) != 0) {
                vm_yield_block_get_block(yieldid, VM_BLOCKID_NONE,
                        bp->bp, fs_block_size);
        }
        return(bp);     /* If the caller wanted a NO_DEV block, work is done. */
  }
  /* Go get the requested block unless searching or prefetching. */
  if(only_search == PREFETCH || only_search == NORMAL) {
        /* Block is not found in our cache, but we do want it
         * if it's in the vm cache.
         */
        if(vmcache) {
                /* If we can satisfy the PREFETCH or NORMAL request
                 * from the vm cache, work is done.
                 */
                if(vm_yield_block_get_block(yieldid, getid,
                        bp->bp, fs_block_size) == OK) {
                        return bp;
                }
        }
  }
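  /* Note: vm_yield_block_get_block() above both yields the evicted block
   * (yieldid) to VM's secondary cache and asks VM for the requested block
   * (getid) in a single request; if VM still has it, no disk I/O is needed.
   */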
  if(only_search == PREFETCH) {
        /* PREFETCH: don't do i/o. */
        bp->b_dev = NO_DEV;
  } else if (only_search == NORMAL) {
        read_block(bp);
  } else if(only_search == NO_READ) {
        /* we want this block, but its contents
         * will be overwritten. VM has to forget
         * about it.
         */
        if(vmcache) {
                vm_forgetblock(getid);
        }
  } else
        panic("unexpected only_search value: %d", only_search);

  return(bp);           /* return the newly acquired block */
}
/*===========================================================================*
 *                              put_block                                    *
 *===========================================================================*/
void put_block(bp, block_type)
register struct buf *bp;        /* pointer to the buffer to be released */
int block_type;                 /* INODE_BLOCK, DIRECTORY_BLOCK, or whatever */
{
/* Return a block to the list of available blocks. Depending on 'block_type'
 * it may be put on the front or rear of the LRU chain. Blocks that are
 * expected to be needed again shortly (e.g., partially full data blocks)
 * go on the rear; blocks that are unlikely to be needed again shortly
 * (e.g., full data blocks) go on the front. Blocks whose loss can hurt
 * the integrity of the file system (e.g., inode blocks) are written to
 * disk immediately if they are dirty.
 */
  if (bp == NULL) return;       /* it is easier to check here than in caller */

  bp->b_count--;                /* there is one use fewer now */
  if (bp->b_count != 0) return; /* block is still in use */

  bufs_in_use--;                /* one fewer block buffers in use */
  /* Put this block back on the LRU chain. If the ONE_SHOT bit is set in
   * 'block_type', the block is not likely to be needed again shortly, so put
   * it on the front of the LRU chain where it will be the first one to be
   * taken when a free buffer is needed later.
   */
  if (bp->b_dev == DEV_RAM || (block_type & ONE_SHOT)) {
        /* Block probably won't be needed quickly. Put it on front of chain.
         * It will be the next block to be evicted from the cache.
         */
        bp->b_prev = NULL;
        bp->b_next = front;
        if (front == NULL)
                rear = bp;      /* LRU chain was empty */
        else
                front->b_prev = bp;
        front = bp;
  } else {
        /* Block probably will be needed quickly. Put it on rear of chain.
         * It will not be evicted from the cache for a long time.
         */
        bp->b_prev = rear;
        bp->b_next = NULL;
        if (rear == NULL)
                front = bp;
        else
                rear->b_next = bp;
        rear = bp;
  }
}
/*===========================================================================*
 *                              alloc_zone                                   *
 *===========================================================================*/
zone_t alloc_zone(
  dev_t dev,			/* device where zone wanted */
  zone_t z			/* try to allocate new zone near this one */
)
{
/* Allocate a new zone on the indicated device and return its number. */

  bit_t b, bit;
  struct super_block *sp;
  static int print_oos_msg = 1;
  /* Note that the routine alloc_bit() returns 1 for the lowest possible
   * zone, which corresponds to sp->s_firstdatazone. To convert a value
   * between the bit number, 'b', used by alloc_bit() and the zone number, 'z',
   * stored in the inode, use the formula:
   *     z = b + sp->s_firstdatazone - 1
   * Alloc_bit() never returns 0, since this is used for NO_BIT (failure).
   */
  sp = get_super(dev);
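  /* Worked example: with s_firstdatazone = 5, bit b = 1 from alloc_bit()
   * maps to zone z = 1 + 5 - 1 = 5 (the first data zone), and b = 2 maps
   * to z = 6.
   */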
  /* If z is 0, skip initial part of the map known to be fully in use. */
  if (z == sp->s_firstdatazone) {
        bit = sp->s_zsearch;
  } else {
        bit = (bit_t) (z - (sp->s_firstdatazone - 1));
  }
  b = alloc_bit(sp, ZMAP, bit);
  if (b == NO_BIT) {
        err_code = ENOSPC;
        if (print_oos_msg)
                printf("No space on device %d/%d\n", major(sp->s_dev),
                        minor(sp->s_dev));
        print_oos_msg = 0;      /* Don't repeat message */
        return(NO_ZONE);
  }
  print_oos_msg = 1;
  if (z == sp->s_firstdatazone) sp->s_zsearch = b;      /* for next time */
  return( (zone_t) (sp->s_firstdatazone - 1) + (zone_t) b);
}
/*===========================================================================*
 *                              free_zone                                    *
 *===========================================================================*/
void free_zone(
  dev_t dev,				/* device where zone located */
  zone_t numb				/* zone to be returned */
)
{
/* Return a zone. */

  register struct super_block *sp;
  bit_t bit;
  /* Locate the appropriate super_block and return bit. */
  sp = get_super(dev);
  if (numb < sp->s_firstdatazone || numb >= sp->s_zones) return;
  bit = (bit_t) (numb - (zone_t) (sp->s_firstdatazone - 1));
  free_bit(sp, ZMAP, bit);
  if (bit < sp->s_zsearch) sp->s_zsearch = bit;
}
/*===========================================================================*
 *                              read_block                                   *
 *===========================================================================*/
static void read_block(bp)
register struct buf *bp;        /* buffer pointer */
{
/* Read or write a disk block. This is the only routine in which actual disk
 * I/O is invoked. If an error occurs, a message is printed here, but the error
 * is not reported to the caller. If the error occurred while purging a block
 * from the cache, it is not clear what the caller could do about it anyway.
 */
  int r, op_failed;
  u64_t pos;
  dev_t dev;

  op_failed = 0;
  if ( (dev = bp->b_dev) != NO_DEV) {
        pos = mul64u(bp->b_blocknr, fs_block_size);
        r = bdev_read(dev, pos, bp->b_data, fs_block_size,
                BDEV_NOFLAGS);
        if (r < 0) {
                printf("MFS(%d) I/O error on device %d/%d, block %u\n",
                        SELF_E, major(dev), minor(dev), bp->b_blocknr);
                op_failed = 1;
        } else if (r != (ssize_t) fs_block_size) {
                r = END_OF_FILE;
                op_failed = 1;
        }

        if (op_failed) {
                bp->b_dev = NO_DEV;     /* invalidate block */

                /* Report read errors to interested parties. */
                rdwt_err = r;
        }
  }
}
/*===========================================================================*
 *                              invalidate                                   *
 *===========================================================================*/
void invalidate(
  dev_t device			/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
        if (bp->b_dev == device) bp->b_dev = NO_DEV;

  vm_forgetblocks();
}
/*===========================================================================*
 *                              block_write_ok                               *
 *===========================================================================*/
int block_write_ok(struct buf *bp)
{
        if(superblock.s_dev != bp->b_dev) return 1;
        if(bp->b_blocknr >= super_start && bp->b_blocknr <= super_end) {
                printf("MFS: blocking write to superblock on mounted filesystem dev 0x%x.\n", bp->b_dev);
                return 0;
        }

        if(superblock.s_rd_only) {
                printf("MFS: blocking write to mounted readonly filesystem 0x%x.\n", bp->b_dev);
                printf("This shouldn't happen.\n");
                return 0;
        }

        return 1;
}
/*===========================================================================*
 *                              flushall                                     *
 *===========================================================================*/
void flushall(
  dev_t dev			/* device to flush */
)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static struct buf **dirty;	/* static so it isn't on stack */
  static unsigned int dirtylistsize = 0;
  int ndirty;
  if(dirtylistsize != nr_bufs) {
        if(dirtylistsize > 0) {
                assert(dirty != NULL);
                free(dirty);
        }
        if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
                panic("couldn't allocate dirty buf list");
        dirtylistsize = nr_bufs;
  }
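  /* The dirty list lives on the heap ("static so it isn't on stack") and is
   * lazily re-sized above whenever the buffer pool size has changed, e.g.
   * after cache_resize().
   */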
  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
        if (ISDIRTY(bp) && bp->b_dev == dev) {
                if(!block_write_ok(bp)) {
                        printf("MFS: LATE: ignoring changes in block %d\n", bp->b_blocknr);
                        MARKCLEAN(bp);
                        continue;
                }
                dirty[ndirty++] = bp;
        }
  }

  rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *                              rw_scattered                                 *
 *===========================================================================*/
void rw_scattered(
  dev_t dev,			/* major-minor device number */
  struct buf **bufq,		/* pointer to array of buffers */
  int bufqsize,			/* number of buffers */
  int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  int gap;
  register int i;
  register iovec_t *iop;
  static iovec_t *iovec = NULL;
  u64_t pos;
  int j, r;

  STATICINIT(iovec, NR_IOREQS);
  /* (Shell) sort buffers on b_blocknr. */
  gap = 1;
  do
        gap = 3 * gap + 1;
  while (gap <= bufqsize);
  while (gap != 1) {
        gap /= 3;
        for (j = gap; j < bufqsize; j++) {
                for (i = j - gap;
                     i >= 0 && bufq[i]->b_blocknr > bufq[i + gap]->b_blocknr;
                     i -= gap) {
                        bp = bufq[i];
                        bufq[i] = bufq[i + gap];
                        bufq[i + gap] = bp;
                }
        }
  }
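  /* The 3*gap + 1 increments above form Knuth's gap sequence for Shell
   * sort: 1, 4, 13, 40, ...
   */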
  /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
        for (j = 0, iop = iovec; j < NR_IOREQS && j < bufqsize; j++, iop++) {
                bp = bufq[j];
                if (bp->b_blocknr != (block_t) bufq[0]->b_blocknr + j) break;
                iop->iov_addr = (vir_bytes) bp->b_data;
                iop->iov_size = (vir_bytes) fs_block_size;
        }
        pos = mul64u(bufq[0]->b_blocknr, fs_block_size);
        if (rw_flag == READING)
                r = bdev_gather(dev, pos, iovec, j, BDEV_NOFLAGS);
        else
                r = bdev_scatter(dev, pos, iovec, j, BDEV_NOFLAGS);
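        /* The iovec setup loop stops at the first non-consecutive block
         * number, so each bdev_gather()/bdev_scatter() transfers one
         * contiguous run of blocks; that is why the queue was sorted first.
         */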
        /* Harvest the results. The driver may have returned an error, or it
         * may have done less than what we asked for.
         */
        if (r < 0) {
                printf("MFS: I/O error %d on device %d/%d, block %u\n",
                        r, major(dev), minor(dev), bufq[0]->b_blocknr);
        }
        for (i = 0; i < j; i++) {
                bp = bufq[i];
                if (r < (ssize_t) fs_block_size) {
                        /* Transfer failed. */
                        if (i == 0) {
                                bp->b_dev = NO_DEV;     /* Invalidate block */
                                vm_forgetblocks();
                        }
                        break;
                }
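                /* Note: only the first block of a failed transfer is
                 * invalidated; the following blocks were presumably not
                 * transferred at all and may simply be retried later.
                 */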
                if (rw_flag == READING) {
                        bp->b_dev = dev;        /* validate block */
                        put_block(bp, PARTIAL_DATA_BLOCK);
                } else {
                        MARKCLEAN(bp);
                }
                r -= fs_block_size;
        }
        bufq += i;
        bufqsize -= i;
        if (rw_flag == READING) {
                /* Don't bother reading more than the device is willing to
                 * give at this time. Don't forget to release those extras.
                 */
                while (bufqsize > 0) {
                        put_block(*bufq++, PARTIAL_DATA_BLOCK);
                        bufqsize--;
                }
        }
        if (rw_flag == WRITING && i == 0) {
                /* We're not making progress, this means we might keep
                 * looping. Buffers remain dirty if un-written. Buffers are
                 * lost if invalidate()d or LRU-removed while dirty. This
                 * is better than keeping unwritable blocks around forever.
                 */
                break;
        }
  }
}
/*===========================================================================*
 *                              rm_lru                                       *
 *===========================================================================*/
static void rm_lru(bp)
struct buf *bp;
{
/* Remove a block from its LRU chain. */
  struct buf *next_ptr, *prev_ptr;
  bufs_in_use++;
  next_ptr = bp->b_next;        /* successor on LRU chain */
  prev_ptr = bp->b_prev;        /* predecessor on LRU chain */
  if (prev_ptr != NULL)
        prev_ptr->b_next = next_ptr;
  else
        front = next_ptr;       /* this block was at front of chain */

  if (next_ptr != NULL)
        next_ptr->b_prev = prev_ptr;
  else
        rear = prev_ptr;        /* this block was at rear of chain */
}
/*===========================================================================*
 *                              cache_resize                                 *
 *===========================================================================*/
static void cache_resize(unsigned int blocksize, unsigned int bufs)
{
  struct buf *bp;
  struct inode *rip;

  assert(blocksize > 0);
  assert(bufs >= MINBUFS);
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
        if(bp->b_count != 0) panic("change blocksize with buffer in use");
  for (rip = &inode[0]; rip < &inode[NR_INODES]; rip++)
        if (rip->i_count > 0) panic("change blocksize with inode in use");
  buf_pool(bufs);

  fs_block_size = blocksize;
  super_start = SUPER_BLOCK_BYTES / fs_block_size;
  super_end = (SUPER_BLOCK_BYTES + _MIN_BLOCK_SIZE - 1) / fs_block_size;
}
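/* Example: assuming SUPER_BLOCK_BYTES and _MIN_BLOCK_SIZE are both 1024
 * (their classic MINIX values), a 4096-byte block size yields
 * super_start = super_end = 0, and a 1024-byte block size yields
 * super_start = super_end = 1.
 */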
/*===========================================================================*
 *                              bufs_heuristic                               *
 *===========================================================================*/
static int bufs_heuristic(struct super_block *sp)
{
  u32_t btotal, bfree, bused;

  blockstats(&btotal, &bfree, &bused);

  return fs_bufs_heuristic(MINBUFS, btotal, bfree,
        sp->s_block_size, major(sp->s_dev));
}
/*===========================================================================*
 *                              set_blocksize                                *
 *===========================================================================*/
void set_blocksize(struct super_block *sp)
{
  int bufs;

  cache_resize(sp->s_block_size, MINBUFS);
  bufs = bufs_heuristic(sp);
  cache_resize(sp->s_block_size, bufs);

  /* Decide whether to use secondary cache or not.
   * Only do this if
   *	- it's available, and
   *	- use of it hasn't been disabled for this fs, and
   *	- our main FS device isn't a memory device
   */
  vmcache = 0;
  if(vm_forgetblock(VM_BLOCKID_NONE) != ENOSYS &&
        may_use_vmcache && major(sp->s_dev) != MEMORY_MAJOR) {
        vmcache = 1;
  }
}
/*===========================================================================*
 *                              buf_pool                                     *
 *===========================================================================*/
void buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);
  if(nr_bufs > 0) {
        assert(buf);
        (void) fs_sync();
        for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
                if(bp->bp) {
                        assert(bp->b_bytes > 0);
                        free_contig(bp->bp, bp->b_bytes);
                }
        }
  }

  if(buf)
        free(buf);
  if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
        panic("couldn't allocate buf list (%d)", new_nr_bufs);
  if(buf_hash)
        free(buf_hash);
  if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
        panic("couldn't allocate buf hash list (%d)", new_nr_bufs);
  nr_bufs = new_nr_bufs;

  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
        bp->b_blocknr = NO_BLOCK;
        bp->b_dev = NO_DEV;
        bp->b_next = bp + 1;
        bp->b_prev = bp - 1;
        bp->bp = NULL;
        bp->b_bytes = 0;
  }
  front->b_prev = NULL;
  rear->b_next = NULL;
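  /* All buffers now hold NO_BLOCK, so (assuming NO_BLOCK hashes to slot 0)
   * they all belong on a single hash chain; the loop below builds it by
   * reusing the b_next links, and buf_hash[0] is pointed at the front.
   */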
  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->b_hash = bp->b_next;
  buf_hash[0] = front;
}