/* minix.git: servers/mfs/cache.c */
/* The file system maintains a buffer cache to reduce the number of disk
 * accesses needed.  Whenever a read or write to the disk is done, a check is
 * first made to see if the block is in the cache.  This file manages the
 * cache.
 *
 * The entry points into this file are:
 *   get_block:   request to fetch a block for reading or writing from cache
 *   put_block:   return a block previously requested with get_block
 *   alloc_zone:  allocate a new zone (to increase the length of a file)
 *   free_zone:   release a zone (when a file is removed)
 *   invalidate:  remove all the cache blocks on some device
 *
 * Private functions:
 *   read_block:  read or write a block from the disk itself
 */
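
/* Illustrative caller sketch (not lifted from any particular caller; the
 * MARKDIRTY() macro and FULL_DATA_BLOCK block type used below are assumptions
 * about the caller's side of the interface, not things defined in this file):
 *
 *     struct buf *bp = get_block(dev, b, NORMAL);   fetch, reading if needed
 *     ... modify bp->b_data ...                     edit the cached copy
 *     MARKDIRTY(bp);                                schedule for write-back
 *     put_block(bp, FULL_DATA_BLOCK);               hand back to the LRU chain
 */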

#include "fs.h"
#include <minix/u64.h>
#include <minix/bdev.h>
#include <sys/param.h>
#include <stdlib.h>
#include <assert.h>
#include <minix/libminixfs.h>
#include <math.h>
#include "buf.h"
#include "super.h"
#include "inode.h"

static void rm_lru(struct buf *bp);
static void read_block(struct buf *);

static int vmcache = 0; /* are we using vm's secondary cache? (initially not) */
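
/* Range of block numbers occupied by the superblock on the mounted device,
 * as computed in cache_resize() below; block_write_ok() refuses writes to
 * blocks in this range.
 */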
static block_t super_start = 0, super_end = 0;

/*===========================================================================*
 *                              get_block                                    *
 *===========================================================================*/
struct buf *get_block(
  register dev_t dev,           /* on which device is the block? */
  register block_t block,       /* which block is wanted? */
  int only_search               /* if NO_READ, don't read, else act normal */
)
{
/* Check to see if the requested block is in the block cache.  If so, return
 * a pointer to it.  If not, evict some other block and fetch it (unless
 * 'only_search' is 1).  All the blocks in the cache that are not in use
 * are linked together in a chain, with 'front' pointing to the least recently
 * used block and 'rear' to the most recently used block.  If 'only_search' is
 * 1, the block being requested will be overwritten in its entirety, so it is
 * only necessary to see if it is in the cache; if it is not, any free buffer
 * will do.  It is not necessary to actually read the block in from disk.
 * If 'only_search' is PREFETCH, the block need not be read from the disk,
 * and the device is not to be marked on the block, so callers can tell if
 * the block returned is valid.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */

  int b;
  static struct buf *bp, *prev_ptr;
  u64_t yieldid = VM_BLOCKID_NONE, getid = make64(dev, block);

  assert(buf_hash);
  assert(buf);
  assert(nr_bufs > 0);

  ASSERT(fs_block_size > 0);

  /* Search the hash chain for (dev, block). Do_read() can use
   * get_block(NO_DEV ...) to get an unnamed block to fill with zeros when
   * someone wants to read from a hole in a file, in which case this search
   * is skipped.
   */
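  /* BUFHASH() picks the hash bucket for a block number; per the comment at
   * the top of this function it keys on the low-order bits of the block
   * number, so nearby blocks tend to land on different chains.  Each chain
   * is singly linked through b_hash.
   */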
  if (dev != NO_DEV) {
      b = BUFHASH(block);
      bp = buf_hash[b];
      while (bp != NULL) {
          if (bp->b_blocknr == block && bp->b_dev == dev) {
              /* Block needed has been found. */
              if (bp->b_count == 0) rm_lru(bp);
              bp->b_count++;    /* record that block is in use */
              ASSERT(bp->b_bytes == fs_block_size);
              ASSERT(bp->b_dev == dev);
              ASSERT(bp->b_dev != NO_DEV);
              ASSERT(bp->bp);
              return(bp);
          } else {
              /* This block is not the one sought. */
              bp = bp->b_hash;  /* move to next block on hash chain */
          }
      }
  }

  /* Desired block is not on available chain.  Take oldest block ('front'). */
  if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);

  if (bp->b_bytes < fs_block_size) {
      ASSERT(!bp->bp);
      ASSERT(bp->b_bytes == 0);
      if (!(bp->bp = alloc_contig((size_t) fs_block_size, 0, NULL))) {
          printf("MFS: couldn't allocate a new block.\n");
          /* Fall back to any buffer that already has a full-size data area. */
          for (bp = front;
               bp && bp->b_bytes < fs_block_size; bp = bp->b_next)
              ;
          if (!bp) {
              panic("no buffer available");
          }
      } else {
          bp->b_bytes = fs_block_size;
      }
  }

  ASSERT(bp);
  ASSERT(bp->bp);
  ASSERT(bp->b_bytes == fs_block_size);
  ASSERT(bp->b_count == 0);

  rm_lru(bp);

  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->b_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
      buf_hash[b] = bp->b_hash;
  } else {
      /* The block just taken is not on the front of its hash chain. */
      while (prev_ptr->b_hash != NULL)
          if (prev_ptr->b_hash == bp) {
              prev_ptr->b_hash = bp->b_hash;    /* found it */
              break;
          } else {
              prev_ptr = prev_ptr->b_hash;      /* keep looking */
          }
  }

  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->b_dev != NO_DEV) {
      if (ISDIRTY(bp)) flushall(bp->b_dev);

      /* Are we throwing out a block that contained something?
       * Give it to VM for the second-layer cache.
       */
      yieldid = make64(bp->b_dev, bp->b_blocknr);
      assert(bp->b_bytes == fs_block_size);
      bp->b_dev = NO_DEV;
  }

  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);                /* NO_DEV blocks may be marked dirty */
  bp->b_dev = dev;              /* fill in device number */
  bp->b_blocknr = block;        /* fill in block number */
  bp->b_count++;                /* record that block is being used */
  b = BUFHASH(bp->b_blocknr);
  bp->b_hash = buf_hash[b];

  buf_hash[b] = bp;             /* add to hash list */

  if (dev == NO_DEV) {
      if (vmcache && cmp64(yieldid, VM_BLOCKID_NONE) != 0) {
          vm_yield_block_get_block(yieldid, VM_BLOCKID_NONE,
              bp->bp, fs_block_size);
      }
      return(bp);       /* If the caller wanted a NO_DEV block, work is done. */
  }

  /* Go get the requested block unless searching or prefetching. */
  if (only_search == PREFETCH || only_search == NORMAL) {
      /* Block is not found in our cache, but we do want it
       * if it's in the vm cache.
       */
      if (vmcache) {
          /* If we can satisfy the PREFETCH or NORMAL request
           * from the vm cache, work is done.
           */
          if (vm_yield_block_get_block(yieldid, getid,
              bp->bp, fs_block_size) == OK) {
              return bp;
          }
      }
  }

  if (only_search == PREFETCH) {
      /* PREFETCH: don't do i/o. */
      bp->b_dev = NO_DEV;
  } else if (only_search == NORMAL) {
      read_block(bp);
  } else if (only_search == NO_READ) {
      /* we want this block, but its contents
       * will be overwritten. VM has to forget
       * about it.
       */
      if (vmcache) {
          vm_forgetblock(getid);
      }
  } else
      panic("unexpected only_search value: %d", only_search);

  assert(bp->bp);

  return(bp);           /* return the newly acquired block */
}

/*===========================================================================*
 *                              put_block                                    *
 *===========================================================================*/
void put_block(bp, block_type)
register struct buf *bp;        /* pointer to the buffer to be released */
int block_type;                 /* INODE_BLOCK, DIRECTORY_BLOCK, or whatever */
{
/* Return a block to the list of available blocks.  Depending on 'block_type'
 * it may be put on the front or rear of the LRU chain.  Blocks that are
 * expected to be needed again shortly (e.g., partially full data blocks)
 * go on the rear; blocks that are unlikely to be needed again shortly
 * (e.g., full data blocks) go on the front.  Blocks whose loss can hurt
 * the integrity of the file system (e.g., inode blocks) are written to
 * disk immediately if they are dirty.
 */
  if (bp == NULL) return;       /* it is easier to check here than in caller */

  bp->b_count--;                /* there is one use fewer now */
  if (bp->b_count != 0) return; /* block is still in use */

  bufs_in_use--;                /* one fewer block buffers in use */

  /* Put this block back on the LRU chain.  If the ONE_SHOT bit is set in
   * 'block_type', the block is not likely to be needed again shortly, so put
   * it on the front of the LRU chain where it will be the first one to be
   * taken when a free buffer is needed later.
   */
  if (bp->b_dev == DEV_RAM || (block_type & ONE_SHOT)) {
      /* Block probably won't be needed quickly. Put it on front of chain.
       * It will be the next block to be evicted from the cache.
       */
      bp->b_prev = NULL;
      bp->b_next = front;
      if (front == NULL)
          rear = bp;            /* LRU chain was empty */
      else
          front->b_prev = bp;
      front = bp;
  } else {
      /* Block probably will be needed quickly.  Put it on rear of chain.
       * It will not be evicted from the cache for a long time.
       */
      bp->b_prev = rear;
      bp->b_next = NULL;
      if (rear == NULL)
          front = bp;
      else
          rear->b_next = bp;
      rear = bp;
  }
}

/*===========================================================================*
 *                              alloc_zone                                   *
 *===========================================================================*/
zone_t alloc_zone(
  dev_t dev,                    /* device where zone wanted */
  zone_t z                      /* try to allocate new zone near this one */
)
{
/* Allocate a new zone on the indicated device and return its number. */

  bit_t b, bit;
  struct super_block *sp;
  static int print_oos_msg = 1;

  /* Note that the routine alloc_bit() returns 1 for the lowest possible
   * zone, which corresponds to sp->s_firstdatazone.  To convert a value
   * between the bit number, 'b', used by alloc_bit() and the zone number, 'z',
   * stored in the inode, use the formula:
   *     z = b + sp->s_firstdatazone - 1
   * Alloc_bit() never returns 0, since this is used for NO_BIT (failure).
   */
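  /* Worked example with made-up numbers: if sp->s_firstdatazone is 4 and
   * alloc_bit() hands back b = 1 (the lowest possible bit), the zone number
   * stored in the inode is z = 1 + 4 - 1 = 4, i.e. the first data zone; b = 2
   * maps to zone 5, and so on.  The reverse mapping below computes
   * bit = z - (s_firstdatazone - 1) before calling alloc_bit().
   */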
  sp = get_super(dev);

  /* If z is 0, skip initial part of the map known to be fully in use. */
  if (z == sp->s_firstdatazone) {
      bit = sp->s_zsearch;
  } else {
      bit = (bit_t) (z - (sp->s_firstdatazone - 1));
  }
  b = alloc_bit(sp, ZMAP, bit);
  if (b == NO_BIT) {
      err_code = ENOSPC;
      if (print_oos_msg)
          printf("No space on device %d/%d\n", major(sp->s_dev),
              minor(sp->s_dev));
      print_oos_msg = 0;        /* Don't repeat message */
      return(NO_ZONE);
  }
  print_oos_msg = 1;
  if (z == sp->s_firstdatazone) sp->s_zsearch = b;      /* for next time */
  return( (zone_t) (sp->s_firstdatazone - 1) + (zone_t) b);
}

/*===========================================================================*
 *                              free_zone                                    *
 *===========================================================================*/
void free_zone(
  dev_t dev,                    /* device where zone located */
  zone_t numb                   /* zone to be returned */
)
{
/* Return a zone. */

  register struct super_block *sp;
  bit_t bit;

  /* Locate the appropriate super_block and return bit. */
  sp = get_super(dev);
  if (numb < sp->s_firstdatazone || numb >= sp->s_zones) return;
  bit = (bit_t) (numb - (zone_t) (sp->s_firstdatazone - 1));
  free_bit(sp, ZMAP, bit);
  if (bit < sp->s_zsearch) sp->s_zsearch = bit;
}

/*===========================================================================*
 *                              read_block                                   *
 *===========================================================================*/
static void read_block(bp)
register struct buf *bp;        /* buffer pointer */
{
/* Read or write a disk block. This is the only routine in which actual disk
 * I/O is invoked. If an error occurs, a message is printed here, but the error
 * is not reported to the caller. If the error occurred while purging a block
 * from the cache, it is not clear what the caller could do about it anyway.
 */
  int r, op_failed;
  u64_t pos;
  dev_t dev;

  op_failed = 0;

  if ( (dev = bp->b_dev) != NO_DEV) {
      pos = mul64u(bp->b_blocknr, fs_block_size);
      r = bdev_read(dev, pos, bp->b_data, fs_block_size,
          BDEV_NOFLAGS);
      if (r < 0) {
          printf("MFS(%d) I/O error on device %d/%d, block %u\n",
              SELF_E, major(dev), minor(dev), bp->b_blocknr);
          op_failed = 1;
      } else if (r != (ssize_t) fs_block_size) {
          r = END_OF_FILE;
          op_failed = 1;
      }

      if (op_failed) {
          bp->b_dev = NO_DEV;   /* invalidate block */

          /* Report read errors to interested parties. */
          rdwt_err = r;
      }
  }
}

/*===========================================================================*
 *                              invalidate                                   *
 *===========================================================================*/
void invalidate(
  dev_t device                  /* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
      if (bp->b_dev == device) bp->b_dev = NO_DEV;

  vm_forgetblocks();
}

/*===========================================================================*
 *                              block_write_ok                               *
 *===========================================================================*/
int block_write_ok(struct buf *bp)
{
  if (superblock.s_dev != bp->b_dev) return 1;

  if (bp->b_blocknr >= super_start && bp->b_blocknr <= super_end) {
      printf("MFS: blocking write to superblock on mounted filesystem dev 0x%x.\n", bp->b_dev);
      return 0;
  }

  if (superblock.s_rd_only) {
      printf("MFS: blocking write to mounted readonly filesystem 0x%x.\n", bp->b_dev);
      printf("This shouldn't happen.\n");
      return 0;
  }

  return 1;
}

/*===========================================================================*
 *                              flushall                                     *
 *===========================================================================*/
void flushall(
  dev_t dev                     /* device to flush */
)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static struct buf **dirty;    /* static so it isn't on stack */
  static unsigned int dirtylistsize = 0;
  int ndirty;

  if (dirtylistsize != nr_bufs) {
      if (dirtylistsize > 0) {
          assert(dirty != NULL);
          free(dirty);
      }
      if (!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
          panic("couldn't allocate dirty buf list");
      dirtylistsize = nr_bufs;
  }

  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
      if (ISDIRTY(bp) && bp->b_dev == dev) {
          if (!block_write_ok(bp)) {
              printf("MFS: LATE: ignoring changes in block %d\n", bp->b_blocknr);
              MARKCLEAN(bp);
              continue;
          }
          dirty[ndirty++] = bp;
      }
  }

  rw_scattered(dev, dirty, ndirty, WRITING);
}

/*===========================================================================*
 *                              rw_scattered                                 *
 *===========================================================================*/
void rw_scattered(
  dev_t dev,                    /* major-minor device number */
  struct buf **bufq,            /* pointer to array of buffers */
  int bufqsize,                 /* number of buffers */
  int rw_flag                   /* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  int gap;
  register int i;
  register iovec_t *iop;
  static iovec_t *iovec = NULL;
  u64_t pos;
  int j, r;

  STATICINIT(iovec, NR_IOREQS);

  /* (Shell) sort buffers on b_blocknr. */
  gap = 1;
  do
      gap = 3 * gap + 1;
  while (gap <= bufqsize);
  while (gap != 1) {
      gap /= 3;
      for (j = gap; j < bufqsize; j++) {
          for (i = j - gap;
               i >= 0 && bufq[i]->b_blocknr > bufq[i + gap]->b_blocknr;
               i -= gap) {
              bp = bufq[i];
              bufq[i] = bufq[i + gap];
              bufq[i + gap] = bp;
          }
      }
  }
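  /* The do-while above generates the gap sequence 1, 4, 13, 40, ... (3*gap+1)
   * until the gap exceeds the queue size; the outer loop then shrinks it back
   * down, so e.g. for bufqsize = 20 the sorting passes use gaps 13, 4 and 1.
   * Sorting by block number lets the loop below batch runs of consecutive
   * blocks into a single scatter/gather request.
   */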

  /* Set up I/O vector and do I/O.  The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
      for (j = 0, iop = iovec; j < NR_IOREQS && j < bufqsize; j++, iop++) {
          bp = bufq[j];
          if (bp->b_blocknr != (block_t) bufq[0]->b_blocknr + j) break;
          iop->iov_addr = (vir_bytes) bp->b_data;
          iop->iov_size = (vir_bytes) fs_block_size;
      }
      pos = mul64u(bufq[0]->b_blocknr, fs_block_size);
      if (rw_flag == READING)
          r = bdev_gather(dev, pos, iovec, j, BDEV_NOFLAGS);
      else
          r = bdev_scatter(dev, pos, iovec, j, BDEV_NOFLAGS);

      /* Harvest the results.  The driver may have returned an error, or it
       * may have done less than what we asked for.
       */
      if (r < 0) {
          printf("MFS: I/O error %d on device %d/%d, block %u\n",
              r, major(dev), minor(dev), bufq[0]->b_blocknr);
      }
      for (i = 0; i < j; i++) {
          bp = bufq[i];
          if (r < (ssize_t) fs_block_size) {
              /* Transfer failed. */
              if (i == 0) {
                  bp->b_dev = NO_DEV;   /* Invalidate block */
                  vm_forgetblocks();
              }
              break;
          }
          if (rw_flag == READING) {
              bp->b_dev = dev;          /* validate block */
              put_block(bp, PARTIAL_DATA_BLOCK);
          } else {
              MARKCLEAN(bp);
          }
          r -= fs_block_size;
      }
      bufq += i;
      bufqsize -= i;
      if (rw_flag == READING) {
          /* Don't bother reading more than the device is willing to
           * give at this time.  Don't forget to release those extras.
           */
          while (bufqsize > 0) {
              put_block(*bufq++, PARTIAL_DATA_BLOCK);
              bufqsize--;
          }
      }
      if (rw_flag == WRITING && i == 0) {
          /* We're not making progress, this means we might keep
           * looping. Buffers remain dirty if un-written. Buffers are
           * lost if invalidate()d or LRU-removed while dirty. This
           * is better than keeping unwritable blocks around forever.
           */
          break;
      }
  }
}

/*===========================================================================*
 *                              rm_lru                                       *
 *===========================================================================*/
static void rm_lru(bp)
struct buf *bp;
{
/* Remove a block from its LRU chain. */
  struct buf *next_ptr, *prev_ptr;

  bufs_in_use++;
  next_ptr = bp->b_next;        /* successor on LRU chain */
  prev_ptr = bp->b_prev;        /* predecessor on LRU chain */
  if (prev_ptr != NULL)
      prev_ptr->b_next = next_ptr;
  else
      front = next_ptr;         /* this block was at front of chain */

  if (next_ptr != NULL)
      next_ptr->b_prev = prev_ptr;
  else
      rear = prev_ptr;          /* this block was at rear of chain */
}

/*===========================================================================*
 *                              cache_resize                                 *
 *===========================================================================*/
static void cache_resize(unsigned int blocksize, unsigned int bufs)
{
  struct buf *bp;
  struct inode *rip;

#define MINBUFS 10
  assert(blocksize > 0);
  assert(bufs >= MINBUFS);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
      if (bp->b_count != 0) panic("change blocksize with buffer in use");

  for (rip = &inode[0]; rip < &inode[NR_INODES]; rip++)
      if (rip->i_count > 0) panic("change blocksize with inode in use");

  buf_pool(bufs);

  fs_block_size = blocksize;
  super_start = SUPER_BLOCK_BYTES / fs_block_size;
  super_end = (SUPER_BLOCK_BYTES + _MIN_BLOCK_SIZE - 1) / fs_block_size;
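
  /* Rough example, assuming the usual MINIX layout where the superblock lives
   * in the second kilobyte of the device (SUPER_BLOCK_BYTES and
   * _MIN_BLOCK_SIZE both 1024): with a 1024-byte block size this protects
   * block 1 only (super_start = super_end = 1), while with a 4096-byte block
   * size both work out to 0, so block 0 is the one block_write_ok() refuses
   * to overwrite.
   */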
}

/*===========================================================================*
 *                              bufs_heuristic                               *
 *===========================================================================*/
static int bufs_heuristic(struct super_block *sp)
{
  u32_t btotal, bfree, bused;

  blockstats(&btotal, &bfree, &bused);

  return fs_bufs_heuristic(MINBUFS, btotal, bfree,
      sp->s_block_size, major(sp->s_dev));
}

/*===========================================================================*
 *                              set_blocksize                                *
 *===========================================================================*/
void set_blocksize(struct super_block *sp)
{
  int bufs;

  cache_resize(sp->s_block_size, MINBUFS);
  bufs = bufs_heuristic(sp);
  cache_resize(sp->s_block_size, bufs);

  /* Decide whether to use the secondary cache or not.
   * Only do this if
   *    - it's available, and
   *    - use of it hasn't been disabled for this fs, and
   *    - our main FS device isn't a memory device
   */
  vmcache = 0;
  if (vm_forgetblock(VM_BLOCKID_NONE) != ENOSYS &&
      may_use_vmcache && major(sp->s_dev) != MEMORY_MAJOR) {
      vmcache = 1;
  }
}

/*===========================================================================*
 *                              buf_pool                                     *
 *===========================================================================*/
void buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);

  if (nr_bufs > 0) {
      assert(buf);
      (void) fs_sync();
      for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
          if (bp->bp) {
              assert(bp->b_bytes > 0);
              free_contig(bp->bp, bp->b_bytes);
          }
      }
  }

  if (buf)
      free(buf);

  if (!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
      panic("couldn't allocate buf list (%d)", new_nr_bufs);

  if (buf_hash)
      free(buf_hash);
  if (!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
      panic("couldn't allocate buf hash list (%d)", new_nr_bufs);

  nr_bufs = new_nr_bufs;

  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
      bp->b_blocknr = NO_BLOCK;
      bp->b_dev = NO_DEV;
      bp->b_next = bp + 1;
      bp->b_prev = bp - 1;
      bp->bp = NULL;
      bp->b_bytes = 0;
  }
  front->b_prev = NULL;
  rear->b_next = NULL;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->b_hash = bp->b_next;
  buf_hash[0] = front;
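
  /* At this point every buffer sits on the LRU chain and, by copying b_next
   * into b_hash, that same chain doubles as the hash chain of bucket 0; all
   * other buckets start out empty.  Since every buffer has b_dev == NO_DEV,
   * nothing can be found by a lookup yet; buffers migrate to their proper
   * hash bucket as get_block() claims and fills them in.
   */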

  vm_forgetblocks();
}