#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include <machine/vmparam.h>

#include <sys/param.h>
#include <sys/mman.h>

#include <minix/dmap.h>
#include <minix/libminixfs.h>
#include <minix/syslib.h>
#include <minix/sysutil.h>
#include <minix/u64.h>
#include <minix/bdev.h>
#include <minix/bitmap.h>
/* Buffer (block) cache.  To acquire a block, a routine calls lmfs_get_block(),
 * telling which block it wants.  The block is then regarded as "in use" and
 * has its reference count incremented.  All the blocks that are not in use are
 * chained together in an LRU list, with 'front' pointing to the least recently
 * used block, and 'rear' to the most recently used block.  A reverse chain is
 * also maintained.  Usage for LRU is measured by the time the put_block() is
 * done.  The second parameter to put_block() can violate the LRU order and put
 * a block on the front of the list, if it will probably not be needed again.
 * This is used internally only; the lmfs_put_block() API call has no second
 * parameter.  If a block is modified, the modifying routine must mark the
 * block as dirty, so the block will eventually be rewritten to the disk.
 */

/* Flags to put_block(). */
#define ONE_SHOT	0x1	/* set if block will not be needed again */

#define BUFHASH(b)	((unsigned int)((b) % nr_bufs))
#define MARKCLEAN	lmfs_markclean

#define MINBUFS		6	/* minimum number of bufs for sanity check */
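/* Illustrative sketch (not part of the cache itself): this is roughly how a
 * file server using libminixfs is expected to drive the calls described
 * above for a read-modify-write of one block.  The helper name and the way
 * the block is modified are made up for the example.
 */
#if 0
static int example_update_block(dev_t dev, block64_t blocknr)
{
  struct buf *bp;
  int r;

  /* Acquire the block; NORMAL means "read it from disk if not cached". */
  if ((r = lmfs_get_block(&bp, dev, blocknr, NORMAL)) != OK)
	return r;

  memset(bp->data, 0, lmfs_fs_block_size());	/* modify the contents */

  lmfs_markdirty(bp);	/* must be marked dirty to be written back later */
  lmfs_put_block(bp);	/* release; the block goes to the rear of the LRU */

  return OK;
}
#endif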
static struct buf *front;	/* points to least recently used free block */
static struct buf *rear;	/* points to most recently used free block */
static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/

static void rm_lru(struct buf *bp);
static int read_block(struct buf *bp, size_t size);
static void freeblock(struct buf *bp);
static void cache_heuristic_check(void);
static void put_block(struct buf *bp, int put_flags);

static int vmcache = 0; /* are we using vm's secondary cache? (initially not) */

static struct buf *buf;
static struct buf **buf_hash;	/* the buffer hash table */
static unsigned int nr_bufs;
static int may_use_vmcache;

static size_t fs_block_size = PAGE_SIZE;	/* raw i/o block size */

static fsblkcnt_t fs_btotal = 0, fs_bused = 0;

static int quiet = 0;	/* suppress diagnostics (see lmfs_setquiet) */

typedef struct buf *noxfer_buf_ptr_t; /* annotation for temporary buf ptrs */
void lmfs_setquiet(int q) { quiet = q; }

static int fs_bufs_heuristic(int minbufs, fsblkcnt_t btotal,
	fsblkcnt_t bused, int blocksize)
{
  struct vm_stats_info vsi;
  int bufs;
  u32_t kbytes_used_fs, kbytes_total_fs, kbcache, kb_fsmax;
  u32_t kbytes_remain_mem;

  /* set a reasonable cache size; cache at most a certain
   * portion of the used FS, and at most a certain %age of remaining
   * memory
   */
  if(vm_info_stats(&vsi) != OK) {
	bufs = 1024;
	if(!quiet)
		printf("fslib: heuristic info fail: default to %d bufs\n", bufs);
	return bufs;
  }
  /* remaining free memory is unused memory plus memory in use for cache,
   * as the cache can be evicted
   */
  kbytes_remain_mem = (u64_t)(vsi.vsi_free + vsi.vsi_cached) *
	vsi.vsi_pagesize / 1024;

  kbytes_used_fs  = (unsigned long)(((u64_t)bused * blocksize) / 1024);
  kbytes_total_fs = (unsigned long)(((u64_t)btotal * blocksize) / 1024);

  /* heuristic for a desired cache size based on FS usage;
   * but never bigger than half of the total filesystem
   */
  kb_fsmax = sqrt_approx(kbytes_used_fs)*40;
  kb_fsmax = MIN(kb_fsmax, kbytes_total_fs/2);

  /* heuristic for a maximum usage - 10% of remaining memory */
  kbcache = MIN(kbytes_remain_mem/10, kb_fsmax);
  bufs = kbcache * 1024 / blocksize;

  /* but we simply need MINBUFS no matter what */
  if(bufs < minbufs)
	bufs = minbufs;

  return bufs;
}
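/* A rough worked example of the heuristic above (the numbers are illustrative
 * only, and sqrt_approx() is rounded here): with a 2 GiB file system of which
 * 512 MiB is in use, 256 MiB of free+cached memory, and 4 kB blocks:
 *
 *   kbytes_used_fs       = 524288 kB, kbytes_total_fs = 2097152 kB
 *   kb_fsmax             = sqrt_approx(524288) * 40 ~= 724 * 40 = 28960 kB,
 *                          well under kbytes_total_fs/2
 *   kbytes_remain_mem/10 = 26214 kB, so kbcache = 26214 kB
 *   bufs                 = 26214 * 1024 / 4096 ~= 6553 buffers
 */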
void lmfs_change_blockusage(int delta)
{
  /* Change the number of allocated blocks by 'delta.'
   * Also accumulate the delta since the last cache re-evaluation.
   * If it is outside a certain band, ask the cache library to
   * re-evaluate the cache size.
   */
  static int bitdelta = 0, warn_low = TRUE, warn_high = TRUE;
  /* Adjust the file system block usage counter accordingly. Do bounds
   * checking, and report file system misbehavior.
   */
  if (delta > 0 && (fsblkcnt_t)delta > fs_btotal - fs_bused) {
	if (warn_high) {
		printf("libminixfs: block usage overflow\n");
		warn_high = FALSE;
	}
	delta = (int)(fs_btotal - fs_bused);
  } else if (delta < 0 && (fsblkcnt_t)-delta > fs_bused) {
	if (warn_low) {
		printf("libminixfs: block usage underflow\n");
		warn_low = FALSE;
	}
	delta = -(int)fs_bused;
  }
  fs_bused += delta;

  bitdelta += delta;

#define BAND_KB (10*1024)	/* recheck cache every 10MB change */
  /* If the accumulated delta exceeds the configured threshold, resize
   * the cache, but only if the cache isn't in use any more. In order to
   * avoid that the latter case blocks a resize forever, we also call
   * this function from lmfs_flushall(). Since lmfs_buf_pool() may call
   * lmfs_flushall(), reset 'bitdelta' before doing the heuristics check.
   */
  if (bufs_in_use == 0 &&
	(bitdelta*(int)fs_block_size/1024 > BAND_KB ||
	bitdelta*(int)fs_block_size/1024 < -BAND_KB)) {
	bitdelta = 0;
	cache_heuristic_check();
  }
}
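/* Illustrative sketch of how a file system is expected to feed these usage
 * counters (the helper names are made up): report the totals once at mount
 * time, and report every allocation and free as it happens.
 */
#if 0
void example_mount(fsblkcnt_t zones_total, fsblkcnt_t zones_used)
{
  lmfs_set_blockusage(zones_total, zones_used);
}

void example_alloc_zone(void)
{
  lmfs_change_blockusage(+1);	/* one more block in use */
}

void example_free_zone(void)
{
  lmfs_change_blockusage(-1);	/* one block returned */
}
#endif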
void lmfs_markdirty(struct buf *bp)
{
  bp->lmfs_flags |= VMMC_DIRTY;
}

void lmfs_markclean(struct buf *bp)
{
  bp->lmfs_flags &= ~VMMC_DIRTY;
}

int lmfs_isclean(struct buf *bp)
{
  return !(bp->lmfs_flags & VMMC_DIRTY);
}
static void free_unused_blocks(void)
{
  struct buf *bp;
  int freed = 0, bytes = 0;

  printf("libminixfs: freeing; %d blocks in use\n", bufs_in_use);
  for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	if(bp->lmfs_bytes > 0 && bp->lmfs_count == 0) {
		freed++;
		bytes += bp->lmfs_bytes;
		freeblock(bp);
	}
  }
  printf("libminixfs: freeing; %d blocks, %d bytes\n", freed, bytes);
}
static void lmfs_alloc_block(struct buf *bp, size_t block_size)
{
  ASSERT(bp->lmfs_bytes == 0);

  if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
	MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
	free_unused_blocks();
	if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE,
		MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) {
		panic("libminixfs: could not allocate block");
	}
  }

  bp->lmfs_bytes = block_size;
  bp->lmfs_needsetcache = 1;
}
/*===========================================================================*
 *				lmfs_get_block				     *
 *===========================================================================*/
int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how)
{
  return lmfs_get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0);
}
static void munmap_t(void *a, int len)
{
  assert(a != MAP_FAILED);
  assert(!((vir_bytes)a % PAGE_SIZE));

  len = roundup(len, PAGE_SIZE);

  assert(!(len % PAGE_SIZE));

  if(munmap(a, len) < 0)
	panic("libminixfs cache: munmap failed");
}
static void raisecount(struct buf *bp)
{
  ASSERT(bp->lmfs_count < CHAR_MAX);
  bp->lmfs_count++;
  if(bp->lmfs_count == 1) bufs_in_use++;
  assert(bufs_in_use > 0);
}

static void lowercount(struct buf *bp)
{
  assert(bufs_in_use > 0);
  ASSERT(bp->lmfs_count > 0);
  bp->lmfs_count--;
  if(bp->lmfs_count == 0) bufs_in_use--;
}
static void freeblock(struct buf *bp)
{
  ASSERT(bp->lmfs_count == 0);
  /* If the block taken is dirty, make it clean by writing it to the disk.
   * Avoid hysteresis by flushing all other dirty blocks for the same device.
   */
  if (bp->lmfs_dev != NO_DEV) {
	if (!lmfs_isclean(bp)) lmfs_flushdev(bp->lmfs_dev);
	assert(bp->lmfs_bytes > 0);
	bp->lmfs_dev = NO_DEV;
  }

  /* Fill in block's parameters and add it to the hash chain where it goes. */
  MARKCLEAN(bp);		/* NO_DEV blocks may be marked dirty */
  if(bp->lmfs_bytes > 0) {
	munmap_t(bp->data, bp->lmfs_bytes);
	bp->data = NULL;
	bp->lmfs_bytes = 0;
  } else assert(!bp->data);
}
/*===========================================================================*
 *				find_block				     *
 *===========================================================================*/
static struct buf *find_block(dev_t dev, block64_t block)
{
/* Search the hash chain for (dev, block). Return the buffer structure if
 * found, or NULL otherwise.
 */
  struct buf *bp;
  unsigned int b;

  assert(dev != NO_DEV);

  b = BUFHASH(block);
  for (bp = buf_hash[b]; bp != NULL; bp = bp->lmfs_hash)
	if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev)
		return bp;

  return NULL;
}
/*===========================================================================*
 *				get_block_ino				     *
 *===========================================================================*/
static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
	ino_t ino, u64_t ino_off, size_t block_size)
{
/* Check to see if the requested block is in the block cache. The requested
 * block is identified by the block number in 'block' on device 'dev', counted
 * in the file system block size. The amount of data requested for this block
 * is given in 'block_size', which may be less than the file system block size
 * iff the requested block is the last (partial) block on a device. Note that
 * the given block size does *not* affect the conversion of 'block' to a byte
 * offset! Either way, if the block could be obtained, either from the cache
 * or by reading from the device, return OK, with a pointer to the buffer
 * structure stored in 'bpp'. If not, return a negative error code (and no
 * buffer). If necessary, evict some other block and fetch the contents from
 * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to
 * overwrite the requested block in its entirety, so it is only necessary to
 * see if it is in the cache; if it is not, any free buffer will do. If 'how'
 * is PEEK, the function returns the block if it is in the cache or the VM
 * cache, and an ENOENT error code otherwise.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
  int b, r;
  static struct buf *bp;
  u64_t dev_off;
  struct buf *prev_ptr;

  ASSERT(fs_block_size > 0);

  assert(dev != NO_DEV);

  assert(block <= UINT64_MAX / fs_block_size);

  dev_off = block * fs_block_size;

  if((ino_off % fs_block_size)) {
	printf("cache: unaligned lmfs_get_block_ino ino_off %llu\n",
		ino_off);
  }
  /* See if the block is in the cache. If so, we can return it right away. */
  bp = find_block(dev, block);
  if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) {
	ASSERT(bp->lmfs_dev == dev);
	ASSERT(bp->lmfs_dev != NO_DEV);

	/* The block must have exactly the requested number of bytes. */
	if (bp->lmfs_bytes != block_size)
		return EIO;

	/* Block needed has been found. */
	if (bp->lmfs_count == 0) {
		rm_lru(bp);
		ASSERT(bp->lmfs_needsetcache == 0);
		ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
		/* FIXME: race condition against the VMMC_EVICTED check */
		bp->lmfs_flags |= VMMC_BLOCK_LOCKED;
	}
	raisecount(bp);
	ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED);

	if(ino != VMC_NO_INODE) {
		if(bp->lmfs_inode == VMC_NO_INODE
		|| bp->lmfs_inode != ino
		|| bp->lmfs_inode_offset != ino_off) {
			bp->lmfs_inode = ino;
			bp->lmfs_inode_offset = ino_off;
			bp->lmfs_needsetcache = 1;
		}
	}

	*bpp = bp;
	return OK;
  } else if (bp != NULL) {
	/* We had the block in the cache but VM evicted it; invalidate it. */
	assert(bp->lmfs_flags & VMMC_EVICTED);
	ASSERT(bp->lmfs_count == 0);
	ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
	ASSERT(!(bp->lmfs_flags & VMMC_DIRTY));
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_bytes = 0;
	bp->data = NULL;
  }
  /* Desired block is not on available chain. Find a free block to use. */
  if(bp) {
	ASSERT(bp->lmfs_flags & VMMC_EVICTED);
  } else {
	if ((bp = front) == NULL) panic("all buffers in use: %d", nr_bufs);
  }
  /* Remove the block that was just taken from its hash chain. */
  b = BUFHASH(bp->lmfs_blocknr);
  prev_ptr = buf_hash[b];
  if (prev_ptr == bp) {
	buf_hash[b] = bp->lmfs_hash;
  } else {
	/* The block just taken is not on the front of its hash chain. */
	while (prev_ptr->lmfs_hash != NULL)
		if (prev_ptr->lmfs_hash == bp) {
			prev_ptr->lmfs_hash = bp->lmfs_hash;	/* found it */
			break;
		} else {
			prev_ptr = prev_ptr->lmfs_hash;	/* keep looking */
		}
  }
  bp->lmfs_inode = ino;
  bp->lmfs_inode_offset = ino_off;

  bp->lmfs_flags = VMMC_BLOCK_LOCKED;
  bp->lmfs_needsetcache = 0;
  bp->lmfs_dev = dev;		/* fill in device number */
  bp->lmfs_blocknr = block;	/* fill in block number */
  ASSERT(bp->lmfs_count == 0);

  b = BUFHASH(bp->lmfs_blocknr);
  bp->lmfs_hash = buf_hash[b];

  buf_hash[b] = bp;	/* add to hash list */

  assert(dev != NO_DEV);
  /* The block is not found in our cache, but we do want it if it's in the VM
   * cache. The exception is NO_READ, purely for context switching performance
   * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being
   * prefetched, and 3) blocks about to be fully overwritten. In the first two
   * cases, VM will not have the block in its cache anyway, and for the third
   * we save on one VM call only if the block is in the VM cache.
   */
  assert(!bp->lmfs_bytes);
  if (how != NO_READ && vmcache) {
	if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off,
	    &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) {
		bp->lmfs_bytes = block_size;
		ASSERT(!bp->lmfs_needsetcache);
		*bpp = bp;
		return OK;
	}
  }
  /* The block is not in the cache, and VM does not know about it. If we were
   * requested to search for the block only, we can now return failure to the
   * caller. Return the block to the pool without allocating data pages, since
   * these would be freed upon recycling the block anyway.
   */
  if (how == PEEK) {
	bp->lmfs_dev = NO_DEV;

	put_block(bp, ONE_SHOT);

	return ENOENT;
  }

  /* Not in the cache; reserve memory for its contents. */
  lmfs_alloc_block(bp, block_size);

  if (how == NORMAL) {
	/* Try to read the block. Return an error code on failure. */
	if ((r = read_block(bp, block_size)) != OK) {
		put_block(bp, 0);

		return r;
	}
  } else if(how == NO_READ) {
	/* This block will be overwritten by new contents. */
  } else
	panic("unexpected 'how' value: %d", how);

  *bpp = bp;		/* return the newly acquired block */

  return OK;
}
/*===========================================================================*
 *				lmfs_get_block_ino			     *
 *===========================================================================*/
int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
	ino_t ino, u64_t ino_off)
{
  return get_block_ino(bpp, dev, block, how, ino, ino_off, fs_block_size);
}
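/* Illustrative sketch (hypothetical caller, not part of this library): a file
 * system that maps file pages typically first PEEKs for a block, so that a
 * cached or VM-cached copy is reused, and only falls back to a NORMAL get,
 * which reads from the device, when the peek returns ENOENT.
 */
#if 0
static int example_get_file_block(dev_t dev, block64_t blocknr,
	ino_t ino, u64_t file_off, struct buf **bpp)
{
  int r;

  r = lmfs_get_block_ino(bpp, dev, blocknr, PEEK, ino, file_off);
  if (r == ENOENT)
	r = lmfs_get_block_ino(bpp, dev, blocknr, NORMAL, ino, file_off);
  return r;
}
#endif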
/*===========================================================================*
 *				lmfs_get_partial_block			     *
 *===========================================================================*/
int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block,
	int how, size_t block_size)
{
  return get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0, block_size);
}
/*===========================================================================*
 *				put_block				     *
 *===========================================================================*/
static void put_block(struct buf *bp, int put_flags)
{
/* Return a block to the list of available blocks.   Depending on 'put_flags'
 * it may be put on the front or rear of the LRU chain.  Blocks that are
 * expected to be needed again at some point go on the rear; blocks that are
 * unlikely to be needed again at all go on the front.
 */
  dev_t dev;
  u64_t dev_off;
  int r, setflags;

  assert(bp != NULL);

  dev = bp->lmfs_dev;

  dev_off = bp->lmfs_blocknr * fs_block_size;

  lowercount(bp);
  if (bp->lmfs_count != 0) return;	/* block is still in use */

  /* Put this block back on the LRU chain.  */
  if (dev == NO_DEV || dev == DEV_RAM || (put_flags & ONE_SHOT)) {
	/* Block will not be needed again. Put it on front of chain.
	 * It will be the next block to be evicted from the cache.
	 */
	bp->lmfs_prev = NULL;
	bp->lmfs_next = front;
	if (front == NULL)
		rear = bp;	/* LRU chain was empty */
	else
		front->lmfs_prev = bp;
	front = bp;
  } else {
	/* Block may be needed again.  Put it on rear of chain.
	 * It will not be evicted from the cache for a long time.
	 */
	bp->lmfs_prev = rear;
	bp->lmfs_next = NULL;
	if (rear == NULL)
		front = bp;
	else
		rear->lmfs_next = bp;
	rear = bp;
  }

  assert(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
  bp->lmfs_flags &= ~VMMC_BLOCK_LOCKED;
  /* block has sensible content - if necessary, identify it to VM */
  if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
	setflags = (put_flags & ONE_SHOT) ? VMSF_ONCE : 0;

	if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
	    bp->lmfs_inode_offset, &bp->lmfs_flags,
	    roundup(bp->lmfs_bytes, PAGE_SIZE), setflags)) != OK) {
		if (r == ENOSYS) {
			printf("libminixfs: ENOSYS, disabling VM calls\n");
			vmcache = 0;
		} else if (r == ENOMEM) {
			/* Do not panic in this case. Running out of memory is
			 * bad, especially since it may lead to applications
			 * crashing when trying to access memory-mapped pages
			 * we haven't been able to pass off to the VM cache,
			 * but the entire file system crashing is always worse.
			 */
			printf("libminixfs: no memory for cache block!\n");
		} else {
			panic("libminixfs: setblock of %p dev 0x%llx off "
				"0x%llx failed\n", bp->data, dev, dev_off);
		}
	}
  }
  bp->lmfs_needsetcache = 0;

  /* Now that we (may) have given the block to VM, invalidate the block if it
   * is a one-shot block. Otherwise, it may still be reobtained immediately
   * after, which could be a problem if VM already forgot the block and we are
   * expected to pass it to VM again, which then wouldn't happen.
   */
  if (put_flags & ONE_SHOT)
	bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
 *				lmfs_put_block				     *
 *===========================================================================*/
void lmfs_put_block(struct buf *bp)
{
/* User interface to put_block(). */

  if (bp == NULL) return;	/* for poorly written file systems */

  put_block(bp, 0);
}
/*===========================================================================*
 *				lmfs_free_block				     *
 *===========================================================================*/
void lmfs_free_block(dev_t dev, block64_t block)
{
/* The file system has just freed the given block. The block may previously
 * have been in use as data block for an inode. Therefore, we now need to tell
 * VM that the block is no longer associated with an inode. If we fail to do so
 * and the inode now has a hole at this location, mapping in the hole would
 * yield the old block contents rather than a zeroed page. In addition, if the
 * block is in the cache, it will be removed, even if it was dirty.
 */
  struct buf *bp;
  int r;

  /* Tell VM to forget about the block. The primary purpose of this call is to
   * break the inode association, but since the block is part of a mounted file
   * system, it is not expected to be accessed directly anyway. So, save some
   * cache memory by throwing it out of the VM cache altogether.
   */
  if (vmcache) {
	if ((r = vm_forget_cacheblock(dev, block * fs_block_size,
	    fs_block_size)) != OK)
		printf("libminixfs: vm_forget_cacheblock failed (%d)\n", r);
  }

  if ((bp = find_block(dev, block)) != NULL) {
	/* Invalidate the block. The block may or may not be in use right now,
	 * so don't be smart about freeing memory or repositioning in the LRU.
	 */
	bp->lmfs_dev = NO_DEV;
  }

  /* Note that this is *not* the right place to implement TRIM support. Even
   * though the block is freed, on the device it may still be part of a
   * previous checkpoint or snapshot of some sort. Only the file system can
   * be trusted to decide which blocks can be reused on the device!
   */
}
/*===========================================================================*
 *				lmfs_zero_block_ino			     *
 *===========================================================================*/
void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
{
/* Files may have holes. From an application perspective, these are just file
 * regions filled with zeroes. From a file system perspective however, holes
 * may represent unallocated regions on disk. Thus, these holes do not have
 * corresponding blocks on the disk, and therefore also no block number.
 * Therefore, we cannot simply use lmfs_get_block_ino() for them. For reads,
 * this is not a problem, since the file system can just zero out the target
 * application buffer instead. For mapped pages however, this *is* a problem,
 * since the VM cache needs to be told about the corresponding block, and VM
 * does not accept blocks without a device offset. The role of this function is
 * therefore to tell VM about the hole using a fake device offset. The device
 * offsets are picked so that the VM cache will see a block memory-mapped for
 * the hole in the file, while the same block is not visible when
 * memory-mapping the block device.
 */
  struct buf *bp;
  static block64_t fake_block = 0;
  int r;

  assert(fs_block_size > 0);

  /* Pick a block number which is above the threshold of what can possibly be
   * mapped in by mmap'ing the device, since off_t is signed, and it is safe to
   * say that it will take a while before we have 8-exabyte devices. Pick a
   * different block number each time to avoid possible concurrency issues.
   * FIXME: it does not seem like VM actually verifies mmap offsets though..
   */
  if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size)
	fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size;

  /* Obtain a block. */
  if ((r = lmfs_get_block_ino(&bp, dev, fake_block, NO_READ, ino,
	ino_off)) != OK)
	panic("libminixfs: getting a NO_READ block failed: %d", r);

  assert(bp->lmfs_dev != NO_DEV);

  /* The block is already zeroed, as it has just been allocated with mmap. File
   * systems do not rely on this assumption yet, so if VM ever gets changed to
   * not clear the blocks we allocate (e.g., by recycling pages in the VM cache
   * for the same process, which would be safe), we need to add a memset here.
   */

  /* Release the block. We don't expect it to be accessed ever again. Moreover,
   * if we keep the block around in the VM cache, it may erroneously be mapped
   * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM.
   * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
   * all such pages in its cache.
   */
  put_block(bp, ONE_SHOT);
}
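/* Worked example of the fake block numbers above, assuming 4 kB blocks: the
 * first fake block is ((uint64_t)INT64_MAX + 1) / 4096 = 2^63 / 2^12 = 2^51
 * = 2251799813685248. Multiplying back by the block size gives a device
 * offset of 2^63 bytes, just past the largest positive off_t, so no mmap of
 * the block device itself can ever reach it.
 */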
void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused)
{
  assert(bused <= btotal);
  fs_btotal = btotal;
  fs_bused = bused;

  /* if the cache isn't in use, we could resize it. */
  if (bufs_in_use == 0)
	cache_heuristic_check();
}
/*===========================================================================*
 *				read_block				     *
 *===========================================================================*/
static int read_block(struct buf *bp, size_t block_size)
{
/* Read a disk block of 'block_size' bytes. The given size is always the FS
 * block size, except for the last block of a device. If an I/O error occurs,
 * invalidate the block and return an error code.
 */
  ssize_t r;
  off_t pos;
  dev_t dev = bp->lmfs_dev;

  assert(dev != NO_DEV);

  ASSERT(bp->lmfs_bytes == block_size);
  ASSERT(fs_block_size > 0);

  pos = (off_t)bp->lmfs_blocknr * fs_block_size;
  if (block_size > PAGE_SIZE) {
#define MAXPAGES 20
	vir_bytes blockrem, vaddr = (vir_bytes) bp->data;
	int p = 0;
	static iovec_t iovec[MAXPAGES];
	blockrem = block_size;
	while(blockrem > 0) {
		vir_bytes chunk = blockrem >= PAGE_SIZE ? PAGE_SIZE : blockrem;
		iovec[p].iov_addr = vaddr;
		iovec[p].iov_size = chunk;
		vaddr += chunk;
		blockrem -= chunk;
		p++;
	}
	r = bdev_gather(dev, pos, iovec, p, BDEV_NOFLAGS);
  } else {
	r = bdev_read(dev, pos, bp->data, block_size, BDEV_NOFLAGS);
  }
  if (r != (ssize_t)block_size) {
	/* Aesthetics: do not report EOF errors on superblock reads, because
	 * this is a fairly common occurrence, e.g. during system installation.
	 */
	if (bp->lmfs_blocknr != 0 /*first block*/ || r != 0 /*EOF*/)
		printf("fs cache: I/O error on device %d/%d, block %"PRIu64
		    " (%zd)\n", major(dev), minor(dev), bp->lmfs_blocknr, r);

	r = EIO; /* TODO: retry retrieving (just) the remaining part */

	bp->lmfs_dev = NO_DEV;	/* invalidate block */

	return r;
  }

  return OK;
}
/*===========================================================================*
 *				lmfs_invalidate				     *
 *===========================================================================*/
void lmfs_invalidate(
  dev_t device			/* device whose blocks are to be purged */
)
{
/* Remove all the blocks belonging to some device from the cache. */

  register struct buf *bp;

  assert(device != NO_DEV);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	if (bp->lmfs_dev == device) {
		assert(bp->lmfs_bytes > 0);
		munmap_t(bp->data, bp->lmfs_bytes);
		bp->lmfs_dev = NO_DEV;
		bp->data = NULL;
		bp->lmfs_bytes = 0;
	}
  }

  /* Clear the cache even if VM caching is disabled for the file system:
   * caching may be disabled as side effect of an error, leaving blocks behind
   * in the actual VM cache.
   */
  vm_clear_cache(device);
}
/*===========================================================================*
 *				sort_blocks				     *
 *===========================================================================*/
static void sort_blocks(struct buf **bufq, unsigned int bufqsize)
{
/* (Shell) sort the buffer array on block number, ascending. */
  struct buf *bp;
  int i, j, gap;

  gap = 1;
  do
	gap = 3 * gap + 1;
  while ((unsigned int)gap <= bufqsize);

  while (gap != 1) {
	gap /= 3;
	for (j = gap; (unsigned int)j < bufqsize; j++) {
		for (i = j - gap; i >= 0 &&
		    bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr;
		    i -= gap) {
			bp = bufq[i];
			bufq[i] = bufq[i + gap];
			bufq[i + gap] = bp;
		}
	}
  }
}
/*===========================================================================*
 *				rw_scattered				     *
 *===========================================================================*/
static void rw_scattered(
  dev_t dev,			/* major-minor device number */
  struct buf **bufq,		/* pointer to array of buffers */
  unsigned int bufqsize,	/* number of buffers */
  int rw_flag			/* READING or WRITING */
)
{
/* Read or write scattered data from a device. */

  register struct buf *bp;
  register iovec_t *iop;
  static iovec_t iovec[NR_IOREQS];
  off_t pos;
  unsigned int i, iov_per_block;
  int r;
#if !defined(NDEBUG)
  unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize;
#endif /* !defined(NDEBUG) */
  if(bufqsize == 0) return;

#if !defined(NDEBUG)
  /* for READING, check all buffers on the list are obtained and held
   * (count > 0)
   */
  if (rw_flag == READING) {
	assert(bufqsize <= LMFS_MAX_PREFETCH);

	for(i = 0; i < bufqsize; i++) {
		assert(bufq[i] != NULL);
		assert(bufq[i]->lmfs_count > 0);
	}

	/* therefore they are all 'in use' and must be at least this many */
	assert(start_in_use >= start_bufqsize);
  }

  assert(dev != NO_DEV);
  assert(fs_block_size > 0);
  assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS);
#endif /* !defined(NDEBUG) */
  /* For WRITING, (Shell) sort buffers on lmfs_blocknr.
   * For READING, the buffers are already sorted.
   */
  if (rw_flag == WRITING)
	sort_blocks(bufq, bufqsize);

  /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything
   * went fine, otherwise the error code for the first failed transfer.
   */
  while (bufqsize > 0) {
	unsigned int p, nblocks = 0, niovecs = 0;

	for (iop = iovec; nblocks < bufqsize; nblocks++) {
		vir_bytes vdata, blockrem;

		bp = bufq[nblocks];
		if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks)
			break;
		blockrem = bp->lmfs_bytes;
		iov_per_block = howmany(blockrem, PAGE_SIZE);
		if (niovecs > NR_IOREQS - iov_per_block) break;
		vdata = (vir_bytes) bp->data;
		for(p = 0; p < iov_per_block; p++) {
			vir_bytes chunk =
			    blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE;
			iop->iov_addr = vdata;
			iop->iov_size = chunk;
			vdata += chunk;
			blockrem -= chunk;
			iop++;
			niovecs++;
		}
		assert(p == iov_per_block);
		assert(blockrem == 0);
	}

	assert(niovecs > 0 && niovecs <= NR_IOREQS);
	pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size;
	if (rw_flag == READING)
		r = bdev_gather(dev, pos, iovec, niovecs, BDEV_NOFLAGS);
	else
		r = bdev_scatter(dev, pos, iovec, niovecs, BDEV_NOFLAGS);

	/* Harvest the results. The driver may have returned an error, or it
	 * may have done less than what we asked for.
	 */
	if (r < 0) {
		printf("fs cache: I/O error %d on device %d/%d, "
		    "block %"PRIu64"\n",
		    r, major(dev), minor(dev), bufq[0]->lmfs_blocknr);
	}
	for (i = 0; i < nblocks; i++) {
		bp = bufq[i];
		if (r < (ssize_t)bp->lmfs_bytes) {
			/* Transfer failed. */
			if (i == 0) {
				bp->lmfs_dev = NO_DEV;	/* Invalidate block */
			}
			break;
		}
		if (rw_flag == READING) {
			bp->lmfs_dev = dev;	/* validate block */
			lmfs_put_block(bp);
		} else {
			MARKCLEAN(bp);
		}
		r -= bp->lmfs_bytes;
	}

	bufq += i;
	bufqsize -= i;

	if (rw_flag == READING) {
		/* Don't bother reading more than the device is willing to
		 * give at this time. Don't forget to release those extras.
		 */
		while (bufqsize > 0) {
			bp = *bufq++;
			bp->lmfs_dev = NO_DEV;	/* invalidate block */
			lmfs_put_block(bp);
			bufqsize--;
		}
	}
	if (rw_flag == WRITING && i == 0) {
		/* We're not making progress, this means we might keep
		 * looping. Buffers remain dirty if un-written. Buffers are
		 * lost if invalidate()d or LRU-removed while dirty. This
		 * is better than keeping unwritable blocks around forever..
		 */
		break;
	}
  }

#if !defined(NDEBUG)
  if(rw_flag == READING) {
	assert(start_in_use >= start_bufqsize);

	/* READING callers assume all bufs are released. */
	assert(start_in_use - start_bufqsize == bufs_in_use);
  }
#endif /* !defined(NDEBUG) */
/*===========================================================================*
 *				lmfs_readahead				     *
 *===========================================================================*/
void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks,
	size_t last_size)
{
/* Read ahead 'nblocks' blocks starting from the block 'base_block' on device
 * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH,
 * inclusive. All blocks have the file system's block size, possibly except the
 * last block in the range, which is of size 'last_size'. The caller must
 * ensure that none of the blocks in the range are already in the cache.
 * However, the caller must also not rely on all or even any of the blocks to
 * be present in the cache afterwards--failures are (deliberately!) ignored.
 */
  static noxfer_buf_ptr_t bufq[LMFS_MAX_PREFETCH]; /* static for size only */
  struct buf *bp;
  unsigned int count;
  int r;

  assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH);

  for (count = 0; count < nblocks; count++) {
	if (count == nblocks - 1)
		r = lmfs_get_partial_block(&bp, dev, base_block + count,
		    NO_READ, last_size);
	else
		r = lmfs_get_block(&bp, dev, base_block + count, NO_READ);

	if (r != OK)
		break;

	/* We could add a flag that makes the get_block() calls fail if the
	 * block is already in the cache, but it is not a major concern if it
	 * is: we just perform a useless read in that case. However, if the
	 * block is cached *and* dirty, we are about to lose its new contents.
	 */
	assert(lmfs_isclean(bp));

	bufq[count] = bp;
  }

  rw_scattered(dev, bufq, count, READING);
}
/*===========================================================================*
 *				lmfs_readahead_limit			     *
 *===========================================================================*/
unsigned int lmfs_readahead_limit(void)
{
/* Return the maximum number of blocks that should be read ahead at once. The
 * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive.
 */
  unsigned int max_transfer, max_bufs;

  /* The returned value is the minimum of two factors: the maximum number of
   * blocks that can be transferred in a single I/O gather request (see how
   * rw_scattered() generates I/O requests), and a policy limit on the number
   * of buffers that any read-ahead operation may use (that is, thrash).
   */
  max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1);

  /* The constants have been imported from MFS as is, and may need tuning. */
  max_bufs = nr_bufs - 4;

  return MIN(max_transfer, max_bufs);
}
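/* Worked example of the limit above (the values are assumptions for
 * illustration only): with 4 kB pages, a 4 kB file system block size and an
 * NR_IOREQS of 64, max_transfer = 64 / MAX(1, 1) = 64 blocks per gather
 * request. With an 8 kB block size the same request budget covers only
 * 64 / 2 = 32 blocks. The buffer-pool term then caps the result further on
 * very small caches.
 */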
/*===========================================================================*
 *				lmfs_prefetch				     *
 *===========================================================================*/
void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks)
{
/* The given set of blocks is expected to be needed soon, so prefetch a
 * convenient subset. The blocks are expected to be sorted by likelihood of
 * being accessed soon, making the first block of the set the most important
 * block to prefetch right now. The caller must have made sure that the blocks
 * are not in the cache already. The array may have duplicate block numbers.
 */
  bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
  bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)];
  block64_t block, base_block;
  unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks;

  /* Here is the deal. We are going to prefetch one range only, because seeking
   * is too expensive for just prefetching. The range we select should at least
   * include the first ("base") block of the given set, since that is the block
   * the caller is primarily interested in. Thus, the rest of the range is
   * going to have to be directly around this base block. We first check which
   * blocks from the set fall just before and after the base block, which then
   * allows us to construct a contiguous range of desired blocks directly
   * around the base block, in O(n) time. As a natural part of this, we ignore
   * duplicate blocks in the given set. We then read from the beginning of this
   * range, in order to maximize the chance that a next prefetch request will
   * continue from the last disk position without requiring a seek. However, we
   * do correct for the maximum number of blocks we can (or should) read in at
   * once, such that we will still end up reading the base block.
   */
  base_block = blockset[0];

  memset(blocks_before, 0, sizeof(blocks_before));
  memset(blocks_after, 0, sizeof(blocks_after));

  for (i = 1; i < nblocks; i++) {
	block = blockset[i];

	if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) {
		bit = base_block - block - 1;
		assert(bit < LMFS_MAX_PREFETCH);
		SET_BIT(blocks_before, bit);
	} else if (block > base_block &&
	    block - LMFS_MAX_PREFETCH <= base_block) {
		bit = block - base_block - 1;
		assert(bit < LMFS_MAX_PREFETCH);
		SET_BIT(blocks_after, bit);
	}
  }

  for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++)
	if (!GET_BIT(blocks_before, nr_before))
		break;

  for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++)
	if (!GET_BIT(blocks_after, nr_after))
		break;

  /* The number of blocks to prefetch is the minimum of two factors: the number
   * of blocks in the range around the base block, and the maximum number of
   * blocks that should be read ahead at once at all.
   */
  span = nr_before + 1 + nr_after;
  limit = lmfs_readahead_limit();

  nr_blocks = MIN(span, limit);
  assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH);

  /* Start prefetching from the lowest block within the contiguous range, but
   * make sure that we read at least the original base block itself, too.
   */
  base_block -= MIN(nr_before, nr_blocks - 1);

  lmfs_readahead(dev, base_block, nr_blocks, fs_block_size);
}
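/* Worked example of the range construction above: for the (hypothetical)
 * block set { 100, 98, 99, 103, 250 }, the base block is 100. Blocks 98 and
 * 99 set bits in blocks_before, block 103 sets a bit in blocks_after (block
 * 250 is too far away), so nr_before = 2, but nr_after = 0 because block 101
 * is absent. The span is therefore 3; with a sufficiently large read-ahead
 * limit the call becomes lmfs_readahead(dev, 98, 3, fs_block_size), covering
 * blocks 98, 99 and the base block 100.
 */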
/*===========================================================================*
 *				lmfs_flushdev				     *
 *===========================================================================*/
void lmfs_flushdev(dev_t dev)
{
/* Flush all dirty blocks for one device. */

  register struct buf *bp;
  static noxfer_buf_ptr_t *dirty;
  static unsigned int dirtylistsize = 0;
  unsigned int ndirty;

  if(dirtylistsize != nr_bufs) {
	if(dirtylistsize > 0) {
		assert(dirty != NULL);
		free(dirty);
	}
	if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs)))
		panic("couldn't allocate dirty buf list");
	dirtylistsize = nr_bufs;
  }

  for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) {
	/* Do not flush dirty blocks that are in use (lmfs_count>0): the file
	 * system may mark the block as dirty before changing its contents, in
	 * which case the new contents could end up being lost.
	 */
	if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) {
		dirty[ndirty++] = bp;
	}
  }

  rw_scattered(dev, dirty, ndirty, WRITING);
}
/*===========================================================================*
 *				rm_lru					     *
 *===========================================================================*/
static void rm_lru(struct buf *bp)
{
/* Remove a block from its LRU chain. */
  struct buf *next_ptr, *prev_ptr;

  next_ptr = bp->lmfs_next;	/* successor on LRU chain */
  prev_ptr = bp->lmfs_prev;	/* predecessor on LRU chain */
  if (prev_ptr != NULL)
	prev_ptr->lmfs_next = next_ptr;
  else
	front = next_ptr;	/* this block was at front of chain */

  if (next_ptr != NULL)
	next_ptr->lmfs_prev = prev_ptr;
  else
	rear = prev_ptr;	/* this block was at rear of chain */
}
/*===========================================================================*
 *				cache_resize				     *
 *===========================================================================*/
static void cache_resize(size_t blocksize, unsigned int bufs)
{
  struct buf *bp;

  assert(blocksize > 0);
  assert(bufs >= MINBUFS);

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_count != 0) panic("change blocksize with buffer in use");

  lmfs_buf_pool(bufs);

  fs_block_size = blocksize;
}
static void cache_heuristic_check(void)
{
  int bufs, d;

  bufs = fs_bufs_heuristic(MINBUFS, fs_btotal, fs_bused, fs_block_size);

  /* set the cache to the new heuristic size if the new one
   * is more than 10% off from the current one.
   */
  d = bufs - nr_bufs;
  if(d < 0) d = -d;
  if(d*100/nr_bufs > 10) {
	cache_resize(fs_block_size, bufs);
  }
}
/*===========================================================================*
 *				lmfs_set_blocksize			     *
 *===========================================================================*/
void lmfs_set_blocksize(size_t new_block_size)
{
  cache_resize(new_block_size, MINBUFS);
  cache_heuristic_check();

  /* Decide whether to use secondary cache or not.
   * Only do this if the block size is a multiple of the page size, and using
   * the VM cache has been enabled for this FS.
   */
  vmcache = 0;

  if(may_use_vmcache && !(new_block_size % PAGE_SIZE))
	vmcache = 1;
}
/*===========================================================================*
 *				lmfs_buf_pool				     *
 *===========================================================================*/
void lmfs_buf_pool(int new_nr_bufs)
{
/* Initialize the buffer pool. */
  register struct buf *bp;

  assert(new_nr_bufs >= MINBUFS);

  /* Release the old pool, if any. */
  if (buf != NULL) {
	for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
		if (bp->data != NULL) {
			assert(bp->lmfs_bytes > 0);
			munmap_t(bp->data, bp->lmfs_bytes);
		}
	}
	free(buf);
  }

  if(!(buf = calloc(sizeof(buf[0]), new_nr_bufs)))
	panic("couldn't allocate buf list (%d)", new_nr_bufs);

  if (buf_hash != NULL)
	free(buf_hash);
  if(!(buf_hash = calloc(sizeof(buf_hash[0]), new_nr_bufs)))
	panic("couldn't allocate buf hash list (%d)", new_nr_bufs);

  nr_bufs = new_nr_bufs;

  bufs_in_use = 0;
  front = &buf[0];
  rear = &buf[nr_bufs - 1];

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) {
	bp->lmfs_blocknr = NO_BLOCK;
	bp->lmfs_dev = NO_DEV;
	bp->lmfs_next = bp + 1;
	bp->lmfs_prev = bp - 1;
  }
  front->lmfs_prev = NULL;
  rear->lmfs_next = NULL;

  for (bp = &buf[0]; bp < &buf[nr_bufs]; bp++) bp->lmfs_hash = bp->lmfs_next;
  buf_hash[0] = front;
}
void lmfs_flushall(void)
{
  struct buf *bp;

  for(bp = &buf[0]; bp < &buf[nr_bufs]; bp++)
	if(bp->lmfs_dev != NO_DEV && !lmfs_isclean(bp))
		lmfs_flushdev(bp->lmfs_dev);

  /* This is the moment where it is least likely (although certainly not
   * impossible!) that there are buffers in use, since buffers should not
   * be held across file system syncs. See if we already intended to
   * resize the buffer cache, but couldn't. Be aware that we may be
   * called indirectly from within lmfs_change_blockusage(), so care must
   * be taken not to recurse infinitely. TODO: see if it is better to
   * resize the cache from here *only*, thus guaranteeing a clean cache.
   */
  lmfs_change_blockusage(0);
}
size_t lmfs_fs_block_size(void)
{
  return fs_block_size;
}

void lmfs_may_use_vmcache(int ok)
{
  may_use_vmcache = ok;
}