/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/*
 * NOTE! There is one discordant note here: checking floppies for
 * disk change. This is where it fits best, I think, as it should
 * invalidate changed floppy-disk-caches.
 */

#include <stdarg.h>

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/locks.h>
#include <linux/errno.h>

#include <asm/system.h>
#include <asm/io.h>

#ifdef CONFIG_SCSI
#ifdef CONFIG_BLK_DEV_SR
extern int check_cdrom_media_change(int, int);
#endif
#ifdef CONFIG_BLK_DEV_SD
extern int check_scsidisk_media_change(int, int);
extern int revalidate_scsidisk(int, int);
#endif
#endif
#ifdef CONFIG_CDU31A
extern int check_cdu31a_media_change(int, int);
#endif
#ifdef CONFIG_MCD
extern int check_mcd_media_change(int, int);
#endif

static int grow_buffers(int pri, int size);

static struct buffer_head * hash_table[NR_HASH];
static struct buffer_head * free_list = NULL;
static struct buffer_head * unused_list = NULL;
static struct wait_queue * buffer_wait = NULL;

int nr_buffers = 0;
int buffermem = 0;
int nr_buffer_heads = 0;
static int min_free_pages = 20;	/* nr free pages needed before buffer grows */
extern int *blksize_size[];

/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct wait_queue wait = { current, NULL };

	bh->b_count++;
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	current->state = TASK_UNINTERRUPTIBLE;
	if (bh->b_lock) {
		schedule();
		goto repeat;
	}
	remove_wait_queue(&bh->b_wait, &wait);
	bh->b_count--;
	current->state = TASK_RUNNING;
}

/* Call sync_buffers with wait!=0 to ensure that the call does not
   return until all buffer writes have completed.  Sync() may return
   before the writes have finished; fsync() may not. */
static int sync_buffers(dev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh;

	/* One pass for no-wait, three for wait:
	   0) write out all dirty, unlocked buffers;
	   1) write out all dirty buffers, waiting if locked;
	   2) wait for completion by waiting for all buffers to unlock.
	 */
repeat:
	retry = 0;
	bh = free_list;
	for (i = nr_buffers*2 ; i-- > 0 ; bh = bh->b_next_free) {
		if (dev && bh->b_dev != dev)
			continue;
#if 0 /* Disable bad-block debugging code */
		if (bh->b_req && !bh->b_lock &&
		    !bh->b_dirt && !bh->b_uptodate)
			printk ("Warning (IO error) - orphaned block %08x on %04x\n",
				bh->b_blocknr, bh->b_dev);
#endif
		if (bh->b_lock)
		{
			/* Buffer is locked; skip it unless wait is
			   requested AND pass > 0. */
			if (!wait || !pass) {
				retry = 1;
				continue;
			}
			wait_on_buffer (bh);
		}
		/* If an unlocked buffer is not uptodate, there has been
		   an IO error. Skip it. */
		if (wait && bh->b_req && !bh->b_lock &&
		    !bh->b_dirt && !bh->b_uptodate)
		{
			err = 1;
			continue;
		}
		/* Don't write clean buffers.  Don't write ANY buffers
		   on the third pass. */
		if (!bh->b_dirt || pass>=2)
			continue;
		bh->b_count++;
		ll_rw_block(WRITE, 1, &bh);
		bh->b_count--;
		retry = 1;
	}
	/* If we are waiting for the sync to succeed, and if any dirty
	   blocks were written, then repeat; on the second pass, only
	   wait for buffers being written (do not pass to write any
	   more buffers on the second pass). */
	if (wait && retry && ++pass<=2)
		goto repeat;
	return err;
}
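
/*
 * sync_dev() starts writeback for one device (or for all devices when
 * dev==0) but does not wait for it to finish; fsync_dev() below is the
 * waiting variant, as its final sync_buffers() call passes wait=1.
 */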
void sync_dev(dev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
}

int fsync_dev(dev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	sync_dev(0);
	return 0;
}

int file_fsync (struct inode *inode, struct file *filp)
{
	return fsync_dev(inode->i_dev);
}

asmlinkage int sys_fsync(unsigned int fd)
{
	struct file * file;
	struct inode * inode;

	if (fd>=NR_OPEN || !(file=current->filp[fd]) || !(inode=file->f_inode))
		return -EBADF;
	if (!file->f_op || !file->f_op->fsync)
		return -EINVAL;
	if (file->f_op->fsync(inode,file))
		return -EIO;
	return 0;
}
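
/*
 * invalidate_buffers() forgets everything the cache holds for a device:
 * every buffer belonging to dev is marked not uptodate, not dirty and not
 * requested, so stale (even dirty) data for that device is simply dropped.
 */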
void invalidate_buffers(dev_t dev)
{
	int i;
	struct buffer_head * bh;

	bh = free_list;
	for (i = nr_buffers*2 ; --i > 0 ; bh = bh->b_next_free) {
		if (bh->b_dev != dev)
			continue;
		wait_on_buffer(bh);
		if (bh->b_dev == dev)
			bh->b_uptodate = bh->b_dirt = bh->b_req = 0;
	}
}

/*
 * This routine checks whether a floppy has been changed, and
 * invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 *
 * NOTE! Although currently this is only for floppies, the idea is
 * that any additional removable block-device will use this routine,
 * and that mount/open needn't know that floppies/whatever are
 * special.
 */
void check_disk_change(dev_t dev)
{
	int i;
	struct buffer_head * bh;

	switch(MAJOR(dev)){
	case FLOPPY_MAJOR:
		if (!(bh = getblk(dev,0,1024)))
			return;
		i = floppy_change(bh);
		brelse(bh);
		break;

#if defined(CONFIG_BLK_DEV_SD) && defined(CONFIG_SCSI)
	case SCSI_DISK_MAJOR:
		i = check_scsidisk_media_change(dev, 0);
		break;
#endif

#if defined(CONFIG_BLK_DEV_SR) && defined(CONFIG_SCSI)
	case SCSI_CDROM_MAJOR:
		i = check_cdrom_media_change(dev, 0);
		break;
#endif

#if defined(CONFIG_CDU31A)
	case CDU31A_CDROM_MAJOR:
		i = check_cdu31a_media_change(dev, 0);
		break;
#endif

#if defined(CONFIG_MCD)
	case MITSUMI_CDROM_MAJOR:
		i = check_mcd_media_change(dev, 0);
		break;
#endif

	default:
		return;
	}

	if (!i) return;

	printk("VFS: Disk change detected on device %d/%d\n",
					MAJOR(dev), MINOR(dev));
	for (i=0 ; i<NR_SUPER ; i++)
		if (super_blocks[i].s_dev == dev)
			put_super(super_blocks[i].s_dev);
	invalidate_inodes(dev);
	invalidate_buffers(dev);

#if defined(CONFIG_BLK_DEV_SD) && defined(CONFIG_SCSI)
/* This is trickier for a removable hard disk, because we have to invalidate
   all of the partitions that lie on the disk. */
	if (MAJOR(dev) == SCSI_DISK_MAJOR)
		revalidate_scsidisk(dev, 0);
#endif
}
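
/*
 * Buffers are looked up by (dev,block) through a simple open hash table:
 * _hashfn() xors the two values and reduces the result modulo NR_HASH,
 * and each hash bucket is a doubly linked list chained through
 * b_next/b_prev. The free list is a separate, circular list chained
 * through b_next_free/b_prev_free.
 */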
#define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)
#define hash(dev,block) hash_table[_hashfn(dev,block)]

static inline void remove_from_hash_queue(struct buffer_head * bh)
{
	if (bh->b_next)
		bh->b_next->b_prev = bh->b_prev;
	if (bh->b_prev)
		bh->b_prev->b_next = bh->b_next;
	if (hash(bh->b_dev,bh->b_blocknr) == bh)
		hash(bh->b_dev,bh->b_blocknr) = bh->b_next;
	bh->b_next = bh->b_prev = NULL;
}

static inline void remove_from_free_list(struct buffer_head * bh)
{
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: Free block list corrupted");
	bh->b_prev_free->b_next_free = bh->b_next_free;
	bh->b_next_free->b_prev_free = bh->b_prev_free;
	if (free_list == bh)
		free_list = bh->b_next_free;
	bh->b_next_free = bh->b_prev_free = NULL;
}

static inline void remove_from_queues(struct buffer_head * bh)
{
	remove_from_hash_queue(bh);
	remove_from_free_list(bh);
}

static inline void put_first_free(struct buffer_head * bh)
{
	if (!bh || (bh == free_list))
		return;
	remove_from_free_list(bh);
	/* add to front of free list */
	bh->b_next_free = free_list;
	bh->b_prev_free = free_list->b_prev_free;
	free_list->b_prev_free->b_next_free = bh;
	free_list->b_prev_free = bh;
	free_list = bh;
}

static inline void put_last_free(struct buffer_head * bh)
{
	if (!bh)
		return;
	if (bh == free_list) {
		free_list = bh->b_next_free;
		return;
	}
	remove_from_free_list(bh);
	/* add to back of free list */
	bh->b_next_free = free_list;
	bh->b_prev_free = free_list->b_prev_free;
	free_list->b_prev_free->b_next_free = bh;
	free_list->b_prev_free = bh;
}

static inline void insert_into_queues(struct buffer_head * bh)
{
	/* put at end of free list */
	bh->b_next_free = free_list;
	bh->b_prev_free = free_list->b_prev_free;
	free_list->b_prev_free->b_next_free = bh;
	free_list->b_prev_free = bh;
	/* put the buffer in new hash-queue if it has a device */
	bh->b_prev = NULL;
	bh->b_next = NULL;
	if (!bh->b_dev)
		return;
	bh->b_next = hash(bh->b_dev,bh->b_blocknr);
	hash(bh->b_dev,bh->b_blocknr) = bh;
	if (bh->b_next)
		bh->b_next->b_prev = bh;
}

static struct buffer_head * find_buffer(dev_t dev, int block, int size)
{
	struct buffer_head * tmp;

	for (tmp = hash(dev,block) ; tmp != NULL ; tmp = tmp->b_next)
		if (tmp->b_dev==dev && tmp->b_blocknr==block)
			if (tmp->b_size == size)
				return tmp;
			else {
				printk("VFS: Wrong blocksize on device %d/%d\n",
							MAJOR(dev), MINOR(dev));
				return NULL;
			}
	return NULL;
}

/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
struct buffer_head * get_hash_table(dev_t dev, int block, int size)
{
	struct buffer_head * bh;

	for (;;) {
		if (!(bh=find_buffer(dev,block,size)))
			return NULL;
		bh->b_count++;
		wait_on_buffer(bh);
		if (bh->b_dev == dev && bh->b_blocknr == block && bh->b_size == size)
			return bh;
		bh->b_count--;
	}
}
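
/*
 * set_blocksize() switches a device to a new soft blocksize. Buffers of
 * the old size are synced and then taken off the hash queues, so that
 * getblk() can never return a buffer with a stale size for this device.
 */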
void set_blocksize(dev_t dev, int size)
{
	int i;
	struct buffer_head * bh, *bhnext;

	if (!blksize_size[MAJOR(dev)])
		return;

	switch(size) {
		default: panic("Invalid blocksize passed to set_blocksize");
		case 512: case 1024: case 2048: case 4096:;
	}

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

	/* We need to be quite careful how we do this - we are moving entries
	   around on the free list, and we can get in a loop if we are not careful.*/

	bh = free_list;
	for (i = nr_buffers*2 ; --i > 0 ; bh = bhnext) {
		bhnext = bh->b_next_free;
		if (bh->b_dev != dev)
			continue;
		if (bh->b_size == size)
			continue;

		wait_on_buffer(bh);
		if (bh->b_dev == dev && bh->b_size != size)
			bh->b_uptodate = bh->b_dirt = 0;
		remove_from_hash_queue(bh);
		/* put_first_free(bh); */
	}
}

/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
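
/*
 * BADNESS() ranks a candidate buffer for reuse: 0 for clean and unlocked,
 * 1 for locked, 2 for dirty, 3 for dirty and locked. getblk() scans the
 * free list for the buffer of the right size with the lowest badness.
 */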
#define BADNESS(bh) (((bh)->b_dirt<<1)+(bh)->b_lock)
struct buffer_head * getblk(dev_t dev, int block, int size)
{
	struct buffer_head * bh, * tmp;
	int buffers;
	static int grow_size = 0;

repeat:
	bh = get_hash_table(dev, block, size);
	if (bh) {
		if (bh->b_uptodate && !bh->b_dirt)
			put_last_free(bh);
		return bh;
	}
	grow_size -= size;
	if (nr_free_pages > min_free_pages && grow_size <= 0) {
		if (grow_buffers(GFP_BUFFER, size))
			grow_size = PAGE_SIZE;
	}
	buffers = nr_buffers;
	bh = NULL;

	for (tmp = free_list; buffers-- > 0 ; tmp = tmp->b_next_free) {
		if (tmp->b_count || tmp->b_size != size)
			continue;
		if (mem_map[MAP_NR((unsigned long) tmp->b_data)] != 1)
			continue;
		if (!bh || BADNESS(tmp)<BADNESS(bh)) {
			bh = tmp;
			if (!BADNESS(tmp))
				break;
		}
#if 0
		if (tmp->b_dirt) {
			tmp->b_count++;
			ll_rw_block(WRITEA, 1, &tmp);
			tmp->b_count--;
		}
#endif
	}

	if (!bh && nr_free_pages > 5) {
		if (grow_buffers(GFP_BUFFER, size))
			goto repeat;
	}

	/* and repeat until we find something good */
	if (!bh) {
		if (!grow_buffers(GFP_ATOMIC, size))
			sleep_on(&buffer_wait);
		goto repeat;
	}
	wait_on_buffer(bh);
	if (bh->b_count || bh->b_size != size)
		goto repeat;
	if (bh->b_dirt) {
		sync_buffers(0,0);
		goto repeat;
	}
	/* NOTE!! While we slept waiting for this block, somebody else might */
	/* already have added "this" block to the cache. check it */
	if (find_buffer(dev,block,size))
		goto repeat;
	/* OK, FINALLY we know that this buffer is the only one of its kind, */
	/* and that it's unused (b_count=0), unlocked (b_lock=0), and clean */
	bh->b_count=1;
	bh->b_dirt=0;
	bh->b_uptodate=0;
	bh->b_req=0;
	remove_from_queues(bh);
	bh->b_dev=dev;
	bh->b_blocknr=block;
	insert_into_queues(bh);
	return bh;
}
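
/*
 * brelse() drops the reference taken by getblk()/bread(). When the count
 * reaches zero, anyone sleeping in getblk() on buffer_wait is woken up.
 */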
void brelse(struct buffer_head * buf)
{
	if (!buf)
		return;
	wait_on_buffer(buf);
	if (buf->b_count) {
		if (--buf->b_count)
			return;
		wake_up(&buffer_wait);
		return;
	}
	printk("VFS: brelse: Trying to free free buffer\n");
}

/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(dev_t dev, int block, int size)
{
	struct buffer_head * bh;

	if (!(bh = getblk(dev, block, size))) {
		printk("VFS: bread: READ error on device %d/%d\n",
						MAJOR(dev), MINOR(dev));
		return NULL;
	}
	if (bh->b_uptodate)
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (bh->b_uptodate)
		return bh;
	brelse(bh);
	return NULL;
}

/*
 * Ok, breada can be used as bread, but additionally to mark other
 * blocks for reading as well. End the argument list with a negative
 * number.
 */
struct buffer_head * breada(dev_t dev,int first, ...)
{
	va_list args;
	unsigned int blocksize;
	struct buffer_head * bh, *tmp;

	va_start(args,first);

	blocksize = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
		blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];

	if (!(bh = getblk(dev, first, blocksize))) {
		printk("VFS: breada: READ error on device %d/%d\n",
						MAJOR(dev), MINOR(dev));
		return NULL;
	}
	if (!bh->b_uptodate)
		ll_rw_block(READ, 1, &bh);
	while ((first=va_arg(args,int))>=0) {
		tmp = getblk(dev, first, blocksize);
		if (tmp) {
			if (!tmp->b_uptodate)
				ll_rw_block(READA, 1, &tmp);
			tmp->b_count--;
		}
	}
	va_end(args);
	wait_on_buffer(bh);
	if (bh->b_uptodate)
		return bh;
	brelse(bh);
	return (NULL);
}
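
/*
 * Buffer heads themselves are allocated a page at a time and kept on
 * unused_list; they are recycled through put_unused_buffer_head() and
 * get_unused_buffer_head() below rather than ever being freed back to
 * the page allocator.
 */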
/*
 * See fs/inode.c for the weird use of volatile..
 */
static void put_unused_buffer_head(struct buffer_head * bh)
{
	struct wait_queue * wait;

	wait = ((volatile struct buffer_head *) bh)->b_wait;
	memset((void *) bh,0,sizeof(*bh));
	((volatile struct buffer_head *) bh)->b_wait = wait;
	bh->b_next_free = unused_list;
	unused_list = bh;
}

static void get_more_buffer_heads(void)
{
	int i;
	struct buffer_head * bh;

	if (unused_list)
		return;

	if(! (bh = (struct buffer_head*) get_free_page(GFP_BUFFER)))
		return;

	for (nr_buffer_heads+=i=PAGE_SIZE/sizeof*bh ; i>0; i--) {
		bh->b_next_free = unused_list;	/* only make link */
		unused_list = bh++;
	}
}

static struct buffer_head * get_unused_buffer_head(void)
{
	struct buffer_head * bh;

	get_more_buffer_heads();
	if (!unused_list)
		return NULL;
	bh = unused_list;
	unused_list = bh->b_next_free;
	bh->b_next_free = NULL;
	bh->b_data = NULL;
	bh->b_size = 0;
	bh->b_req = 0;
	return bh;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created. Return NULL if unable to create more
 * buffers.
 */
static struct buffer_head * create_buffers(unsigned long page, unsigned long size)
{
	struct buffer_head *bh, *head;
	unsigned long offset;

	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) < PAGE_SIZE) {
		bh = get_unused_buffer_head();
		if (!bh)
			goto no_grow;
		bh->b_this_page = head;
		head = bh;
		bh->b_data = (char *) (page+offset);
		bh->b_size = size;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	bh = head;
	while (bh) {
		head = bh;
		bh = bh->b_this_page;
		put_unused_buffer_head(head);
	}
	return NULL;
}
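
/*
 * read_buffers() gathers the buffers in bh[] that are not yet uptodate,
 * submits them with a single ll_rw_block(READ, ...) call, and then waits
 * for every buffer in the array.
 */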
static void read_buffers(struct buffer_head * bh[], int nrbuf)
{
	int i;
	int bhnum = 0;
	struct buffer_head * bhr[8];

	for (i = 0 ; i < nrbuf ; i++) {
		if (bh[i] && !bh[i]->b_uptodate)
			bhr[bhnum++] = bh[i];
	}
	if (bhnum)
		ll_rw_block(READ, bhnum, bhr);
	for (i = 0 ; i < nrbuf ; i++) {
		if (bh[i]) {
			wait_on_buffer(bh[i]);
		}
	}
}
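
/*
 * check_aligned() handles the case where the wanted blocks are already in
 * the cache: if they all sit consecutively in one physical page, that page
 * is shared (its mem_map count is raised) and the caller's page is freed.
 */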
static unsigned long check_aligned(struct buffer_head * first, unsigned long address,
	dev_t dev, int *b, int size)
{
	struct buffer_head * bh[8];
	unsigned long page;
	unsigned long offset;
	int block;
	int nrbuf;

	page = (unsigned long) first->b_data;
	if (page & ~PAGE_MASK) {
		brelse(first);
		return 0;
	}
	mem_map[MAP_NR(page)]++;
	bh[0] = first;
	nrbuf = 1;
	for (offset = size ; offset < PAGE_SIZE ; offset += size) {
		block = *++b;
		if (!block)
			goto no_go;
		first = get_hash_table(dev, block, size);
		if (!first)
			goto no_go;
		bh[nrbuf++] = first;
		if (page+offset != (unsigned long) first->b_data)
			goto no_go;
	}
	read_buffers(bh,nrbuf);		/* make sure they are actually read correctly */
	while (nrbuf-- > 0)
		brelse(bh[nrbuf]);
	free_page(address);
	++current->min_flt;
	return page;
no_go:
	while (nrbuf-- > 0)
		brelse(bh[nrbuf]);
	free_page(page);
	return 0;
}
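
/*
 * try_to_load_aligned() is the other half of the sharing trick: when none
 * of the wanted blocks are cached yet, it builds a full page of new buffers
 * over the caller's page, reads the blocks into it and returns that address,
 * or 0 if any block is missing or already cached.
 */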
static unsigned long try_to_load_aligned(unsigned long address,
	dev_t dev, int b[], int size)
{
	struct buffer_head * bh, * tmp, * arr[8];
	unsigned long offset;
	int * p;
	int block;

	bh = create_buffers(address, size);
	if (!bh)
		return 0;
	p = b;
	for (offset = 0 ; offset < PAGE_SIZE ; offset += size) {
		block = *(p++);
		if (!block)
			goto not_aligned;
		tmp = get_hash_table(dev, block, size);
		if (tmp) {
			brelse(tmp);
			goto not_aligned;
		}
	}
	tmp = bh;
	p = b;
	block = 0;
	while (1) {
		arr[block++] = bh;
		bh->b_count = 1;
		bh->b_dirt = 0;
		bh->b_uptodate = 0;
		bh->b_dev = dev;
		bh->b_blocknr = *(p++);
		nr_buffers++;
		insert_into_queues(bh);
		if (bh->b_this_page)
			bh = bh->b_this_page;
		else
			break;
	}
	buffermem += PAGE_SIZE;
	bh->b_this_page = tmp;
	mem_map[MAP_NR(address)]++;
	read_buffers(arr,block);
	while (block-- > 0)
		brelse(arr[block]);
	++current->maj_flt;
	return address;
not_aligned:
	while ((tmp = bh) != NULL) {
		bh = bh->b_this_page;
		put_unused_buffer_head(tmp);
	}
	return 0;
}

/*
 * Try-to-share-buffers tries to minimize memory use by trying to keep
 * both code pages and the buffer area in the same page. This is done by
 * (a) checking if the buffers are already aligned correctly in memory and
 * (b) if none of the buffer heads are in memory at all, trying to load
 * them into memory the way we want them.
 *
 * This doesn't guarantee that the memory is shared, but should under most
 * circumstances work very well indeed (ie >90% sharing of code pages on
 * demand-loadable executables).
 */
static inline unsigned long try_to_share_buffers(unsigned long address,
	dev_t dev, int *b, int size)
{
	struct buffer_head * bh;
	int block;

	block = b[0];
	if (!block)
		return 0;
	bh = get_hash_table(dev, block, size);
	if (bh)
		return check_aligned(bh, address, dev, b, size);
	return try_to_load_aligned(address, dev, b, size);
}
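
/*
 * COPYBLK() copies 'size' bytes with a rep/movsl, so size is assumed to be
 * a multiple of 4 (true for all supported blocksizes).
 */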
#define COPYBLK(size,from,to) \
__asm__ __volatile__("rep ; movsl": \
	:"c" (((unsigned long) size) >> 2),"S" (from),"D" (to) \
	:"cx","di","si")

/*
 * bread_page reads a page's worth of buffers into memory at the desired
 * address. It's a function of its own, as there is some speed to be got
 * by reading them all at the same time, not waiting for one to be read,
 * and then another etc. This also allows us to optimize memory usage by
 * sharing code pages and filesystem buffers..
 */
unsigned long bread_page(unsigned long address, dev_t dev, int b[], int size, int prot)
{
	struct buffer_head * bh[8];
	unsigned long where;
	int i, j;

	if (!(prot & PAGE_RW)) {
		where = try_to_share_buffers(address,dev,b,size);
		if (where)
			return where;
	}
	++current->maj_flt;
	for (i=0, j=0; j<PAGE_SIZE ; i++, j+= size) {
		bh[i] = NULL;
		if (b[i])
			bh[i] = getblk(dev, b[i], size);
	}
	read_buffers(bh,i);
	where = address;
	for (i=0, j=0; j<PAGE_SIZE ; i++, j += size,address += size) {
		if (bh[i]) {
			if (bh[i]->b_uptodate)
				COPYBLK(size, (unsigned long) bh[i]->b_data,address);
			brelse(bh[i]);
		}
	}
	return where;
}

/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int pri, int size)
{
	unsigned long page;
	struct buffer_head *bh, *tmp;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);
		return 0;
	}
	if(!(page = __get_free_page(pri)))
		return 0;
	bh = create_buffers(page, size);
	if (!bh) {
		free_page(page);
		return 0;
	}
	tmp = bh;
	while (1) {
		if (free_list) {
			tmp->b_next_free = free_list;
			tmp->b_prev_free = free_list->b_prev_free;
			free_list->b_prev_free->b_next_free = tmp;
			free_list->b_prev_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		free_list = tmp;
		++nr_buffers;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	tmp->b_this_page = bh;
	buffermem += PAGE_SIZE;
	return 1;
}

/*
 * try_to_free() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 */
static int try_to_free(struct buffer_head * bh, struct buffer_head ** bhp)
{
	unsigned long page;
	struct buffer_head * tmp, * p;

	*bhp = bh;
	page = (unsigned long) bh->b_data;
	page &= PAGE_MASK;
	tmp = bh;
	do {
		if (!tmp)
			return 0;
		if (tmp->b_count || tmp->b_dirt || tmp->b_lock)
			return 0;
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	tmp = bh;
	do {
		p = tmp;
		tmp = tmp->b_this_page;
		nr_buffers--;
		if (p == *bhp)
			*bhp = p->b_prev_free;
		remove_from_queues(p);
		put_unused_buffer_head(p);
	} while (tmp != bh);
	buffermem -= PAGE_SIZE;
	free_page(page);
	return !mem_map[MAP_NR(page)];
}

/*
 * Try to free up some pages by shrinking the buffer-cache
 *
 * Priority tells the routine how hard to try to shrink the
 * buffers: 3 means "don't bother too much", while a value
 * of 0 means "we'd better get some free pages now".
 */
int shrink_buffers(unsigned int priority)
{
	struct buffer_head *bh;
	int i;

	if (priority < 2)
		sync_buffers(0,0);
	bh = free_list;
	i = nr_buffers >> priority;
	for ( ; i-- > 0 ; bh = bh->b_next_free) {
		if (bh->b_count || !bh->b_this_page)
			continue;
		if (bh->b_lock)
			if (priority)
				continue;
			else
				wait_on_buffer(bh);
		if (bh->b_dirt) {
			bh->b_count++;
			ll_rw_block(WRITEA, 1, &bh);
			bh->b_count--;
			continue;
		}
		if (try_to_free(bh, &bh))
			return 1;
	}
	return 0;
}

/*
 * This initializes the initial buffer free list. nr_buffers is set
 * to one less than the actual number of buffers, as a sop to backwards
 * compatibility --- the old code did this (I think unintentionally,
 * but I'm not sure), and programs in the ps package expect it.
 *					- TYT 8/30/92
 */
void buffer_init(void)
{
	int i;

	if (high_memory >= 4*1024*1024)
		min_free_pages = 200;
	else
		min_free_pages = 20;
	for (i = 0 ; i < NR_HASH ; i++)
		hash_table[i] = NULL;
	free_list = 0;
	grow_buffers(GFP_KERNEL, BLOCK_SIZE);
	if (!free_list)
		panic("VFS: Unable to initialize buffer free list!");
	return;
}