/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 */

/*
 * This handles all read/write requests to block devices
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/io.h>
#include <linux/blk.h>
#include <linux/highmem.h>
#include <linux/raid/md.h>

#include <linux/module.h>

/*
 * MAC Floppy IWM hooks
 */

#ifdef CONFIG_MAC_FLOPPY_IWM
extern int mac_floppy_init(void);
#endif

extern int lvm_init(void);

/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;

/*
 * The "disk" task queue is used to start the actual requests
 * after a plug
 */
DECLARE_TASK_QUEUE(tq_disk);

/*
 * Protect the request list against multiple users..
 *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations, and
 * the do_request() side is casually still unsafe. The kernel lock protects
 * this part currently.).
 *
 * There is a fair chance that things will work just OK if these functions
 * are called with no global kernel lock held ...
 */
spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
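
/*
 * Illustrative sketch (not part of this file): external code that pokes at
 * a request queue is expected to take io_request_lock with interrupts
 * disabled first, as blk_get_queue() and generic_unplug_device() below do.
 * Roughly (the queue pointer "q" is hypothetical):
 *
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&io_request_lock, flags);
 *	if (!list_empty(&q->queue_head))
 *		q->request_fn(q);
 *	spin_unlock_irqrestore(&io_request_lock, flags);
 */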

/* This specifies how many sectors to read ahead on the disk. */

int read_ahead[MAX_BLKDEV];

/* blk_dev_struct is:
 *	*request_fn
 *	*current_request
 */
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */

/*
 * blk_size contains the size of all block-devices in units of 1024 byte
 * sectors:
 *
 *	blk_size[MAJOR][MINOR]
 *
 * if (!blk_size[MAJOR]) then no minor size checking is done.
 */
int * blk_size[MAX_BLKDEV];

/*
 * blksize_size contains the size of all block-devices:
 *
 *	blksize_size[MAJOR][MINOR]
 *
 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 */
int * blksize_size[MAX_BLKDEV];

/*
 * hardsect_size contains the size of the hardware sector of a device.
 *
 *	hardsect_size[MAJOR][MINOR]
 *
 * if (!hardsect_size[MAJOR])
 *		then 512 bytes is assumed.
 * else
 *		sector_size is hardsect_size[MAJOR][MINOR]
 * This is currently set by some scsi devices and read by the msdos fs driver.
 * Other uses may appear later.
 */
int * hardsect_size[MAX_BLKDEV];

/*
 * The following tunes the read-ahead algorithm in mm/filemap.c
 */
int * max_readahead[MAX_BLKDEV];

/*
 * Max number of sectors per request
 */
int * max_sectors[MAX_BLKDEV];

static inline int get_max_sectors(kdev_t dev)
{
	if (!max_sectors[MAJOR(dev)])
		return MAX_SECTORS;
	return max_sectors[MAJOR(dev)][MINOR(dev)];
}

static inline request_queue_t *__blk_get_queue(kdev_t dev)
{
	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);

	if (bdev->queue)
		return bdev->queue(dev);
	else
		return &blk_dev[MAJOR(dev)].request_queue;
}

/*
 * NOTE: the device-specific queue() functions
 * have to be atomic!
 */
request_queue_t *blk_get_queue(kdev_t dev)
{
	request_queue_t *ret;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	ret = __blk_get_queue(dev);
	spin_unlock_irqrestore(&io_request_lock, flags);

	return ret;
}

static int __blk_cleanup_queue(struct list_head *head)
{
	struct list_head *entry;
	struct request *rq;
	int i = 0;

	if (list_empty(head))
		return 0;

	entry = head->next;
	do {
		rq = list_entry(entry, struct request, table);
		entry = entry->next;
		list_del(&rq->table);
		kmem_cache_free(request_cachep, rq);
		i++;
	} while (!list_empty(head));

	return i;
}

/**
 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
 * @q:    the request queue to be released
 *
 * Description:
 *     blk_cleanup_queue is the pair to blk_init_queue().  It should
 *     be called when a request queue is being released; typically
 *     when a block device is being de-registered.  Currently, its
 *     primary task is to free all the &struct request structures that
 *     were allocated to the queue.
 * Caveat:
 *     Hopefully the low level driver will have finished any
 *     outstanding requests first...
 **/
void blk_cleanup_queue(request_queue_t * q)
{
	int count = QUEUE_NR_REQUESTS;

	count -= __blk_cleanup_queue(&q->request_freelist[READ]);
	count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);

	if (count)
		printk("blk_cleanup_queue: leaked requests (%d)\n", count);

	memset(q, 0, sizeof(*q));
}

/**
 * blk_queue_headactive - indicate whether head of request queue may be active
 * @q:       The queue which this applies to.
 * @active:  A flag indicating whether the head of the queue is active.
 *
 * Description:
 *    The driver for a block device may choose to leave the currently active
 *    request on the request queue, removing it only when it has completed.
 *    The queue handling routines assume this by default for safety reasons
 *    and will not involve the head of the request queue in any merging or
 *    reordering of requests when the queue is unplugged (and thus may be
 *    working on this particular request).
 *
 *    If a driver removes requests from the queue before processing them, then
 *    it may indicate that it does so, thereby allowing the head of the queue
 *    to be involved in merging and reordering.  This is done by calling
 *    blk_queue_headactive() with an @active flag of %0.
 *
 *    If a driver processes several requests at once, it must remove them (or
 *    at least all but one of them) from the request queue.
 *
 *    When a queue is plugged (see blk_queue_pluggable()) the head will be
 *    assumed to be inactive.
 **/
void blk_queue_headactive(request_queue_t * q, int active)
{
	q->head_active = active;
}
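
/*
 * Illustrative sketch (not part of this file): a driver that dequeues
 * requests itself before working on them would announce that from its
 * initialisation code, roughly (the queue "q" and the request handler
 * mydev_request_fn are hypothetical):
 *
 *	blk_init_queue(q, mydev_request_fn);
 *	blk_queue_headactive(q, 0);
 */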

/**
 * blk_queue_pluggable - define a plugging function for a request queue
 * @q:     the request queue to which the function will apply
 * @plug:  the function to be called to plug a queue
 *
 * Description:
 *   A request queue will be "plugged" if a request is added to it
 *   while it is empty.  This allows a number of requests to be added
 *   before any are processed, thus providing an opportunity for these
 *   requests to be merged or re-ordered.
 *   The default plugging function (generic_plug_device()) sets the
 *   "plugged" flag for the queue and adds a task to the $tq_disk task
 *   queue to unplug the queue and call the request function at a
 *   later time.
 *
 *   A device driver may provide an alternate plugging function by
 *   passing it to blk_queue_pluggable().  This function should set
 *   the "plugged" flag if it wants calls to the request function to be
 *   blocked, and should place a task on $tq_disk which will unplug
 *   the queue.  Alternately it can simply do nothing and thereby
 *   disable plugging of the device.
 **/
void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug)
{
	q->plug_device_fn = plug;
}
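
/*
 * Illustrative sketch (not part of this file): a driver that never wants
 * its queue plugged could install a plug function that does nothing, which
 * leaves q->plugged clear so the request function keeps being called
 * directly (names are hypothetical):
 *
 *	static void mydev_plug_none(request_queue_t *q, kdev_t dev)
 *	{
 *	}
 *
 *	blk_queue_pluggable(q, mydev_plug_none);
 */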

/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:    the request queue for the device to be affected
 * @mfn:  the alternate make_request function
 *
 * Description:
 *    The normal way for &struct buffer_heads to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices.  However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory", either by calling bh_kmap() to get
 *    a kernel mapping, or by calling create_bounce() to create a
 *    buffer in normal memory.
 **/
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
	q->make_request_fn = mfn;
}
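
/*
 * Illustrative sketch (not part of this file): a stacking driver (md/lvm
 * style) could bypass request queueing by remapping the buffer head and
 * returning non-zero, so that generic_make_request() resubmits it to the
 * remapped bh->b_rdev (the names and the remapping helpers here are
 * hypothetical):
 *
 *	static int mydev_make_request(request_queue_t *q, int rw,
 *				      struct buffer_head *bh)
 *	{
 *		bh->b_rdev = mydev_lower_device(bh->b_rdev);
 *		bh->b_rsector += mydev_start_sector(bh->b_rdev);
 *		return 1;
 *	}
 *
 *	blk_queue_make_request(q, mydev_make_request);
 */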

static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
{
	if (req->nr_segments < max_segments) {
		req->nr_segments++;
		q->elevator.nr_segments++;
		return 1;
	}
	return 0;
}

static int ll_back_merge_fn(request_queue_t *q, struct request *req,
			    struct buffer_head *bh, int max_segments)
{
	if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
		return 1;
	return ll_new_segment(q, req, max_segments);
}

static int ll_front_merge_fn(request_queue_t *q, struct request *req,
			     struct buffer_head *bh, int max_segments)
{
	if (bh->b_data + bh->b_size == req->bh->b_data)
		return 1;
	return ll_new_segment(q, req, max_segments);
}

static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
				struct request *next, int max_segments)
{
	int total_segments = req->nr_segments + next->nr_segments;
	int same_segment;

	same_segment = 0;
	if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) {
		total_segments--;
		same_segment = 1;
	}

	if (total_segments > max_segments)
		return 0;

	q->elevator.nr_segments -= same_segment;
	req->nr_segments = total_segments;
	return 1;
}

/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
 * This is called with interrupts off and no requests on the queue.
 * (and with the request spinlock acquired)
 */
static void generic_plug_device(request_queue_t *q, kdev_t dev)
{
	/*
	 * no need to replug device
	 */
	if (!list_empty(&q->queue_head) || q->plugged)
		return;

	q->plugged = 1;
	queue_task(&q->plug_tq, &tq_disk);
}

/*
 * remove the plug and let it rip..
 */
static inline void __generic_unplug_device(request_queue_t *q)
{
	if (q->plugged) {
		q->plugged = 0;
		if (!list_empty(&q->queue_head))
			q->request_fn(q);
	}
}

static void generic_unplug_device(void *data)
{
	request_queue_t *q = (request_queue_t *) data;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	__generic_unplug_device(q);
	spin_unlock_irqrestore(&io_request_lock, flags);
}
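
/*
 * Illustrative note (not part of this file): the plug_tq entries queued by
 * generic_plug_device() run when somebody flushes the tq_disk task queue,
 * typically via
 *
 *	run_task_queue(&tq_disk);
 *
 * from the buffer-wait paths, which is what eventually invokes
 * generic_unplug_device() above for each plugged queue.
 */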

static void blk_init_free_list(request_queue_t *q)
{
	struct request *rq;
	int i;

	/*
	 * Divide requests in half between read and write. This used to
	 * be a 2/3 advantage for reads, but now reads can steal from
	 * the write free list.
	 */
	for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
		rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
		rq->rq_status = RQ_INACTIVE;
		list_add(&rq->table, &q->request_freelist[i & 1]);
	}

	init_waitqueue_head(&q->wait_for_request);
	spin_lock_init(&q->request_lock);
}

static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);

/**
 * blk_init_queue - prepare a request queue for use with a block device
 * @q:    The &request_queue_t to be initialised
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    A global spin lock $io_request_lock must be held while manipulating the
 *    requests on the request queue.
 *
 *    The request on the head of the queue is by default assumed to be
 *    potentially active, and it is not considered for re-ordering or merging
 *    whenever the given queue is unplugged. This behaviour can be changed with
 *    blk_queue_headactive().
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/
void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
{
	INIT_LIST_HEAD(&q->queue_head);
	INIT_LIST_HEAD(&q->request_freelist[READ]);
	INIT_LIST_HEAD(&q->request_freelist[WRITE]);
	elevator_init(&q->elevator, ELEVATOR_LINUS);
	blk_init_free_list(q);
	q->request_fn = rfn;
	q->back_merge_fn = ll_back_merge_fn;
	q->front_merge_fn = ll_front_merge_fn;
	q->merge_requests_fn = ll_merge_requests_fn;
	q->make_request_fn = __make_request;
	q->plug_tq.sync = 0;
	q->plug_tq.routine = &generic_unplug_device;
	q->plug_tq.data = q;
	q->plugged = 0;
	/*
	 * These booleans describe the queue properties.  We set the
	 * default (and most common) values here.  Other drivers can
	 * use the appropriate functions to alter the queue properties
	 * as appropriate.
	 */
	q->plug_device_fn = generic_plug_device;
	q->head_active = 1;
}
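
/*
 * Illustrative sketch (not part of this file): a conventional driver pairs
 * blk_init_queue() with blk_cleanup_queue() and drains its queue from the
 * request function, roughly as below.  The device name, MAJOR_NR handling
 * and the transfer itself are hypothetical and driver-specific:
 *
 *	static void mydev_request_fn(request_queue_t *q)
 *	{
 *		struct request *req;
 *
 *		while (!list_empty(&q->queue_head)) {
 *			req = blkdev_entry_next_request(&q->queue_head);
 *			... start or complete the transfer described by req ...
 *		}
 *	}
 *
 *	int __init mydev_init(void)
 *	{
 *		blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), mydev_request_fn);
 *		return 0;
 *	}
 *
 *	void mydev_exit(void)
 *	{
 *		blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));
 *	}
 */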

#define blkdev_free_rq(list) list_entry((list)->next, struct request, table);
/*
 * Get a free request. io_request_lock must be held and interrupts
 * disabled on the way in.
 */
static inline struct request *get_request(request_queue_t *q, int rw)
{
	struct list_head *list = &q->request_freelist[rw];
	struct request *rq;

	/*
	 * Reads get preferential treatment and are allowed to steal
	 * from the write free list if necessary.
	 */
	if (!list_empty(list)) {
		rq = blkdev_free_rq(list);
		goto got_rq;
	}

	/*
	 * if the WRITE list is non-empty, we know that rw is READ
	 * and that the READ list is empty. allow reads to 'steal'
	 * from the WRITE list.
	 */
	if (!list_empty(&q->request_freelist[WRITE])) {
		list = &q->request_freelist[WRITE];
		rq = blkdev_free_rq(list);
		goto got_rq;
	}

	return NULL;

got_rq:
	list_del(&rq->table);
	rq->free_list = list;
	rq->rq_status = RQ_ACTIVE;
	rq->special = NULL;
	rq->q = q;
	return rq;
}

/*
 * No available requests for this queue, unplug the device.
 */
static struct request *__get_request_wait(request_queue_t *q, int rw)
{
	register struct request *rq;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&q->wait_for_request, &wait);
	for (;;) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_lock_irq(&io_request_lock);
		rq = get_request(q, rw);
		spin_unlock_irq(&io_request_lock);
		if (rq)
			break;
		generic_unplug_device(q);
		schedule();
	}
	remove_wait_queue(&q->wait_for_request, &wait);
	current->state = TASK_RUNNING;
	return rq;
}

static inline struct request *get_request_wait(request_queue_t *q, int rw)
{
	register struct request *rq;

	spin_lock_irq(&io_request_lock);
	rq = get_request(q, rw);
	spin_unlock_irq(&io_request_lock);
	if (rq)
		return rq;
	return __get_request_wait(q, rw);
}

/* RO fail safe mechanism */

static long ro_bits[MAX_BLKDEV][8];

int is_read_only(kdev_t dev)
{
	int minor,major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV) return 0;
	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
}

void set_device_ro(kdev_t dev,int flag)
{
	int minor,major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV) return;
	if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
	else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
}

inline void drive_stat_acct (kdev_t dev, int rw,
				unsigned long nr_sectors, int new_io)
{
	unsigned int major = MAJOR(dev);
	unsigned int index;

	index = disk_index(dev);
	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
		return;

	kstat.dk_drive[major][index] += new_io;
	if (rw == READ) {
		kstat.dk_drive_rio[major][index] += new_io;
		kstat.dk_drive_rblk[major][index] += nr_sectors;
	} else if (rw == WRITE) {
		kstat.dk_drive_wio[major][index] += new_io;
		kstat.dk_drive_wblk[major][index] += nr_sectors;
	} else
		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}

/*
 * add-request adds a request to the linked list.
 * It disables interrupts (acquires the request spinlock) so that it can muck
 * with the request-lists in peace. Thus it should be called with no spinlocks
 * held.
 *
 * By this point, req->cmd is always either READ/WRITE, never READA,
 * which is important for drive_stat_acct() above.
 */
static inline void add_request(request_queue_t * q, struct request * req,
			       struct list_head *head, int lat)
{
	int major;

	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);

	/*
	 * let selected elevator insert the request
	 */
	q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, lat);

	/*
	 * FIXME(eric) I don't understand why there is a need for this
	 * special case code.  It clearly doesn't fit any more with
	 * the new queueing architecture, and it got added in 2.3.10.
	 * I am leaving this in here until I hear back from the COMPAQ
	 * people.
	 */
	major = MAJOR(req->rq_dev);
	if (major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7)
		(q->request_fn)(q);
	if (major >= COMPAQ_CISS_MAJOR+0 && major <= COMPAQ_CISS_MAJOR+7)
		(q->request_fn)(q);
	if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
		(q->request_fn)(q);
}

/*
 * Must be called with io_request_lock held and interrupts disabled
 */
void inline blkdev_release_request(struct request *req)
{
	req->rq_status = RQ_INACTIVE;

	/*
	 * Request may not have originated from ll_rw_blk
	 */
	if (req->free_list) {
		list_add(&req->table, req->free_list);
		req->free_list = NULL;
		wake_up(&req->q->wait_for_request);
	}
}

/*
 * Has to be called with the request spinlock acquired
 */
static void attempt_merge(request_queue_t * q,
			  struct request *req,
			  int max_sectors,
			  int max_segments)
{
	struct request *next;

	next = blkdev_next_request(req);
	if (req->sector + req->nr_sectors != next->sector)
		return;
	if (req->cmd != next->cmd
	    || req->rq_dev != next->rq_dev
	    || req->nr_sectors + next->nr_sectors > max_sectors
	    || next->sem)
		return;
	/*
	 * If we are not allowed to merge these requests, then
	 * return.  If we are allowed to merge, then the count
	 * will have been updated to the appropriate number,
	 * and we shouldn't do it here too.
	 */
	if (!(q->merge_requests_fn)(q, req, next, max_segments))
		return;

	req->bhtail->b_reqnext = next->bh;
	req->bhtail = next->bhtail;
	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
	list_del(&next->queue);
	blkdev_release_request(next);
}

static inline void attempt_back_merge(request_queue_t * q,
				      struct request *req,
				      int max_sectors,
				      int max_segments)
{
	if (&req->queue == q->queue_head.prev)
		return;
	attempt_merge(q, req, max_sectors, max_segments);
}

static inline void attempt_front_merge(request_queue_t * q,
				       struct list_head * head,
				       struct request *req,
				       int max_sectors,
				       int max_segments)
{
	struct list_head * prev;

	prev = req->queue.prev;
	if (head == prev)
		return;
	attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
}

static int __make_request(request_queue_t * q, int rw,
			  struct buffer_head * bh)
{
	unsigned int sector, count;
	int max_segments = MAX_SEGMENTS;
	struct request * req = NULL, *freereq = NULL;
	int rw_ahead, max_sectors, el_ret;
	struct list_head *head;
	int latency;
	elevator_t *elevator = &q->elevator;

	count = bh->b_size >> 9;
	sector = bh->b_rsector;

	rw_ahead = 0;	/* normal case; gets changed below for READA */
	switch (rw) {
		case READA:
			rw_ahead = 1;
			rw = READ;	/* drop into READ */
		case READ:
		case WRITE:
			break;
		default:
			BUG();
			goto end_io;
	}

	/* We'd better have a real physical mapping!
	   Check this bit only if the buffer was dirty and just locked
	   down by us so at this point flushpage will block and
	   won't clear the mapped bit under us. */
	if (!buffer_mapped(bh))
		BUG();

	/*
	 * Temporary solution - in 2.5 this will be done by the lowlevel
	 * driver. Create a bounce buffer if the buffer data points into
	 * high memory - keep the original buffer otherwise.
	 */
#if CONFIG_HIGHMEM
	bh = create_bounce(rw, bh);
#endif

/* look for a free request. */
	/*
	 * Try to coalesce the new request with old requests
	 */
	max_sectors = get_max_sectors(bh->b_rdev);

	latency = elevator_request_latency(elevator, rw);

	/*
	 * Now we acquire the request spinlock, we have to be mega careful
	 * not to schedule or do something nonatomic
	 */
again:
	spin_lock_irq(&io_request_lock);

	/*
	 * skip first entry, for devices with active queue head
	 */
	head = &q->queue_head;
	if (q->head_active && !q->plugged)
		head = head->next;

	if (list_empty(head)) {
		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
		goto get_rq;
	}

	el_ret = elevator->elevator_merge_fn(q, &req, bh, rw,
					     &max_sectors, &max_segments);
	switch (el_ret) {

		case ELEVATOR_BACK_MERGE:
			if (!q->back_merge_fn(q, req, bh, max_segments))
				break;
			req->bhtail->b_reqnext = bh;
			req->bhtail = bh;
			req->nr_sectors = req->hard_nr_sectors += count;
			req->e = elevator;
			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
			attempt_back_merge(q, req, max_sectors, max_segments);
			goto out;

		case ELEVATOR_FRONT_MERGE:
			if (!q->front_merge_fn(q, req, bh, max_segments))
				break;
			bh->b_reqnext = req->bh;
			req->bh = bh;
			req->buffer = bh->b_data;
			req->current_nr_sectors = count;
			req->sector = req->hard_sector = sector;
			req->nr_sectors = req->hard_nr_sectors += count;
			req->e = elevator;
			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
			attempt_front_merge(q, head, req, max_sectors, max_segments);
			goto out;

		/*
		 * elevator says don't/can't merge. get new request
		 */
		case ELEVATOR_NO_MERGE:
			break;

		default:
			printk("elevator returned crap (%d)\n", el_ret);
			BUG();
	}

	/*
	 * Grab a free request from the freelist. Reads first try their
	 * own queue - if that is empty, we steal from the write list.
	 * Writes must block if the write list is empty, and read aheads
	 * are not crucial.
	 */
get_rq:
	if (freereq) {
		req = freereq;
		freereq = NULL;
	} else if ((req = get_request(q, rw)) == NULL) {
		spin_unlock_irq(&io_request_lock);
		if (rw_ahead)
			goto end_io;

		freereq = __get_request_wait(q, rw);
		goto again;
	}

	/* fill up the request-info, and add it to the queue */
	req->cmd = rw;
	req->errors = 0;
	req->hard_sector = req->sector = sector;
	req->hard_nr_sectors = req->nr_sectors = count;
	req->current_nr_sectors = count;
	req->nr_segments = 1; /* Always 1 for a new request. */
	req->nr_hw_segments = 1; /* Always 1 for a new request. */
	req->buffer = bh->b_data;
	req->sem = NULL;
	req->bh = bh;
	req->bhtail = bh;
	req->rq_dev = bh->b_rdev;
	req->e = elevator;
	add_request(q, req, head, latency);
out:
	if (!q->plugged)
		(q->request_fn)(q);
	if (freereq)
		blkdev_release_request(freereq);
	spin_unlock_irq(&io_request_lock);
	return 0;
end_io:
	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
	return 0;
}

/**
 * generic_make_request: hand a buffer head to its device driver for I/O
 * @rw:  READ, WRITE, or READA - what sort of I/O is desired.
 * @bh:  The buffer head describing the location in memory and on the device.
 *
 * generic_make_request() is used to make I/O requests of block
 * devices. It is passed a &struct buffer_head and a &rw value.  The
 * %READ and %WRITE options are (hopefully) obvious in meaning.  The
 * %READA value means that a read is required, but that the driver is
 * free to fail the request if, for example, it cannot get needed
 * resources immediately.
 *
 * generic_make_request() does not return any status.  The
 * success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the bh->b_end_io
 * function described (one day) elsewhere.
 *
 * The caller of generic_make_request must make sure that b_page,
 * b_addr, b_size are set to describe the memory buffer, that b_rdev
 * and b_rsector are set to describe the device address, and the
 * b_end_io and optionally b_private are set to describe how
 * completion notification should be signaled.  BH_Mapped should also
 * be set (to confirm that b_dev and b_blocknr are valid).
 *
 * generic_make_request and the drivers it calls may use b_reqnext,
 * and may change b_rdev and b_rsector.  So the values of these fields
 * should NOT be depended on after the call to generic_make_request.
 * Because of this, the caller should record the device address
 * information in b_dev and b_blocknr.
 *
 * Apart from those fields mentioned above, no other fields, and in
 * particular, no other flags, are changed by generic_make_request or
 * any lower level drivers.
 * */
void generic_make_request (int rw, struct buffer_head * bh)
{
	int major = MAJOR(bh->b_rdev);
	request_queue_t *q;

	if (!bh->b_end_io) BUG();
	if (blk_size[major]) {
		unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
		unsigned int sector, count;

		count = bh->b_size >> 9;
		sector = bh->b_rsector;

		if (maxsector < count || maxsector - count < sector) {
			bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
			if (blk_size[major][MINOR(bh->b_rdev)]) {

				/* This may well happen - the kernel calls bread()
				   without checking the size of the device, e.g.,
				   when mounting a device. */
				printk(KERN_INFO
				       "attempt to access beyond end of device\n");
				printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
				       kdevname(bh->b_rdev), rw,
				       (sector + count)>>1,
				       blk_size[major][MINOR(bh->b_rdev)]);
			}
			bh->b_end_io(bh, 0);
			return;
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 */
	/* NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	do {
		q = blk_get_queue(bh->b_rdev);
		if (!q) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access nonexistent block-device %s (%ld)\n",
			       kdevname(bh->b_rdev), bh->b_rsector);
			buffer_IO_error(bh);
			break;
		}
	} while (q->make_request_fn(q, rw, bh));
}

/**
 * submit_bh: submit a buffer_head to the block device layer for I/O
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
 * @bh: The &struct buffer_head which describes the I/O
 *
 * submit_bh() is very similar in purpose to generic_make_request(), and
 * uses that function to do most of the work.
 *
 * The extra functionality provided by submit_bh is to determine
 * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
 * This is appropriate for IO requests that come from the buffer
 * cache and page cache which (currently) always use aligned blocks.
 */
void submit_bh(int rw, struct buffer_head * bh)
{
	if (!test_bit(BH_Lock, &bh->b_state))
		BUG();

	set_bit(BH_Req, &bh->b_state);

	/*
	 * First step, 'identity mapping' - RAID or LVM might
	 * further remap this.
	 */
	bh->b_rdev = bh->b_dev;
	bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);

	generic_make_request(rw, bh);

	switch (rw) {
		case WRITE:
			kstat.pgpgout++;
			break;
		default:
			kstat.pgpgin++;
			break;
	}
}
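
/*
 * Illustrative sketch (not part of this file): a caller that wants its own
 * completion handler bypasses ll_rw_block() and calls submit_bh() on a
 * locked, mapped buffer directly, roughly (my_end_io is hypothetical):
 *
 *	lock_buffer(bh);
 *	bh->b_end_io = my_end_io;
 *	submit_bh(READ, bh);
 */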

/*
 * Default IO end handler, used by "ll_rw_block()".
 */
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}

/**
 * ll_rw_block: low-level access to block devices
 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
 * and requests an I/O operation on them, either a %READ or a %WRITE.
 * The third %READA option is described in the documentation for
 * generic_make_request() which ll_rw_block() calls.
 *
 * This function provides extra functionality that is not in
 * generic_make_request() that is relevant to buffers in the buffer
 * cache or page cache.  In particular it drops any buffer that it
 * cannot get a lock on (with the BH_Lock state bit), any buffer that
 * appears to be clean when doing a write request, and any buffer that
 * appears to be up-to-date when doing a read request.  Further it marks
 * as clean buffers that are processed for writing (the buffer cache
 * won't assume that they are actually clean until the buffer gets
 * unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.  A client that needs a more interesting completion
 * routine should call submit_bh() (or generic_make_request())
 * directly.
 *
 * Caveat:
 *  All of the buffers must be for the same device, and must also be
 *  of the current approved size for the device. */

void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
{
	unsigned int major;
	int correct_size;
	int i;

	major = MAJOR(bhs[0]->b_dev);

	/* Determine correct block size for this device. */
	correct_size = BLOCK_SIZE;
	if (blksize_size[major]) {
		i = blksize_size[major][MINOR(bhs[0]->b_dev)];
		if (i)
			correct_size = i;
	}

	/* Verify requested block sizes. */
	for (i = 0; i < nr; i++) {
		struct buffer_head *bh;
		bh = bhs[i];
		if (bh->b_size != correct_size) {
			printk(KERN_NOTICE "ll_rw_block: device %s: "
			       "only %d-char blocks implemented (%u)\n",
			       kdevname(bhs[0]->b_dev),
			       correct_size, bh->b_size);
			goto sorry;
		}
	}

	if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
		printk(KERN_NOTICE "Can't write to read-only device %s\n",
		       kdevname(bhs[0]->b_dev));
		goto sorry;
	}

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh;
		bh = bhs[i];

		/* Only one thread can actually submit the I/O. */
		if (test_and_set_bit(BH_Lock, &bh->b_state))
			continue;

		/* We have the buffer lock */
		bh->b_end_io = end_buffer_io_sync;

		switch(rw) {
		case WRITE:
			if (!atomic_set_buffer_clean(bh))
				/* Hmmph! Nothing to write */
				goto end_io;
			__mark_buffer_clean(bh);
			break;

		case READA:
		case READ:
			if (buffer_uptodate(bh))
				/* Hmmph! Already have it */
				goto end_io;
			break;
		default:
			BUG();
	end_io:
			bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
			continue;
		}

		submit_bh(rw, bh);
	}
	return;

sorry:
	/* Make sure we don't get infinite dirty retries.. */
	for (i = 0; i < nr; i++)
		mark_buffer_clean(bhs[i]);
}
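
/*
 * Illustrative sketch (not part of this file): a synchronous read through
 * the buffer cache, essentially what bread() does, assuming dev/block/size
 * describe a valid block on the device:
 *
 *	struct buffer_head *bh = getblk(dev, block, size);
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *	}
 */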

#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init (void);
#endif

/*
 * First step of what used to be end_request
 *
 * 0 means continue with end_that_request_last,
 * 1 means we are done
 */
int end_that_request_first (struct request *req, int uptodate, char *name)
{
	struct buffer_head * bh;
	int nsect;

	req->errors = 0;
	if (!uptodate)
		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
			kdevname(req->rq_dev), name, req->sector);

	if ((bh = req->bh) != NULL) {
		nsect = bh->b_size >> 9;
		req->bh = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, uptodate);
		if ((bh = req->bh) != NULL) {
			req->hard_sector += nsect;
			req->hard_nr_sectors -= nsect;
			req->sector = req->hard_sector;
			req->nr_sectors = req->hard_nr_sectors;

			req->current_nr_sectors = bh->b_size >> 9;
			if (req->nr_sectors < req->current_nr_sectors) {
				req->nr_sectors = req->current_nr_sectors;
				printk("end_request: buffer-list destroyed\n");
			}
			req->buffer = bh->b_data;
			return 1;
		}
	}
	return 0;
}

void end_that_request_last(struct request *req)
{
	if (req->e) {
		printk("end_that_request_last called with non-dequeued req\n");
		BUG();
	}
	if (req->sem != NULL)
		up(req->sem);

	blkdev_release_request(req);
}
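
/*
 * Illustrative sketch (not part of this file): the usual driver completion
 * path, roughly what the end_request() helper in blk.h does, walks the
 * buffer list with end_that_request_first() and only dequeues and finishes
 * the request once that returns 0 (DEVICE_NAME is driver-provided):
 *
 *	if (!end_that_request_first(req, uptodate, DEVICE_NAME)) {
 *		blkdev_dequeue_request(req);
 *		end_that_request_last(req);
 *	}
 */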

int __init blk_dev_init(void)
{
	struct blk_dev_struct *dev;

	request_cachep = kmem_cache_create("blkdev_requests",
					   sizeof(struct request),
					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);

	if (!request_cachep)
		panic("Can't create request pool slab cache\n");

	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
		dev->queue = NULL;

	memset(ro_bits,0,sizeof(ro_bits));
	memset(max_readahead, 0, sizeof(max_readahead));
	memset(max_sectors, 0, sizeof(max_sectors));
#ifdef CONFIG_AMIGA_Z2RAM
	z2_init();
#endif
#ifdef CONFIG_STRAM_SWAP
	stram_device_init();
#endif
#ifdef CONFIG_BLK_DEV_RAM
	rd_init();
#endif
#ifdef CONFIG_BLK_DEV_LOOP
	loop_init();
#endif
#ifdef CONFIG_ISP16_CDI
	isp16_init();
#endif
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
	ide_init();		/* this MUST precede hd_init */
#endif
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
	hd_init();
#endif
#ifdef CONFIG_BLK_DEV_PS2
	ps2esdi_init();
#endif
#ifdef CONFIG_BLK_DEV_XD
	xd_init();
#endif
#ifdef CONFIG_BLK_DEV_MFM
	mfm_init();
#endif
#ifdef CONFIG_PARIDE
	{ extern void paride_init(void); paride_init(); };
#endif
#ifdef CONFIG_MAC_FLOPPY
	swim3_init();
#endif
#ifdef CONFIG_BLK_DEV_SWIM_IOP
	swimiop_init();
#endif
#ifdef CONFIG_AMIGA_FLOPPY
	amiga_floppy_init();
#endif
#ifdef CONFIG_ATARI_FLOPPY
	atari_floppy_init();
#endif
#ifdef CONFIG_BLK_DEV_FD
	floppy_init();
#else
#if defined(__i386__)	/* Do we even need this? */
	outb_p(0xc, 0x3f2);
#endif
#endif
#ifdef CONFIG_CDU31A
	cdu31a_init();
#endif
#ifdef CONFIG_ATARI_ACSI
	acsi_init();
#endif
#ifdef CONFIG_MCD
	mcd_init();
#endif
#ifdef CONFIG_MCDX
	mcdx_init();
#endif
#ifdef CONFIG_SBPCD
	sbpcd_init();
#endif
#ifdef CONFIG_AZTCD
	aztcd_init();
#endif
#ifdef CONFIG_CDU535
	sony535_init();
#endif
#ifdef CONFIG_GSCD
	gscd_init();
#endif
#ifdef CONFIG_CM206
	cm206_init();
#endif
#ifdef CONFIG_OPTCD
	optcd_init();
#endif
#ifdef CONFIG_SJCD
	sjcd_init();
#endif
#ifdef CONFIG_APBLOCK
	ap_init();
#endif
#ifdef CONFIG_DDV
	ddv_init();
#endif
#ifdef CONFIG_BLK_DEV_NBD
	nbd_init();
#endif
#ifdef CONFIG_MDISK
	mdisk_init();
#endif
#ifdef CONFIG_DASD
	dasd_init();
#endif
#ifdef CONFIG_SUN_JSFLASH
	jsfd_init();
#endif
#ifdef CONFIG_BLK_DEV_LVM
	lvm_init();
#endif
	return 0;
}

EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);
EXPORT_SYMBOL(blk_init_queue);
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
EXPORT_SYMBOL(blk_queue_headactive);
EXPORT_SYMBOL(blk_queue_pluggable);
EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);