/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/init.h>
#include <linux/smp_lock.h>

#include <asm/system.h>

#include <linux/blk.h>

#include <linux/module.h>
/*
 * MAC Floppy IWM hooks
 */

#ifdef CONFIG_MAC_FLOPPY_IWM
extern int mac_floppy_init(void);
#endif
/*
 * The request-struct contains all necessary data
 * to load a nr of sectors into memory
 */
static struct request all_requests[NR_REQUEST];
/*
 * The "disk" task queue is used to start the actual requests
 */
DECLARE_TASK_QUEUE(tq_disk);
/*
 * Protect the request list against multiple users..
 *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations,
 * and the do_request() side is still not SMP-safe; the kernel lock
 * protects that part for now).
 *
 * There is a fair chance that things will work just OK if these functions
 * are called with no global kernel lock held ...
 */
spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
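
/*
 * Illustrative sketch (hypothetical caller): code that inspects or edits a
 * request list from process context is expected to take this lock with
 * interrupts disabled, since completion handlers acquire it from IRQ
 * context as well:
 *
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&io_request_lock, flags);
 *	... look at or modify blk_dev[major].current_request ...
 *	spin_unlock_irqrestore(&io_request_lock, flags);
 */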
/*
 * used to wait on when there are no free requests
 */
DECLARE_WAIT_QUEUE_HEAD(wait_for_request);
/* This specifies how many sectors to read ahead on the disk. */

int read_ahead[MAX_BLKDEV] = {0, };
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/*
 * blk_size contains the size of all block-devices in units of 1024 bytes:
 *
 * blk_size[MAJOR][MINOR]
 *
 * if (!blk_size[MAJOR]) then no minor size checking is done.
 */
int * blk_size[MAX_BLKDEV] = { NULL, NULL, };
/*
 * blksize_size contains the size of all block-devices:
 *
 * blksize_size[MAJOR][MINOR]
 *
 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 */
int * blksize_size[MAX_BLKDEV] = { NULL, NULL, };
/*
 * hardsect_size contains the size of the hardware sector of a device.
 *
 * hardsect_size[MAJOR][MINOR]
 *
 * if (!hardsect_size[MAJOR])
 *		then 512 bytes is assumed.
 * else
 *		sector_size is hardsect_size[MAJOR][MINOR]
 *
 * This is currently set by some scsi devices and read by the msdos fs driver.
 * Other uses may appear later.
 */
int * hardsect_size[MAX_BLKDEV] = { NULL, NULL, };
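
/*
 * Illustrative sketch (hypothetical driver): a block driver normally
 * publishes its per-minor tables from its init routine.  MY_MAJOR, my_sizes,
 * my_blksizes, my_hardsects and my_request_fn are made-up names; the arrays
 * on the left are the real exports above:
 *
 *	blk_size[MY_MAJOR]      = my_sizes;	(device sizes, in kilobytes)
 *	blksize_size[MY_MAJOR]  = my_blksizes;	(soft block size per minor)
 *	hardsect_size[MY_MAJOR] = my_hardsects;	(hardware sector size)
 *	read_ahead[MY_MAJOR]    = 8;		(read-ahead, in sectors)
 *	blk_dev[MY_MAJOR].request_fn = my_request_fn;
 */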
/*
 * The following tunes the read-ahead algorithm in mm/filemap.c
 */
int * max_readahead[MAX_BLKDEV] = { NULL, NULL, };
/*
 * Max number of sectors per request
 */
int * max_sectors[MAX_BLKDEV] = { NULL, NULL, };
/*
 * Max number of segments per request
 */
int * max_segments[MAX_BLKDEV] = { NULL, NULL, };
static inline int get_max_sectors(kdev_t dev)
{
        if (!max_sectors[MAJOR(dev)])
                return MAX_SECTORS;
        return max_sectors[MAJOR(dev)][MINOR(dev)];
}
static inline int get_max_segments(kdev_t dev)
{
        if (!max_segments[MAJOR(dev)])
                return MAX_SEGMENTS;
        return max_segments[MAJOR(dev)][MINOR(dev)];
}
/*
 * Is called with the request spinlock acquired.
 * NOTE: the device-specific queue() functions
 *       have to be atomic!
 */
static inline struct request **get_queue(kdev_t dev)
{
        int major = MAJOR(dev);
        struct blk_dev_struct *bdev = blk_dev + major;

        if (bdev->queue)
                return bdev->queue(dev);
        return &blk_dev[major].current_request;
}
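
/*
 * Most drivers leave blk_dev[major].queue NULL and share the single
 * current_request list; a driver that keeps several internal lists may
 * instead install a queue() hook that maps a kdev_t to the right head,
 * and get_queue() calls it with io_request_lock held.  A hypothetical
 * hook (my_queue, my_queues and MY_MAJOR are made-up names) could look like:
 *
 *	static struct request **my_queue(kdev_t dev)
 *	{
 *		return &my_queues[MINOR(dev) >> 4];
 *	}
 *
 *	blk_dev[MY_MAJOR].queue = my_queue;
 */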
/*
 * remove the plug and let it rip..
 */
void unplug_device(void * data)
{
        struct blk_dev_struct * dev = (struct blk_dev_struct *) data;
        int queue_new_request = 0;
        unsigned long flags;

        spin_lock_irqsave(&io_request_lock,flags);
        if (dev->current_request == &dev->plug) {
                struct request * next = dev->plug.next;
                dev->current_request = next;
                if (next || dev->queue) {
                        dev->plug.next = NULL;
                        queue_new_request = 1;
                }
        }
        if (queue_new_request)
                (dev->request_fn)();

        spin_unlock_irqrestore(&io_request_lock,flags);
}
179 * "plug" the device if there are no outstanding requests: this will
180 * force the transfer to start only after we have put all the requests
183 * This is called with interrupts off and no requests on the queue.
184 * (and with the request spinlock aquired)
186 static inline void plug_device(struct blk_dev_struct
* dev
)
188 if (dev
->current_request
)
190 dev
->current_request
= &dev
->plug
;
191 queue_task(&dev
->plug_tq
, &tq_disk
);
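
/*
 * Usage sketch (hypothetical caller): a plugged device is restarted by
 * running the disk task queue, which calls unplug_device() above for every
 * device that queued its plug_tq.  A caller that has queued buffers and
 * wants the I/O to start now would do something like:
 *
 *	ll_rw_block(READ, 1, &bh);
 *	run_task_queue(&tq_disk);	(kicks unplug_device -> request_fn)
 *	wait_on_buffer(bh);
 */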
/*
 * look for a free request in the first N entries.
 * NOTE: interrupts must be disabled on the way in (on SMP the request queue
 * spinlock has to be acquired), and will still be disabled on the way out.
 */
static inline struct request * get_request(int n, kdev_t dev)
{
        static struct request *prev_found = NULL, *prev_limit = NULL;
        register struct request *req, *limit;

        if (n <= 0)
                panic("get_request(%d): impossible!\n", n);

        limit = all_requests + n;
        if (limit != prev_limit) {
                prev_limit = limit;
                prev_found = all_requests;
        }
        req = prev_found;
        for (;;) {
                req = ((req > all_requests) ? req : limit) - 1;
                if (req->rq_status == RQ_INACTIVE)
                        break;
                if (req == prev_found)
                        return NULL;
        }
        prev_found = req;
        req->rq_status = RQ_ACTIVE;
        req->rq_dev = dev;
        return req;
}
/*
 * wait until a free request in the first N entries is available.
 */
static struct request * __get_request_wait(int n, kdev_t dev)
{
        register struct request *req;
        DECLARE_WAITQUEUE(wait, current);
        unsigned long flags;

        add_wait_queue(&wait_for_request, &wait);
        for (;;) {
                current->state = TASK_UNINTERRUPTIBLE;
                spin_lock_irqsave(&io_request_lock,flags);
                req = get_request(n, dev);
                spin_unlock_irqrestore(&io_request_lock,flags);
                if (req)
                        break;
                run_task_queue(&tq_disk);
                schedule();
        }
        remove_wait_queue(&wait_for_request, &wait);
        current->state = TASK_RUNNING;
        return req;
}
static inline struct request * get_request_wait(int n, kdev_t dev)
{
        register struct request *req;
        unsigned long flags;

        spin_lock_irqsave(&io_request_lock,flags);
        req = get_request(n, dev);
        spin_unlock_irqrestore(&io_request_lock,flags);
        if (req)
                return req;
        return __get_request_wait(n, dev);
}
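
/*
 * Summary of the allocation helpers above: get_request() never sleeps and
 * returns NULL when no slot in the first N entries is inactive;
 * get_request_wait() tries once under the lock and only then falls back to
 * __get_request_wait(), which sleeps on wait_for_request until a slot is
 * freed again by end_that_request_last() or attempt_merge() below.
 */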
/* RO fail safe mechanism */

static long ro_bits[MAX_BLKDEV][8];

int is_read_only(kdev_t dev)
{
        int minor,major;

        major = MAJOR(dev);
        minor = MINOR(dev);
        if (major < 0 || major >= MAX_BLKDEV) return 0;
        return ro_bits[major][minor >> 5] & (1 << (minor & 31));
}
void set_device_ro(kdev_t dev,int flag)
{
        int minor,major;

        major = MAJOR(dev);
        minor = MINOR(dev);
        if (major < 0 || major >= MAX_BLKDEV) return;
        if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
        else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
}
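
/*
 * Illustrative sketch (hypothetical ioctl handler): drivers typically flip
 * the RO bit when user space issues BLKROSET, and the write path honours it
 * through the is_read_only() check in ll_rw_block().  Here intval, arg and
 * inode come from the surrounding (made-up) ioctl function:
 *
 *	case BLKROSET:
 *		if (get_user(intval, (int *) arg))
 *			return -EFAULT;
 *		set_device_ro(inode->i_rdev, intval != 0);
 *		return 0;
 */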
static inline void drive_stat_acct(int cmd, unsigned long nr_sectors,
                                   short disk_index)
{
        kstat.dk_drive[disk_index]++;
        if (cmd == READ) {
                kstat.dk_drive_rio[disk_index]++;
                kstat.dk_drive_rblk[disk_index] += nr_sectors;
        } else if (cmd == WRITE) {
                kstat.dk_drive_wio[disk_index]++;
                kstat.dk_drive_wblk[disk_index] += nr_sectors;
        } else
                printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}
/*
 * add-request adds a request to the linked list.
 * It disables interrupts (acquires the request spinlock) so that it can muck
 * with the request-lists in peace. Thus it should be called with no spinlocks
 * held.
 *
 * By this point, req->cmd is always either READ/WRITE, never READA,
 * which is important for drive_stat_acct() above.
 */
void add_request(struct blk_dev_struct * dev, struct request * req)
{
        int major = MAJOR(req->rq_dev);
        int minor = MINOR(req->rq_dev);
        struct request * tmp, **current_request;
        short disk_index;
        unsigned long flags;
        int queue_new_request = 0;

        switch (major) {
                case DAC960_MAJOR+0:
                        disk_index = (minor & 0x00f8) >> 3;
                        if (disk_index < 4)
                                drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
                        break;
                case SCSI_DISK0_MAJOR:
                        disk_index = (minor & 0x00f0) >> 4;
                        if (disk_index < 4)
                                drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
                        break;
                case IDE0_MAJOR:        /* same as HD_MAJOR */
                case XT_DISK_MAJOR:
                        disk_index = (minor & 0x0040) >> 6;
                        drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
                        break;
                case IDE1_MAJOR:
                        disk_index = ((minor & 0x0040) >> 6) + 2;
                        drive_stat_acct(req->cmd, req->nr_sectors, disk_index);
                default:
                        break;
        }

        req->next = NULL;

        /*
         * We use the goto to reduce locking complexity
         */
        spin_lock_irqsave(&io_request_lock,flags);
        current_request = get_queue(req->rq_dev);

        if (!(tmp = *current_request)) {
                *current_request = req;
                if (dev->current_request != &dev->plug)
                        queue_new_request = 1;
                goto out;
        }
        for ( ; tmp->next ; tmp = tmp->next) {
                const int after_current = IN_ORDER(tmp,req);
                const int before_next = IN_ORDER(req,tmp->next);

                if (!IN_ORDER(tmp,tmp->next)) {
                        if (after_current || before_next)
                                break;
                } else {
                        if (after_current && before_next)
                                break;
                }
        }
        req->next = tmp->next;
        tmp->next = req;

        /* for SCSI devices, call request_fn unconditionally */
        if (scsi_blk_major(major))
                queue_new_request = 1;
        if (major >= COMPAQ_SMART2_MAJOR+0 &&
            major <= COMPAQ_SMART2_MAJOR+7)
                queue_new_request = 1;
        if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
                queue_new_request = 1;
out:
        if (queue_new_request)
                (dev->request_fn)();
        spin_unlock_irqrestore(&io_request_lock,flags);
}
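
/*
 * Note on the ordering above: IN_ORDER() is the elevator comparison macro
 * from <linux/blk.h>; the insertion loop walks the list until the new
 * request sorts between tmp and tmp->next, or until the sort order wraps
 * (IN_ORDER(tmp,tmp->next) failing), so the queue stays roughly sorted by
 * device and sector and seek time is reduced.
 */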
/*
 * Has to be called with the request spinlock acquired
 */
static inline void attempt_merge (struct request *req, int max_sectors)
{
        struct request *next = req->next;

        if (!next)
                return;
        if (req->sector + req->nr_sectors != next->sector)
                return;
        if (next->sem || req->cmd != next->cmd || req->rq_dev != next->rq_dev || req->nr_sectors + next->nr_sectors > max_sectors)
                return;
        req->bhtail->b_reqnext = next->bh;
        req->bhtail = next->bhtail;
        req->nr_sectors += next->nr_sectors;
        next->rq_status = RQ_INACTIVE;
        req->next = next->next;
        wake_up (&wait_for_request);
}
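
/*
 * Note: this back-merge is tried right after a buffer has been appended to
 * req in make_request() below; when req now ends exactly where its successor
 * begins, the two buffer_head chains are spliced, the second request slot is
 * recycled, and anyone sleeping in __get_request_wait() is woken up.
 */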
void make_request(int major,int rw, struct buffer_head * bh)
{
        unsigned int sector, count;
        struct request * req;
        int rw_ahead, max_req, max_sectors;
        unsigned long flags;

        count = bh->b_size >> 9;
        sector = bh->b_rsector;

        /* We'd better have a real physical mapping! */
        if (!buffer_mapped(bh))
                goto end_io;

        /* It had better not be a new buffer by the time we see it */

        /* Only one thread can actually submit the I/O. */
        if (test_and_set_bit(BH_Lock, &bh->b_state))
                return;

        if (blk_size[major]) {
                unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;

                if (maxsector < count || maxsector - count < sector) {
                        bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
                        /* This may well happen - the kernel calls bread()
                           without checking the size of the device, e.g.,
                           when mounting a device. */
                        printk(KERN_INFO
                               "attempt to access beyond end of device\n");
                        printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
                               kdevname(bh->b_rdev), rw,
                               (sector + count)>>1,
                               blk_size[major][MINOR(bh->b_rdev)]);
                        goto end_io;
                }
        }

        rw_ahead = 0;   /* normal case; gets changed below for READA */
        switch (rw) {
                case READA:
                        rw_ahead = 1;
                        rw = READ;      /* drop into READ */
                case READ:
                        if (buffer_uptodate(bh)) /* Hmmph! Already have it */
                                goto end_io;
                        kstat.pgpgin++;
                        max_req = NR_REQUEST;   /* reads take precedence */
                        break;
                case WRITERAW:
                        rw = WRITE;
                        goto do_write;  /* Skip the buffer refile */
                case WRITE:
                        if (!test_and_clear_bit(BH_Dirty, &bh->b_state))
                                goto end_io;    /* Hmmph! Nothing to write */
                        refile_buffer(bh);
                do_write:
                        /*
                         * We don't allow the write-requests to fill up the
                         * queue completely:  we want some room for reads,
                         * as they take precedence. The last third of the
                         * requests are only for reads.
                         */
                        kstat.pgpgout++;
                        max_req = (NR_REQUEST * 2) / 3;
                        break;
                default:
                        printk(KERN_ERR "make_request: bad block dev cmd,"
                               " must be R/W/RA/WA\n");
                        goto end_io;
        }

        /* look for a free request. */
        /* Loop uses two requests, 1 for loop and 1 for the real device.
         * Cut max_req in half to avoid running out and deadlocking. */
        if ((major == LOOP_MAJOR) || (major == NBD_MAJOR))
                max_req >>= 1;

        /*
         * Try to coalesce the new request with old requests
         */
        max_sectors = get_max_sectors(bh->b_rdev);

        /*
         * Now we acquire the request spinlock, we have to be mega careful
         * not to schedule or do something nonatomic
         */
        spin_lock_irqsave(&io_request_lock,flags);
        req = *get_queue(bh->b_rdev);
        if (!req) {
                /* MD and loop can't handle plugging without deadlocking */
                if (major != MD_MAJOR && major != LOOP_MAJOR &&
                    major != DDV_MAJOR && major != NBD_MAJOR)
                        plug_device(blk_dev + major); /* is atomic */
        } else switch (major) {
             case IDE0_MAJOR:   /* same as HD_MAJOR */
             /* ... */
             case MFM_ACORN_MAJOR:
                /*
                 * The scsi disk and cdrom drivers completely remove the request
                 * from the queue when they start processing an entry.  For this
                 * reason it is safe to continue to add links to the top entry for
                 * those devices.
                 *
                 * All other drivers need to jump over the first entry, as that
                 * entry may be busy being processed and we thus can't change it.
                 */
                if (req == blk_dev[major].current_request)
                        req = req->next;
                if (!req)
                        break;

             case SCSI_DISK0_MAJOR:
             case SCSI_DISK1_MAJOR:
             case SCSI_DISK2_MAJOR:
             case SCSI_DISK3_MAJOR:
             case SCSI_DISK4_MAJOR:
             case SCSI_DISK5_MAJOR:
             case SCSI_DISK6_MAJOR:
             case SCSI_DISK7_MAJOR:
             case SCSI_CDROM_MAJOR:
             /* ... */
             case COMPAQ_SMART2_MAJOR+0:
             case COMPAQ_SMART2_MAJOR+1:
             case COMPAQ_SMART2_MAJOR+2:
             case COMPAQ_SMART2_MAJOR+3:
             case COMPAQ_SMART2_MAJOR+4:
             case COMPAQ_SMART2_MAJOR+5:
             case COMPAQ_SMART2_MAJOR+6:
             case COMPAQ_SMART2_MAJOR+7:

                do {
                        if (req->sem)
                                continue;
                        if (req->cmd != rw)
                                continue;
                        if (req->nr_sectors + count > max_sectors)
                                continue;
                        if (req->rq_dev != bh->b_rdev)
                                continue;
                        /* Can we add it to the end of this request? */
                        if (req->sector + req->nr_sectors == sector) {
                                req->bhtail->b_reqnext = bh;
                                req->bhtail = bh;
                                req->nr_sectors += count;
                                /* Can we now merge this req with the next? */
                                attempt_merge(req, max_sectors);
                        /* or to the beginning? */
                        } else if (req->sector - count == sector) {
                                bh->b_reqnext = req->bh;
                                req->bh = bh;
                                req->buffer = bh->b_data;
                                req->current_nr_sectors = count;
                                req->sector = sector;
                                req->nr_sectors += count;
                        } else
                                continue;

                        spin_unlock_irqrestore(&io_request_lock,flags);
                        return;

                } while ((req = req->next) != NULL);
        }

        /* find an unused request. */
        req = get_request(max_req, bh->b_rdev);

        spin_unlock_irqrestore(&io_request_lock,flags);

        /* if no request available: if rw_ahead, forget it; otherwise try again blocking.. */
        if (!req) {
                if (rw_ahead)
                        goto end_io;
                req = __get_request_wait(max_req, bh->b_rdev);
        }

        /* fill up the request-info, and add it to the queue */
        req->cmd = rw;
        req->errors = 0;
        req->sector = sector;
        req->nr_sectors = count;
        req->current_nr_sectors = count;
        req->buffer = bh->b_data;
        req->sem = NULL;
        req->bh = bh;
        req->bhtail = bh;
        req->next = NULL;
        add_request(major+blk_dev,req);
        return;

end_io:
        bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
}
/* This function can be used to request a number of buffers from a block
   device. Currently the only restriction is that all buffers must belong to
   the same device */

void ll_rw_block(int rw, int nr, struct buffer_head * bh[])
{
        unsigned int major;
        int correct_size;
        struct blk_dev_struct * dev;
        int i;

        dev = NULL;
        if ((major = MAJOR(bh[0]->b_dev)) < MAX_BLKDEV)
                dev = blk_dev + major;
        if (!dev || !dev->request_fn) {
                printk(KERN_ERR
                       "ll_rw_block: Trying to read nonexistent block-device %s (%ld)\n",
                       kdevname(bh[0]->b_dev), bh[0]->b_blocknr);
                goto sorry;
        }

        /* Determine correct block size for this device. */
        correct_size = BLOCK_SIZE;
        if (blksize_size[major]) {
                i = blksize_size[major][MINOR(bh[0]->b_dev)];
                if (i)
                        correct_size = i;
        }

        /* Verify requested block sizes. */
        for (i = 0; i < nr; i++) {
                if (bh[i]->b_size != correct_size) {
                        printk(KERN_NOTICE "ll_rw_block: device %s: "
                               "only %d-char blocks implemented (%u)\n",
                               kdevname(bh[0]->b_dev),
                               correct_size, bh[i]->b_size);
                        goto sorry;
                }

                /* Md remaps blocks now */
                bh[i]->b_rdev = bh[i]->b_dev;
                bh[i]->b_rsector = bh[i]->b_blocknr * (bh[i]->b_size >> 9);
#ifdef CONFIG_BLK_DEV_MD
                if (major == MD_MAJOR &&
                    md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
                            &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
                        printk (KERN_ERR
                                "Bad md_map in ll_rw_block\n");
                        goto sorry;
                }
#endif
        }

        if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) {
                printk(KERN_NOTICE "Can't write to read-only device %s\n",
                       kdevname(bh[0]->b_dev));
                goto sorry;
        }

        for (i = 0; i < nr; i++) {
                set_bit(BH_Req, &bh[i]->b_state);
#ifdef CONFIG_BLK_DEV_MD
                if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
                        md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]);
                        continue;
                }
#endif
                make_request(MAJOR(bh[i]->b_rdev), rw, bh[i]);
        }
        return;

sorry:
        for (i = 0; i < nr; i++) {
                clear_bit(BH_Dirty, &bh[i]->b_state);
                clear_bit(BH_Uptodate, &bh[i]->b_state);
                bh[i]->b_end_io(bh[i], 0);
        }
}
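
/*
 * Illustrative sketch (hypothetical caller): this is essentially the entry
 * point the buffer cache uses.  A bread()-style reader submits one block
 * synchronously like this:
 *
 *	struct buffer_head *bh = getblk(dev, block, size);
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *	}
 */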
#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init( void );
#endif
/*
 * First step of what used to be end_request
 *
 * 0 means continue with end_that_request_last,
 * 1 means we are done
 */

int
end_that_request_first( struct request *req, int uptodate, char *name )
{
        struct buffer_head * bh;
        int nsect;

        req->errors = 0;
        if (!uptodate) {
                printk("end_request: I/O error, dev %s (%s), sector %lu\n",
                        kdevname(req->rq_dev), name, req->sector);
                if ((bh = req->bh) != NULL) {
                        nsect = bh->b_size >> 9;
                        req->nr_sectors--;
                        req->nr_sectors &= ~(nsect - 1);
                        req->sector += nsect;
                        req->sector &= ~(nsect - 1);
                }
        }

        if ((bh = req->bh) != NULL) {
                req->bh = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, uptodate);
                if ((bh = req->bh) != NULL) {
                        req->current_nr_sectors = bh->b_size >> 9;
                        if (req->nr_sectors < req->current_nr_sectors) {
                                req->nr_sectors = req->current_nr_sectors;
                                printk("end_request: buffer-list destroyed\n");
                        }
                        req->buffer = bh->b_data;
                        return 1;
                }
        }
        return 0;
}
void
end_that_request_last( struct request *req )
{
        if (req->sem != NULL)
                up(req->sem);
        req->rq_status = RQ_INACTIVE;
        wake_up(&wait_for_request);
}
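
/*
 * Illustrative sketch (simplified, hypothetical driver): the per-driver
 * end_request() helper that <linux/blk.h> builds combines the two functions
 * above roughly like this, where CURRENT and DEVICE_NAME are the driver's
 * own macros:
 *
 *	static void end_request(int uptodate)
 *	{
 *		struct request *req = CURRENT;
 *
 *		if (end_that_request_first(req, uptodate, DEVICE_NAME))
 *			return;			(more buffers to transfer)
 *		CURRENT = req->next;		(take it off the queue)
 *		end_that_request_last(req);	(free the slot, wake waiters)
 *	}
 */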
int __init blk_dev_init(void)
{
        struct request * req;
        struct blk_dev_struct *dev;

        for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) {
                dev->request_fn      = NULL;
                dev->queue           = NULL;
                dev->current_request = NULL;
                dev->plug.rq_status  = RQ_INACTIVE;
                dev->plug.next       = NULL;
                dev->plug_tq.sync    = 0;
                dev->plug_tq.routine = &unplug_device;
                dev->plug_tq.data    = dev;
        }

        req = all_requests + NR_REQUEST;
        while (--req >= all_requests) {
                req->rq_status = RQ_INACTIVE;
                req->next = NULL;
        }
        memset(ro_bits,0,sizeof(ro_bits));
        memset(max_readahead, 0, sizeof(max_readahead));
        memset(max_sectors, 0, sizeof(max_sectors));
#ifdef CONFIG_AMIGA_Z2RAM
        z2_init();
#endif
#ifdef CONFIG_STRAM_SWAP
        stram_device_init();
#endif
#ifdef CONFIG_BLK_DEV_RAM
        rd_init();
#endif
#ifdef CONFIG_BLK_DEV_LOOP
        loop_init();
#endif
#ifdef CONFIG_ISP16_CDI
        isp16_init();
#endif /* CONFIG_ISP16_CDI */
#ifdef CONFIG_BLK_DEV_IDE
        ide_init();             /* this MUST precede hd_init */
#endif
#ifdef CONFIG_BLK_DEV_HD
        hd_init();
#endif
#ifdef CONFIG_BLK_DEV_PS2
        ps2esdi_init();
#endif
#ifdef CONFIG_BLK_DEV_XD
        xd_init();
#endif
#ifdef CONFIG_BLK_DEV_MFM
        mfm_init();
#endif
#ifdef CONFIG_PARIDE
        { extern void paride_init(void); paride_init(); };
#endif
#ifdef CONFIG_MAC_FLOPPY
        swim3_init();
#endif
#ifdef CONFIG_BLK_DEV_SWIM_IOP
        swimiop_init();
#endif
#ifdef CONFIG_AMIGA_FLOPPY
        amiga_floppy_init();
#endif
#ifdef CONFIG_ATARI_FLOPPY
        atari_floppy_init();
#endif
#ifdef CONFIG_BLK_DEV_FD
        floppy_init();
#else
#if !defined (__mc68000__) && !defined(CONFIG_PPC) && !defined(__sparc__)\
    && !defined(CONFIG_APUS) && !defined(__sh__)
        outb_p(0xc, 0x3f2);
#endif
#endif
#ifdef CONFIG_ATARI_ACSI
        acsi_init();
#endif /* CONFIG_ATARI_ACSI */

        /* ... */

#ifdef CONFIG_BLK_DEV_MD
        md_init();
#endif /* CONFIG_BLK_DEV_MD */
#ifdef CONFIG_APBLOCK
        ap_init();
#endif
#ifdef CONFIG_BLK_DEV_NBD
        nbd_init();
#endif
        return 0;
}
EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);