/* This file contains the heart of the mechanism used to read (and write)
 * files.  Read and write requests are split up into chunks that do not cross
 * block boundaries.  Each chunk is then processed in turn.  Reads on special
 * files are also detected and handled.
 *
 * The entry points into this file are
 *   do_read:    perform the READ system call by calling read_write
 *   read_write: actually do the work of READ and WRITE
 *   read_map:   given an inode and file position, look up its zone number
 *   rd_indir:   read an entry in an indirect block
 *   read_ahead: manage the block read ahead business
 */
#include "fs.h"
#include <fcntl.h>
#include <unistd.h>
#include <minix/com.h>
#include "buf.h"
#include "file.h"
#include "fproc.h"
#include "inode.h"
#include "param.h"
#include "super.h"
FORWARD _PROTOTYPE( int rw_chunk, (struct inode *rip, off_t position,
        unsigned off, int chunk, unsigned left, int rw_flag,
        char *buff, int seg, int usr, int block_size, int *completed));
/*===========================================================================*
 *                              do_read                                      *
 *===========================================================================*/
PUBLIC int do_read()
{
  return(read_write(READING));
}
/*===========================================================================*
 *                              read_write                                   *
 *===========================================================================*/
PUBLIC int read_write(rw_flag)
int rw_flag;                    /* READING or WRITING */
{
/* Perform read(fd, buffer, nbytes) or write(fd, buffer, nbytes) call. */

  register struct inode *rip;
  register struct filp *f;
  off_t bytes_left, f_size, position;
  unsigned int off, cum_io;
  int op, oflags, r, chunk, usr, seg, block_spec, char_spec;
  int regular, partial_pipe = 0, partial_cnt = 0;
  mode_t mode_word;
  struct filp *wf;
  int block_size = 0;
  int completed, r2 = OK;
  phys_bytes p;
  /* PM loads segments by putting funny things in other bits of the
   * message, indicated by a high bit in fd.
   */
  if (who_e == PM_PROC_NR && (m_in.fd & _PM_SEG_FLAG)) {
        seg = (int) m_in.m1_p2;
        usr = (int) m_in.m1_p3;
        m_in.fd &= ~(_PM_SEG_FLAG);     /* get rid of flag bit */
  } else {
        usr = who_e;            /* normal case */
        seg = D;
  }
  /* If the file descriptor is valid, get the inode, size and mode. */
  if (m_in.nbytes < 0) return(EINVAL);
  if ((f = get_filp(m_in.fd)) == NIL_FILP) return(err_code);
  if (((f->filp_mode) & (rw_flag == READING ? R_BIT : W_BIT)) == 0) {
        return(f->filp_mode == FILP_CLOSED ? EIO : EBADF);
  }
  if (m_in.nbytes == 0)
        return(0);      /* so char special files need not check for 0 */
  /* Check whether the user process has the memory it needs; if not, copying
   * will fail later.  Do this after the 0-check above, because umap does not
   * want to map 0 bytes.
   */
  if ((r = sys_umap(usr, seg, (vir_bytes) m_in.buffer, m_in.nbytes, &p)) != OK) {
        printf("FS: read_write: umap failed for process %d\n", usr);
        return r;
  }
  position = f->filp_pos;
  oflags = f->filp_flags;
  rip = f->filp_ino;
  f_size = rip->i_size;
  r = OK;
  if (rip->i_pipe == I_PIPE) {
        /* fp->fp_cum_io_partial is only nonzero when doing partial writes */
        cum_io = fp->fp_cum_io_partial;
  } else {
        cum_io = 0;
  }
  op = (rw_flag == READING ? DEV_READ : DEV_WRITE);
  mode_word = rip->i_mode & I_TYPE;
  regular = mode_word == I_REGULAR || mode_word == I_NAMED_PIPE;

  if ((char_spec = (mode_word == I_CHAR_SPECIAL ? 1 : 0))) {
        if (rip->i_zone[0] == NO_DEV)
                panic(__FILE__, "read_write tries to read from "
                        "character device NO_DEV", NO_NUM);
        block_size = get_block_size(rip->i_zone[0]);
  }
  if ((block_spec = (mode_word == I_BLOCK_SPECIAL ? 1 : 0))) {
        f_size = ULONG_MAX;
        if (rip->i_zone[0] == NO_DEV)
                panic(__FILE__, "read_write tries to read from "
                        "block device NO_DEV", NO_NUM);
        block_size = get_block_size(rip->i_zone[0]);
  }

  if (!char_spec && !block_spec)
        block_size = rip->i_sp->s_block_size;

  rdwt_err = OK;                /* set to EIO if disk error occurs */
  /* Check for character special files. */
  if (char_spec) {
        dev_t dev;
        dev = (dev_t) rip->i_zone[0];
        r = dev_io(op, dev, usr, m_in.buffer, position, m_in.nbytes, oflags);
        if (r >= 0) {
                cum_io = r;
                position += r;
                r = OK;
        }
  } else {
        if (rw_flag == WRITING && block_spec == 0) {
                /* Check in advance to see if the file will grow too big. */
                if (position > rip->i_sp->s_max_size - m_in.nbytes)
                        return(EFBIG);

                /* Check for O_APPEND flag. */
                if (oflags & O_APPEND) position = f_size;

                /* Clear the zone containing the present EOF if a hole is
                 * about to be created.  This is necessary because all
                 * unwritten blocks prior to the EOF must read as zeros.
                 */
                if (position > f_size) clear_zone(rip, f_size, 0);
        }
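
        /* Illustrative example (not part of the original source): if f_size
         * is 100 and a write starts at position 5000, bytes 100..4999
         * become a hole that must later read back as zeros.  clear_zone()
         * zeroes the tail of the zone containing byte 100, so stale data
         * left in that zone is never exposed to readers.
         */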
        /* Pipes are a little different.  Check. */
        if (rip->i_pipe == I_PIPE) {
                r = pipe_check(rip, rw_flag, oflags,
                        m_in.nbytes, position, &partial_cnt, 0);
                if (r <= 0) return(r);
        }

        if (partial_cnt > 0) partial_pipe = 1;

        /* Split the transfer into chunks that don't span two blocks. */
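        /* Worked example (illustrative, not part of the original source):
         * with block_size = 1024, a read of 3000 bytes starting at file
         * position 500 is split into chunks of 524 bytes (up to the first
         * block boundary), then 1024, 1024 and 428 bytes, so no single
         * chunk ever crosses a block boundary.
         */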
        while (m_in.nbytes != 0) {

                off = (unsigned int) (position % block_size); /* offset in blk */
                if (partial_pipe) {     /* pipes only */
                        chunk = MIN(partial_cnt, block_size - off);
                } else
                        chunk = MIN(m_in.nbytes, block_size - off);
                if (chunk < 0) chunk = block_size - off;

                if (rw_flag == READING) {
                        bytes_left = f_size - position;
                        if (position >= f_size) break;  /* we are beyond EOF */
                        if (chunk > bytes_left) chunk = (int) bytes_left;
                }

                /* Read or write 'chunk' bytes. */
                r = rw_chunk(rip, position, off, chunk, (unsigned) m_in.nbytes,
                        rw_flag, m_in.buffer, seg, usr, block_size, &completed);

                if (r != OK) break;     /* EOF reached */
                if (rdwt_err < 0) break;

                /* Update counters and pointers. */
                m_in.buffer += chunk;   /* user buffer address */
                m_in.nbytes -= chunk;   /* bytes yet to be read */
                cum_io += chunk;        /* bytes read so far */
                position += chunk;      /* position within the file */

                if (partial_pipe) {
                        partial_cnt -= chunk;
                        if (partial_cnt <= 0) break;
                }
        }
  }
  /* On write, update file size and access time. */
  if (rw_flag == WRITING) {
        if (regular || mode_word == I_DIRECTORY) {
                if (position > f_size) rip->i_size = position;
        }
  } else {
        if (rip->i_pipe == I_PIPE) {
                if (position >= rip->i_size) {
                        /* Reset pipe pointers. */
                        rip->i_size = 0;        /* no data left */
                        position = 0;           /* reset reader(s) */
                        wf = find_filp(rip, W_BIT);
                        if (wf != NIL_FILP) wf->filp_pos = 0;
                }
        }
  }
  f->filp_pos = position;

  /* Check to see if read-ahead is called for, and if so, set it up. */
  if (rw_flag == READING && rip->i_seek == NO_SEEK &&
        position % block_size == 0 && (regular || mode_word == I_DIRECTORY)) {
        rdahed_inode = rip;
        rdahedpos = position;
  }
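  /* Illustrative note (not part of the original source): read-ahead is
   * only scheduled for sequential access.  E.g. with 1024-byte blocks, a
   * process that has just read bytes 0..4095 of a regular file without
   * seeking leaves position at 4096, a block boundary, so the next block
   * is remembered in rdahed_inode/rdahedpos for read_ahead() to fetch
   * into the cache before the process asks for it.
   */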
  rip->i_seek = NO_SEEK;
  if (rdwt_err != OK) r = rdwt_err;     /* check for disk error */
  if (rdwt_err == END_OF_FILE) r = OK;

  /* If user-space copying failed, the read/write failed. */
  if (r == OK && r2 != OK) {
        r = r2;
  }
  if (r == OK) {
        if (rw_flag == READING) rip->i_update |= ATIME;
        if (rw_flag == WRITING) rip->i_update |= CTIME | MTIME;
        rip->i_dirt = DIRTY;            /* inode is thus now dirty */
        if (partial_pipe) {
                partial_pipe = 0;
                /* Partial write on pipe with O_NONBLOCK: return write count. */
                if (!(oflags & O_NONBLOCK)) {
                        fp->fp_cum_io_partial = cum_io;
                        suspend(XPIPE);   /* partial write on pipe with */
                        return(SUSPEND);  /* nbyte > PIPE_SIZE - non-atomic */
                }
        }
        fp->fp_cum_io_partial = 0;
        return(cum_io);
  }
  return(r);
}
/*===========================================================================*
 *                              rw_chunk                                     *
 *===========================================================================*/
PRIVATE int rw_chunk(rip, position, off, chunk, left, rw_flag, buff,
        seg, usr, block_size, completed)
register struct inode *rip;     /* pointer to inode for file to be rd/wr */
off_t position;                 /* position within file to read or write */
unsigned off;                   /* off within the current block */
int chunk;                      /* number of bytes to read or write */
unsigned left;                  /* max number of bytes wanted after position */
int rw_flag;                    /* READING or WRITING */
char *buff;                     /* virtual address of the user buffer */
int seg;                        /* T or D segment in user space */
int usr;                        /* which user process */
int block_size;                 /* block size of FS operating on */
int *completed;                 /* number of bytes copied */
{
/* Read or write (part of) a block. */

  register struct buf *bp;
  register int r = OK;
  int n, block_spec;
  block_t b;
  dev_t dev;

  *completed = 0;
  block_spec = (rip->i_mode & I_TYPE) == I_BLOCK_SPECIAL;
  if (block_spec) {
        b = position/block_size;
        dev = (dev_t) rip->i_zone[0];
  } else {
        b = read_map(rip, position);
        dev = rip->i_dev;
  }

  if (!block_spec && b == NO_BLOCK) {
        if (rw_flag == READING) {
                /* Reading from a nonexistent block.  Must read as all zeros. */
                bp = get_block(NO_DEV, NO_BLOCK, NORMAL);    /* get a buffer */
                zero_block(bp);
        } else {
                /* Writing to a nonexistent block. Create and enter in inode. */
                if ((bp = new_block(rip, position)) == NIL_BUF) return(err_code);
        }
  } else if (rw_flag == READING) {
        /* Read and read ahead if convenient. */
        bp = rahead(rip, b, position, left);
  } else {
        /* Normally an existing block to be partially overwritten is first read
         * in.  However, a full block need not be read in.  If it is already in
         * the cache, acquire it, otherwise just acquire a free buffer.
         */
        n = (chunk == block_size ? NO_READ : NORMAL);
        if (!block_spec && off == 0 && position >= rip->i_size) n = NO_READ;
        bp = get_block(dev, b, n);
  }
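
  /* Illustrative note (not part of the original source): NO_READ is an
   * optimization.  If the chunk covers the whole block, every byte is
   * about to be overwritten, so reading the old contents from disk first
   * would be wasted work.  The same holds for a write starting at offset
   * 0 of a block lying entirely at or beyond the current i_size: such a
   * block has no old contents worth fetching.
   */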
  /* In all cases, bp now points to a valid buffer. */
  if (bp == NIL_BUF) {
        panic(__FILE__, "bp not valid in rw_chunk, this can't happen", NO_NUM);
  }
  if (rw_flag == WRITING && chunk != block_size && !block_spec &&
                position >= rip->i_size && off == 0) {
        zero_block(bp);
  }

  if (rw_flag == READING) {
        /* Copy a chunk from the block buffer to user space. */
        r = sys_vircopy(FS_PROC_NR, D, (phys_bytes) (bp->b_data+off),
                        usr, seg, (phys_bytes) buff,
                        (phys_bytes) chunk);
  } else {
        /* Copy a chunk from user space to the block buffer. */
        r = sys_vircopy(usr, seg, (phys_bytes) buff,
                        FS_PROC_NR, D, (phys_bytes) (bp->b_data+off),
                        (phys_bytes) chunk);
        bp->b_dirt = DIRTY;
  }
  n = (off + chunk == block_size ? FULL_DATA_BLOCK : PARTIAL_DATA_BLOCK);
  put_block(bp, n);

  return(r);
}
/*===========================================================================*
 *                              read_map                                     *
 *===========================================================================*/
PUBLIC block_t read_map(rip, position)
register struct inode *rip;     /* ptr to inode to map from */
off_t position;                 /* position in file whose blk wanted */
{
/* Given an inode and a position within the corresponding file, locate the
 * block (not zone) number in which that position is to be found and return it.
 */

  register struct buf *bp;
  register zone_t z;
  int scale, boff, dzones, nr_indirects, index, zind, ex;
  block_t b;
  long excess, zone, block_pos;

  scale = rip->i_sp->s_log_zone_size;   /* for block-zone conversion */
  block_pos = position/rip->i_sp->s_block_size; /* relative blk # in file */
  zone = block_pos >> scale;    /* position's zone */
  boff = (int) (block_pos - (zone << scale));   /* relative blk # within zone */
  dzones = rip->i_ndzones;
  nr_indirects = rip->i_nindirs;
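
  /* Worked example (illustrative, not part of the original source): on a
   * V2 file system with 1024-byte blocks, s_log_zone_size = 0 (one block
   * per zone), i_ndzones = 7 and i_nindirs = 256, position 9000 lies in
   * block 8 of the file, hence zone 8.  Since 8 >= 7 direct zones,
   * excess = 1 below, which is less than 256, so the block number comes
   * from entry 1 of the single indirect block.
   */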
  /* Is 'position' to be found in the inode itself? */
  if (zone < dzones) {
        zind = (int) zone;      /* index should be an int */
        z = rip->i_zone[zind];
        if (z == NO_ZONE) return(NO_BLOCK);
        b = ((block_t) z << scale) + boff;
        return(b);
  }
  /* It is not in the inode, so it must be single or double indirect. */
  excess = zone - dzones;       /* first Vx_NR_DZONES don't count */

  if (excess < nr_indirects) {
        /* 'position' can be located via the single indirect block. */
        z = rip->i_zone[dzones];
  } else {
        /* 'position' can be located via the double indirect block. */
        if ((z = rip->i_zone[dzones+1]) == NO_ZONE) return(NO_BLOCK);
        excess -= nr_indirects;                 /* single indir doesn't count */
        b = (block_t) z << scale;
        bp = get_block(rip->i_dev, b, NORMAL);  /* get double indirect block */
        index = (int) (excess/nr_indirects);
        z = rd_indir(bp, index);                /* z = zone for single */
        put_block(bp, INDIRECT_BLOCK);          /* release double ind block */
        excess = excess % nr_indirects;         /* index into single ind blk */
  }

  /* 'z' is zone num for single indirect block; 'excess' is index into it. */
  if (z == NO_ZONE) return(NO_BLOCK);
  b = (block_t) z << scale;                     /* b is blk # for single ind */
  bp = get_block(rip->i_dev, b, NORMAL);        /* get single indirect block */
  ex = (int) excess;                            /* need an integer */
  z = rd_indir(bp, ex);                         /* get block pointed to */
  put_block(bp, INDIRECT_BLOCK);                /* release single indir blk */
  if (z == NO_ZONE) return(NO_BLOCK);
  b = ((block_t) z << scale) + boff;
  return(b);
}
/*===========================================================================*
 *                              rd_indir                                     *
 *===========================================================================*/
PUBLIC zone_t rd_indir(bp, index)
struct buf *bp;                 /* pointer to indirect block */
int index;                      /* index into *bp */
{
/* Given a pointer to an indirect block, read one entry.  The reason for
 * making a separate routine out of this is that there are four cases:
 * V1 (IBM and 68000), and V2 (IBM and 68000).
 */
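/* Illustrative note (not part of the original source): a V1 indirect
 * block holds 16-bit zone numbers and a V2 indirect block holds 32-bit
 * zone numbers, so with 1024-byte blocks an indirect block has
 * 1024/2 = 512 entries on V1 and 1024/4 = 256 entries on V2.  The
 * conv2/conv4 calls below additionally swap byte order when the file
 * system was written with the opposite endianness (s_native is false).
 */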

  struct super_block *sp;
  zone_t zone;                  /* V2 zones are longs (shorts in V1) */

  if (bp == NIL_BUF)
        panic(__FILE__, "rd_indir() on NIL_BUF", NO_NUM);

  sp = get_super(bp->b_dev);    /* need super block to find file sys type */

  /* Read a zone from an indirect block. */
  if (sp->s_version == V1)
        zone = (zone_t) conv2(sp->s_native, (int) bp->b_v1_ind[index]);
  else
        zone = (zone_t) conv4(sp->s_native, (long) bp->b_v2_ind[index]);

  if (zone != NO_ZONE &&
        (zone < (zone_t) sp->s_firstdatazone || zone >= sp->s_zones)) {
        printf("Illegal zone number %ld in indirect block, index %d\n",
                (long) zone, index);
        panic(__FILE__, "check file system", NO_NUM);
  }
  return(zone);
}
/*===========================================================================*
 *                              read_ahead                                   *
 *===========================================================================*/
PUBLIC void read_ahead()
{
/* Read a block into the cache before it is needed. */
  int block_size;
  register struct inode *rip;
  struct buf *bp;
  block_t b;

  rip = rdahed_inode;           /* pointer to inode to read ahead from */
  block_size = get_block_size(rip->i_dev);
  rdahed_inode = NIL_INODE;     /* turn off read ahead */
  if ((b = read_map(rip, rdahedpos)) == NO_BLOCK) return;       /* at EOF */
  bp = rahead(rip, b, rdahedpos, block_size);
  put_block(bp, PARTIAL_DATA_BLOCK);
}
/*===========================================================================*
 *                              rahead                                       *
 *===========================================================================*/
PUBLIC struct buf *rahead(rip, baseblock, position, bytes_ahead)
register struct inode *rip;     /* pointer to inode for file to be read */
block_t baseblock;              /* block at current position */
off_t position;                 /* position within file */
unsigned bytes_ahead;           /* bytes beyond position for immediate use */
{
/* Fetch a block from the cache or the device.  If a physical read is
 * required, prefetch as many more blocks as convenient into the cache.
 * This usually covers bytes_ahead and is at least BLOCKS_MINIMUM.
 * The device driver may decide it knows better and stop reading at a
 * cylinder boundary (or after an error).  Rw_scattered() puts an optional
 * flag on all reads to allow this.
 */
  int block_size;
/* Minimum number of blocks to prefetch. */
# define BLOCKS_MINIMUM         (NR_BUFS < 50 ? 18 : 32)
  int block_spec, scale, read_q_size;
  unsigned int blocks_ahead, fragment;
  block_t block, blocks_left;
  off_t ind1_pos;
  dev_t dev;
  struct buf *bp;
  static struct buf *read_q[NR_BUFS];

  block_spec = (rip->i_mode & I_TYPE) == I_BLOCK_SPECIAL;
  if (block_spec) {
        dev = (dev_t) rip->i_zone[0];
  } else {
        dev = rip->i_dev;
  }
  block_size = get_block_size(dev);

  block = baseblock;
  bp = get_block(dev, block, PREFETCH);
  if (bp->b_dev != NO_DEV) return(bp);

  /* The best guess for the number of blocks to prefetch:  A lot.
   * It is impossible to tell what the device looks like, so we don't even
   * try to guess the geometry, but leave it to the driver.
   *
   * The floppy driver can read a full track with no rotational delay, and it
   * avoids reading partial tracks if it can, so handing it enough buffers to
   * read two tracks is perfect.  (Two, because some diskette types have
   * an odd number of sectors per track, so a block may span tracks.)
   *
   * The disk drivers don't try to be smart.  With today's disks it is
   * impossible to tell what the real geometry looks like, so it is best to
   * read as much as you can.  With luck the caching on the drive allows
   * for a little time to start the next read.
   *
   * The current solution below is a bit of a hack: it just reads blocks from
   * the current file position hoping that more of the file can be found.  A
   * better solution must look at the already available zone pointers and
   * indirect blocks (but don't call read_map!).
   */

  fragment = position % block_size;
  position -= fragment;
  bytes_ahead += fragment;

  blocks_ahead = (bytes_ahead + block_size - 1) / block_size;
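
  /* Worked example (illustrative, not part of the original source): with
   * block_size = 1024, position = 1500 and bytes_ahead = 2048, fragment
   * is 476, position is rounded down to 1024 and bytes_ahead grows to
   * 2524, so blocks_ahead = (2524 + 1023) / 1024 = 3 whole blocks,
   * starting with the block containing the original position.
   */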
  if (block_spec && rip->i_size == 0) {
        blocks_left = NR_IOREQS;
  } else {
        blocks_left = (rip->i_size - position + block_size - 1) / block_size;

        /* Go for the first indirect block if we are in its neighborhood. */
        if (!block_spec) {
                scale = rip->i_sp->s_log_zone_size;
                ind1_pos = (off_t) rip->i_ndzones * (block_size << scale);
                if (position <= ind1_pos && rip->i_size > ind1_pos) {
                        blocks_ahead++;
                        blocks_left++;
                }
        }
  }

  /* No more than the maximum request. */
  if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS;

  /* Read at least the minimum number of blocks, but not after a seek. */
  if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK)
        blocks_ahead = BLOCKS_MINIMUM;

  /* Can't go past end of file. */
  if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
  read_q_size = 0;

  /* Acquire block buffers. */
  for (;;) {
        read_q[read_q_size++] = bp;

        if (--blocks_ahead == 0) break;

        /* Don't trash the cache, leave 4 free. */
        if (bufs_in_use >= NR_BUFS - 4) break;

        block++;

        bp = get_block(dev, block, PREFETCH);
        if (bp->b_dev != NO_DEV) {
                /* Oops, block already in the cache, get out. */
                put_block(bp, FULL_DATA_BLOCK);
                break;
        }
  }
  rw_scattered(dev, read_q, read_q_size, READING);
  return(get_block(dev, baseblock, NORMAL));
}