minix/fs/mfs/read.c

   1 #include "fs.h"
   2 #include <stddef.h>
   3 #include <string.h>
   4 #include <stdlib.h>
   5 #include "buf.h"
   6 #include "inode.h"
   7 #include "super.h"
   8 #include <sys/param.h>
   9 #include <sys/dirent.h>
  10 #include <assert.h>
  11
  12
  13 static struct buf *rahead(struct inode *rip, block_t baseblock, u64_t
  14         position, unsigned bytes_ahead);
  15 static int rw_chunk(struct inode *rip, u64_t position, unsigned off,
  16         size_t chunk, unsigned left, int call, struct fsdriver_data *data,
  17         unsigned buf_off, unsigned int block_size, int *completed);
  18
  19
  20 /*===========================================================================*
  21  *                              fs_readwrite                                 *
  22  *===========================================================================*/
  23 ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes,
  24         off_t position, int call)
  25 {
  26   int r;
  27   int regular;
  28   off_t f_size, bytes_left;
  29   size_t off, cum_io, block_size, chunk;
  30   mode_t mode_word;
  31   int completed;
  32   struct inode *rip;
  33
  34   r = OK;
  35
  36   /* Find the inode referred */
  37   if ((rip = find_inode(fs_dev, ino_nr)) == NULL)
  38         return(EINVAL);
  39
  40   mode_word = rip->i_mode & I_TYPE;
  41   regular = (mode_word == I_REGULAR);
  42
  43   /* Determine blocksize */
  44   block_size = rip->i_sp->s_block_size;
  45   f_size = rip->i_size;
  46
  47   /* If this is file i/o, check we can write */
  48   if (call == FSC_WRITE) {
  49           if(rip->i_sp->s_rd_only)
  50                   return EROFS;
  51
  52           /* Check in advance to see if file will grow too big. */
  53           if (position > (off_t) (rip->i_sp->s_max_size - nrbytes))
  54                   return(EFBIG);
  55
  56           /* Clear the zone containing present EOF if hole about
  57            * to be created.  This is necessary because all unwritten
  58            * blocks prior to the EOF must read as zeros.
  59            */
  60           if(position > f_size) clear_zone(rip, f_size, 0);
  61   }
  62
  63   cum_io = 0;
  64   /* Split the transfer into chunks that don't span two blocks. */
  65   while (nrbytes > 0) {
  66           off = ((unsigned int) position) % block_size; /* offset in blk*/
  67           chunk = block_size - off;
  68           if (chunk > nrbytes)
  69                 chunk = nrbytes;
  70
  71           if (call != FSC_WRITE) {
  72                   bytes_left = f_size - position;
  73                   if (position >= f_size) break;        /* we are beyond EOF */
  74                   if (chunk > (unsigned int) bytes_left) chunk = bytes_left;
  75           }
  76
  77           /* Read or write 'chunk' bytes. */
  78           r = rw_chunk(rip, ((u64_t)((unsigned long)position)), off, chunk,
  79                 nrbytes, call, data, cum_io, block_size, &completed);
  80
  81           if (r != OK) break;
  82
  83           /* Update counters and pointers. */
  84           nrbytes -= chunk;     /* bytes yet to be read */
  85           cum_io += chunk;      /* bytes read so far */
  86           position += (off_t) chunk;    /* position within the file */
  87   }
  88
  89   /* On write, update file size and access time. */
  90   if (call == FSC_WRITE) {
  91           if (regular || mode_word == I_DIRECTORY) {
  92                   if (position > f_size) rip->i_size = position;
  93           }
  94   }
  95
  96   rip->i_seek = NO_SEEK;
  97
  98   if (r != OK)
  99         return r;
 100
 101   /* even on a ROFS, writing to a device node on it is fine,
 102    * just don't update the inode stats for it. And dito for reading.
 103    */
 104   if (!rip->i_sp->s_rd_only) {
 105           if (call == FSC_READ) rip->i_update |= ATIME;
 106           if (call == FSC_WRITE) rip->i_update |= CTIME | MTIME;
 107           IN_MARKDIRTY(rip);            /* inode is thus now dirty */
 108   }
 109
 110   return cum_io;
 111 }
 112
 113
 114 /*===========================================================================*
 115  *                              rw_chunk                                     *
 116  *===========================================================================*/
 117 static int rw_chunk(rip, position, off, chunk, left, call, data, buf_off,
 118         block_size, completed)
 119 register struct inode *rip;     /* pointer to inode for file to be rd/wr */
 120 u64_t position;                 /* position within file to read or write */
 121 unsigned off;                   /* off within the current block */
 122 size_t chunk;                   /* number of bytes to read or write */
 123 unsigned left;                  /* max number of bytes wanted after position */
 124 int call;                       /* FSC_READ, FSC_WRITE, or FSC_PEEK */
 125 struct fsdriver_data *data;     /* structure for (remote) user buffer */
 126 unsigned buf_off;               /* offset in user buffer */
 127 unsigned int block_size;        /* block size of FS operating on */
 128 int *completed;                 /* number of bytes copied */
 129 {
 130 /* Read or write (part of) a block. */
 131   struct buf *bp = NULL;
 132   register int r = OK;
 133   int n;
 134   block_t b;
 135   dev_t dev;
 136   ino_t ino = VMC_NO_INODE;
 137   u64_t ino_off = rounddown(position, block_size);
 138
 139   *completed = 0;
 140
 141   if (ex64hi(position) != 0)
 142         panic("rw_chunk: position too high");
 143   b = read_map(rip, (off_t) ex64lo(position), 0);
 144   dev = rip->i_dev;
 145   ino = rip->i_num;
 146   assert(ino != VMC_NO_INODE);
 147
 148   if (b == NO_BLOCK) {
 149         if (call == FSC_READ) {
 150                 /* Reading from a nonexistent block.  Must read as all zeros.*/
 151                 r = fsdriver_zero(data, buf_off, chunk);
 152                 if(r != OK) {
 153                         printf("MFS: fsdriver_zero failed\n");
 154                 }
 155                 return r;
 156         } else if (call == FSC_PEEK) {
 157                 /* Peeking a nonexistent block. Report to VM. */
 158                 lmfs_zero_block_ino(dev, ino, ino_off);
 159                 return OK;
 160         } else {
 161                 /* Writing to a nonexistent block.
 162                  * Create and enter in inode.
 163                  */
 164                 if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
 165                         return(err_code);
 166         }
 167   } else if (call != FSC_WRITE) {
 168         /* Read and read ahead if convenient. */
 169         bp = rahead(rip, b, position, left);
 170   } else {
 171         /* Normally an existing block to be partially overwritten is first read
 172          * in.  However, a full block need not be read in.  If it is already in
 173          * the cache, acquire it, otherwise just acquire a free buffer.
 174          */
 175         n = (chunk == block_size ? NO_READ : NORMAL);
 176         if (off == 0 && (off_t) ex64lo(position) >= rip->i_size)
 177                 n = NO_READ;
 178         assert(ino != VMC_NO_INODE);
 179         assert(!(ino_off % block_size));
 180         if ((r = lmfs_get_block_ino(&bp, dev, b, n, ino, ino_off)) != OK)
 181                 panic("MFS: error getting block (%llu,%u): %d", dev, b, r);
 182   }
 183
 184   /* In all cases, bp now points to a valid buffer. */
 185   assert(bp != NULL);
 186
 187   if (call == FSC_WRITE && chunk != block_size &&
 188       (off_t) ex64lo(position) >= rip->i_size && off == 0) {
 189         zero_block(bp);
 190   }
 191
 192   if (call == FSC_READ) {
 193         /* Copy a chunk from the block buffer to user space. */
 194         r = fsdriver_copyout(data, buf_off, b_data(bp)+off, chunk);
 195   } else if (call == FSC_WRITE) {
 196         /* Copy a chunk from user space to the block buffer. */
 197         r = fsdriver_copyin(data, buf_off, b_data(bp)+off, chunk);
 198         MARKDIRTY(bp);
 199   }
 200
 201   put_block(bp);
 202
 203   return(r);
 204 }
 205
 206
 207 /*===========================================================================*
 208  *                              read_map                                     *
 209  *===========================================================================*/
 210 block_t read_map(rip, position, opportunistic)
 211 register struct inode *rip;     /* ptr to inode to map from */
 212 off_t position;                 /* position in file whose blk wanted */
 213 int opportunistic;              /* if nonzero, only use cache for metadata */
 214 {
 215 /* Given an inode and a position within the corresponding file, locate the
 216  * block (not zone) number in which that position is to be found and return it.
 217  */
 218
 219   struct buf *bp;
 220   zone_t z;
 221   int scale, boff, index, zind;
 222   unsigned int dzones, nr_indirects;
 223   block_t b;
 224   unsigned long excess, zone, block_pos;
 225   int iomode;
 226
 227   iomode = opportunistic ? PEEK : NORMAL;
 228
 229   scale = rip->i_sp->s_log_zone_size;   /* for block-zone conversion */
 230   block_pos = position/rip->i_sp->s_block_size; /* relative blk # in file */
 231   zone = block_pos >> scale;    /* position's zone */
 232   boff = (int) (block_pos - (zone << scale) ); /* relative blk # within zone */
 233   dzones = rip->i_ndzones;
 234   nr_indirects = rip->i_nindirs;
 235
 236   /* Is 'position' to be found in the inode itself? */
 237   if (zone < dzones) {
 238         zind = (int) zone;      /* index should be an int */
 239         z = rip->i_zone[zind];
 240         if (z == NO_ZONE) return(NO_BLOCK);
 241         b = (block_t) ((z << scale) + boff);
 242         return(b);
 243   }
 244
 245   /* It is not in the inode, so it must be single or double indirect. */
 246   excess = zone - dzones;       /* first Vx_NR_DZONES don't count */
 247
 248   if (excess < nr_indirects) {
 249         /* 'position' can be located via the single indirect block. */
 250         z = rip->i_zone[dzones];
 251   } else {
 252         /* 'position' can be located via the double indirect block. */
 253         if ( (z = rip->i_zone[dzones+1]) == NO_ZONE) return(NO_BLOCK);
 254         excess -= nr_indirects;                 /* single indir doesn't count*/
 255         b = (block_t) z << scale;
 256         ASSERT(rip->i_dev != NO_DEV);
 257         index = (int) (excess/nr_indirects);
 258         if ((unsigned int) index > rip->i_nindirs)
 259                 return(NO_BLOCK);       /* Can't go beyond double indirects */
 260         bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */
 261         if (bp == NULL)
 262                 return NO_BLOCK;                /* peeking failed */
 263         z = rd_indir(bp, index);                /* z= zone for single*/
 264         put_block(bp);                          /* release double ind block */
 265         excess = excess % nr_indirects;         /* index into single ind blk */
 266   }
 267
 268   /* 'z' is zone num for single indirect block; 'excess' is index into it. */
 269   if (z == NO_ZONE) return(NO_BLOCK);
 270   b = (block_t) z << scale;                     /* b is blk # for single ind */
 271   bp = get_block(rip->i_dev, b, iomode);        /* get single indirect block */
 272   if (bp == NULL)
 273         return NO_BLOCK;                        /* peeking failed */
 274   z = rd_indir(bp, (int) excess);               /* get block pointed to */
 275   put_block(bp);                                /* release single indir blk */
 276   if (z == NO_ZONE) return(NO_BLOCK);
 277   b = (block_t) ((z << scale) + boff);
 278   return(b);
 279 }
 280
 281 struct buf *get_block_map(register struct inode *rip, u64_t position)
 282 {
 283         struct buf *bp;
 284         int r, block_size;
 285         block_t b = read_map(rip, position, 0); /* get block number */
 286         if(b == NO_BLOCK)
 287                 return NULL;
 288         block_size = get_block_size(rip->i_dev);
 289         position = rounddown(position, block_size);
 290         assert(rip->i_num != VMC_NO_INODE);
 291         if ((r = lmfs_get_block_ino(&bp, rip->i_dev, b, NORMAL, rip->i_num,
 292             position)) != OK)
 293                 panic("MFS: error getting block (%llu,%u): %d",
 294                     rip->i_dev, b, r);
 295         return bp;
 296 }
 297
 298 /*===========================================================================*
 299  *                              rd_indir                                     *
 300  *===========================================================================*/
 301 zone_t rd_indir(bp, index)
 302 struct buf *bp;                 /* pointer to indirect block */
 303 int index;                      /* index into *bp */
 304 {
 305   struct super_block *sp;
 306   zone_t zone;
 307
 308   if(bp == NULL)
 309         panic("rd_indir() on NULL");
 310
 311   sp = &superblock;
 312
 313   /* read a zone from an indirect block */
 314   assert(sp->s_version == V3);
 315   zone = (zone_t) conv4(sp->s_native, (long) b_v2_ind(bp)[index]);
 316
 317   if (zone != NO_ZONE &&
 318                 (zone < (zone_t) sp->s_firstdatazone || zone >= sp->s_zones)) {
 319         printf("Illegal zone number %ld in indirect block, index %d\n",
 320                (long) zone, index);
 321         panic("check file system");
 322   }
 323
 324   return(zone);
 325 }
 326
 327 /*===========================================================================*
 328  *                              rahead                                       *
 329  *===========================================================================*/
 330 static struct buf *rahead(rip, baseblock, position, bytes_ahead)
 331 register struct inode *rip;     /* pointer to inode for file to be read */
 332 block_t baseblock;              /* block at current position */
 333 u64_t position;                 /* position within file */
 334 unsigned bytes_ahead;           /* bytes beyond position for immediate use */
 335 {
 336 /* Fetch a block from the cache or the device.  If a physical read is
 337  * required, prefetch as many more blocks as convenient into the cache.
 338  * This usually covers bytes_ahead and is at least BLOCKS_MINIMUM.
 339  * The device driver may decide it knows better and stop reading at a
 340  * cylinder boundary (or after an error).  Rw_scattered() puts an optional
 341  * flag on all reads to allow this.
 342  */
 343 /* Minimum number of blocks to prefetch. */
 344 # define BLOCKS_MINIMUM         32
 345   int r, scale, read_q_size;
 346   unsigned int blocks_ahead, fragment, block_size;
 347   block_t block, blocks_left;
 348   off_t ind1_pos;
 349   dev_t dev;
 350   struct buf *bp;
 351   static block64_t read_q[LMFS_MAX_PREFETCH];
 352   u64_t position_running;
 353
 354   dev = rip->i_dev;
 355   assert(dev != NO_DEV);
 356
 357   block_size = get_block_size(dev);
 358
 359   block = baseblock;
 360
 361   fragment = position % block_size;
 362   position -= fragment;
 363   position_running = position;
 364   bytes_ahead += fragment;
 365   blocks_ahead = (bytes_ahead + block_size - 1) / block_size;
 366
 367   r = lmfs_get_block_ino(&bp, dev, block, PEEK, rip->i_num, position);
 368   if (r == OK)
 369         return(bp);
 370   if (r != ENOENT)
 371         panic("MFS: error getting block (%llu,%u): %d", dev, block, r);
 372
 373   /* The best guess for the number of blocks to prefetch:  A lot.
 374    * It is impossible to tell what the device looks like, so we don't even
 375    * try to guess the geometry, but leave it to the driver.
 376    *
 377    * The floppy driver can read a full track with no rotational delay, and it
 378    * avoids reading partial tracks if it can, so handing it enough buffers to
 379    * read two tracks is perfect.  (Two, because some diskette types have
 380    * an odd number of sectors per track, so a block may span tracks.)
 381    *
 382    * The disk drivers don't try to be smart.  With todays disks it is
 383    * impossible to tell what the real geometry looks like, so it is best to
 384    * read as much as you can.  With luck the caching on the drive allows
 385    * for a little time to start the next read.
 386    *
 387    * The current solution below is a bit of a hack, it just reads blocks from
 388    * the current file position hoping that more of the file can be found.  A
 389    * better solution must look at the already available zone pointers and
 390    * indirect blocks (but don't call read_map!).
 391    */
 392
 393   blocks_left = (block_t) (rip->i_size-ex64lo(position)+(block_size-1)) /
 394                                                                 block_size;
 395
 396   /* Go for the first indirect block if we are in its neighborhood. */
 397   scale = rip->i_sp->s_log_zone_size;
 398   ind1_pos = (off_t) rip->i_ndzones * (block_size << scale);
 399   if ((off_t) ex64lo(position) <= ind1_pos && rip->i_size > ind1_pos) {
 400         blocks_ahead++;
 401         blocks_left++;
 402   }
 403
 404   /* Read at least the minimum number of blocks, but not after a seek. */
 405   if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK)
 406         blocks_ahead = BLOCKS_MINIMUM;
 407
 408   /* Can't go past end of file. */
 409   if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
 410
 411   /* No more than the maximum request. */
 412   if (blocks_ahead > LMFS_MAX_PREFETCH) blocks_ahead = LMFS_MAX_PREFETCH;
 413
 414   read_q_size = 0;
 415
 416   /* Acquire block buffers. */
 417   for (;;) {
 418         block_t thisblock;
 419         read_q[read_q_size++] = block;
 420
 421         if (--blocks_ahead == 0) break;
 422
 423         block++;
 424         position_running += block_size;
 425
 426         thisblock = read_map(rip, (off_t) ex64lo(position_running), 1);
 427         if (thisblock != NO_BLOCK) {
 428                 r = lmfs_get_block_ino(&bp, dev, thisblock, PEEK, rip->i_num,
 429                     position_running);
 430                 block = thisblock;
 431         } else
 432                 r = lmfs_get_block(&bp, dev, block, PEEK);
 433
 434         if (r == OK) {
 435                 /* Oops, block already in the cache, get out. */
 436                 put_block(bp);
 437                 break;
 438         }
 439         if (r != ENOENT)
 440                 panic("MFS: error getting block (%llu,%u): %d", dev, block, r);
 441   }
 442   lmfs_prefetch(dev, read_q, read_q_size);
 443
 444   r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position);
 445   if (r != OK)
 446         panic("MFS: error getting block (%llu,%u): %d", dev, baseblock, r);
 447   return bp;
 448 }
 449
 450
 451 /*===========================================================================*
 452  *                              fs_getdents                                  *
 453  *===========================================================================*/
 454 ssize_t fs_getdents(ino_t ino_nr, struct fsdriver_data *data, size_t bytes,
 455         off_t *posp)
 456 {
 457 #define GETDENTS_BUFSIZE        (sizeof(struct dirent) + MFS_NAME_MAX + 1)
 458 #define GETDENTS_ENTRIES        8
 459   static char getdents_buf[GETDENTS_BUFSIZE * GETDENTS_ENTRIES];
 460   struct fsdriver_dentry fsdentry;
 461   struct inode *rip, *entrip;
 462   int r, done;
 463   unsigned int block_size, len, type;
 464   off_t pos, off, block_pos, new_pos, ent_pos;
 465   struct buf *bp;
 466   struct direct *dp;
 467   char *cp;
 468
 469   /* Check whether the position is properly aligned */
 470   pos = *posp;
 471   if( (unsigned int) pos % DIR_ENTRY_SIZE)
 472           return(ENOENT);
 473
 474   if( (rip = get_inode(fs_dev, ino_nr)) == NULL)
 475           return(EINVAL);
 476
 477   block_size = rip->i_sp->s_block_size;
 478   off = (pos % block_size);             /* Offset in block */
 479   block_pos = pos - off;
 480   done = FALSE;         /* Stop processing directory blocks when done is set */
 481
 482   fsdriver_dentry_init(&fsdentry, data, bytes, getdents_buf,
 483         sizeof(getdents_buf));
 484
 485   /* The default position for the next request is EOF. If the user's buffer
 486    * fills up before EOF, new_pos will be modified. */
 487   new_pos = rip->i_size;
 488
 489   r = 0;
 490
 491   for(; block_pos < rip->i_size; block_pos += block_size) {
 492         /* Since directories don't have holes, 'bp' cannot be NULL. */
 493         bp = get_block_map(rip, block_pos);     /* get a dir block */
 494         assert(bp != NULL);
 495
 496         /* Search a directory block. */
 497         if (block_pos < pos)
 498                 dp = &b_dir(bp)[off / DIR_ENTRY_SIZE];
 499         else
 500                 dp = &b_dir(bp)[0];
 501         for (; dp < &b_dir(bp)[NR_DIR_ENTRIES(block_size)]; dp++) {
 502                 if (dp->mfs_d_ino == 0)
 503                         continue;       /* Entry is not in use */
 504
 505                 /* Compute the length of the name */
 506                 cp = memchr(dp->mfs_d_name, '\0', sizeof(dp->mfs_d_name));
 507                 if (cp == NULL)
 508                         len = sizeof(dp->mfs_d_name);
 509                 else
 510                         len = cp - (dp->mfs_d_name);
 511
 512                 /* Need the position of this entry in the directory */
 513                 ent_pos = block_pos + ((char *) dp - (char *) bp->data);
 514
 515                 /* We also need(?) the file type of the target inode. */
 516                 if (!(entrip = get_inode(fs_dev, (ino_t) dp->mfs_d_ino)))
 517                         panic("unexpected get_inode failure");
 518                 type = IFTODT(entrip->i_mode);
 519                 put_inode(entrip);
 520
 521                 /* MFS does not store file types in its directory entries, and
 522                  * fetching the mode from the inode is seriously expensive.
 523                  * Userland should always be prepared to receive DT_UNKNOWN.
 524                  */
 525                 r = fsdriver_dentry_add(&fsdentry, (ino_t) dp->mfs_d_ino,
 526                         dp->mfs_d_name, len, type);
 527
 528                 /* If the user buffer is full, or an error occurred, stop. */
 529                 if (r <= 0) {
 530                         done = TRUE;
 531
 532                         /* Record the position of this entry, it is the
 533                          * starting point of the next request (unless the
 534                          * postion is modified with lseek).
 535                          */
 536                         new_pos = ent_pos;
 537                         break;
 538                 }
 539         }
 540
 541         put_block(bp);
 542         if (done)
 543                 break;
 544   }
 545
 546   if (r >= 0 && (r = fsdriver_dentry_finish(&fsdentry)) >= 0) {
 547           *posp = new_pos;
 548           if(!rip->i_sp->s_rd_only) {
 549                   rip->i_update |= ATIME;
 550                   IN_MARKDIRTY(rip);
 551           }
 552   }
 553
 554   put_inode(rip);               /* release the inode */
 555   return(r);
 556 }