minix/tests/test74.c

   1 /* Test 74 - mmap functionality & regression test.
   2  *
   3  * This test tests some basic functionality of mmap, and also some
   4  * cases that are quite complex for the system to handle.
   5  *
   6  * Memory pages are generally made available on demand. Memory copying
   7  * is done by the kernel. As the kernel may encounter pagefaults in
   8  * legitimate memory ranges (e.g. pages that aren't mapped; pages that
   9  * are mapped RO as they are COW), it cooperates with VM to make the
  10  * mappings and let the copy succeed transparently.
  11  *
  12  * With file-mapped ranges this can result in a deadlock, if care is
 * not taken, as the copy might be requested by VFS or an FS. This test
  14  * triggers as many of these states as possible to ensure they are
  15  * successful or (where appropriate) fail gracefully, i.e. without
  16  * deadlock.
  17  *
  18  * To do this, system calls are done with source or target buffers with
  19  * missing or readonly mappings, both anonymous and file-mapped. The
  20  * cache is flushed before mmap() so that we know the mappings should
  21  * not be present on mmap() time. Then e.g. a read() or write() is
  22  * executed with that buffer as target. This triggers a FS copying
  23  * to or from a missing range that it itself is needed to map in first.
  24  * VFS detects this, requests VM to map in the pages, which does so with
  25  * the help of another VFS thread and the FS, and then re-issues the
  26  * request to the FS.
  27  *
  28  * Another case is the VFS itself does such a copy. This is actually
  29  * unusual as filenames are already faulted in by the requesting process
  30  * in libc by strlen(). select() allows such a case, however, so this
  31  * is tested too. We are satisfied if the call completes.
  32  */
  33
  34 #include <sys/types.h>
  35 #include <sys/mman.h>
  36 #include <sys/ioctl.h>
  37 #include <sys/ioc_memory.h>
  38 #include <sys/param.h>
  39 #include <minix/paths.h>
  40 #include <stdio.h>
  41 #include <assert.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #include <unistd.h>
  45 #include <fcntl.h>
  46 #include <dirent.h>
  47
  48 #include "common.h"
  49 #include "testcache.h"
  50
/* Error budget for this test: zero, so that every e() call is fatal. */
int max_error = 0;      /* make all e()'s fatal */
  52
  53 int
  54 dowriteblock(int b, int blocksize, u32_t seed, char *data)
  55 {
  56         u64_t offset;
  57         int fd;
  58
  59         get_fd_offset(b, blocksize, &offset, &fd);
  60
  61         if(pwrite(fd, data, blocksize, offset) < blocksize) {
  62                 perror("pwrite");
  63                 return -1;
  64         }
  65
  66         return blocksize;
  67 }
  68
  69 int
  70 readblock(int b, int blocksize, u32_t seed, char *data)
  71 {
  72         u64_t offset;
  73         int fd;
  74         char *mmapdata;
  75         int pread_first = random() % 2;
  76
  77         get_fd_offset(b, blocksize, &offset, &fd);
  78
  79         if(pread_first) {
  80                 if(pread(fd, data, blocksize, offset) < blocksize) {
  81                         perror("pread");
  82                         return -1;
  83                 }
  84         }
  85
  86         if((mmapdata = mmap(NULL, blocksize, PROT_READ, MAP_PRIVATE | MAP_FILE,
  87                 fd, offset)) == MAP_FAILED) {
  88                 perror("mmap");
  89                 return -1;
  90         }
  91
  92         if(!pread_first) {
  93                 if(pread(fd, data, blocksize, offset) < blocksize) {
  94                         perror("pread");
  95                         return -1;
  96                 }
  97         }
  98
  99         if(memcmp(mmapdata, data, blocksize)) {
 100                 fprintf(stderr, "readblock: mmap, pread mismatch\n");
 101                 return -1;
 102         }
 103
 104         if(munmap(mmapdata, blocksize) < 0) {
 105                 perror("munmap");
 106                 return -1;
 107         }
 108
 109         return blocksize;
 110 }
 111
 112 void testend(void) { }
 113
 114 static void do_read(void *buf, int fd, int writable)
 115 {
 116         ssize_t ret;
 117         size_t n = PAGE_SIZE;
 118         struct stat sb;
 119         if(fstat(fd, &sb) < 0) e(1);
 120         if(S_ISDIR(sb.st_mode)) return;
 121         ret = read(fd, buf, n);
 122
 123         /* if the buffer is writable, it should succeed */
 124         if(writable) { if(ret != n) e(3); return; }
 125
 126         /* if the buffer is not writable, it should fail with EFAULT */
 127         if(ret >= 0) e(4);
 128         if(errno != EFAULT) e(5);
 129 }
 130
 131 static void do_write(void *buf, int fd, int writable)
 132 {
 133         size_t n = PAGE_SIZE;
 134         struct stat sb;
 135         if(fstat(fd, &sb) < 0) e(1);
 136         if(S_ISDIR(sb.st_mode)) return;
 137         if(write(fd, buf, n) != n) e(3);
 138 }
 139
 140 static void do_stat(void *buf, int fd, int writable)
 141 {
 142         int r;
 143         r = fstat(fd, (struct stat *) buf);
 144
 145         /* should succeed if buf is writable */
 146         if(writable) { if(r < 0) e(3); return; }
 147
 148         /* should fail with EFAULT if buf is not */
 149         if(r >= 0) e(4);
 150         if(errno != EFAULT) e(5);
 151 }
 152
 153 static void do_getdents(void *buf, int fd, int writable)
 154 {
 155         struct stat sb;
 156         int r;
 157         if(fstat(fd, &sb) < 0) e(1);
 158         if(!S_ISDIR(sb.st_mode)) return;        /* OK */
 159         r = getdents(fd, buf, PAGE_SIZE);
 160         if(writable) { if(r < 0) e(3); return; }
 161
 162         /* should fail with EFAULT if buf is not */
 163         if(r >= 0) e(4);
 164         if(errno != EFAULT) e(5);
 165 }
 166
 167 static void do_readlink1(void *buf, int fd, int writable)
 168 {
 169         char target[200];
 170         /* the system call just has to fail gracefully */
 171         readlink(buf, target, sizeof(target));
 172 }
 173
/*
 * Path names used by the readlink and symlink experiments below.
 * do_readlink2() creates NODENAME as a symbolic link pointing to TARGETNAME.
 */
#define NODENAME        "a"
#define TARGETNAME      "b"
 176
 177 static void do_readlink2(void *buf, int fd, int writable)
 178 {
 179         ssize_t rl;
 180         unlink(NODENAME);
 181         if(symlink(TARGETNAME, NODENAME) < 0) e(1);
 182         rl=readlink(NODENAME, buf, sizeof(buf));
 183
 184         /* if buf is writable, it should succeed, with a certain result */
 185         if(writable) {
 186                 if(rl < 0) e(2);
 187                 ((char *) buf)[rl] = '\0';
 188                 if(strcmp(buf, TARGETNAME)) {
 189                         fprintf(stderr, "readlink: expected %s, got %s\n",
 190                                 TARGETNAME, (char *)buf);
 191                         e(3);
 192                 }
 193                 return;
 194         }
 195
 196         /* if buf is not writable, it should fail with EFAULT */
 197         if(rl >= 0) e(4);
 198
 199         if(errno != EFAULT) e(5);
 200 }
 201
 202 static void do_symlink1(void *buf, int fd, int writable)
 203 {
 204         /* the system call just has to fail gracefully */
 205         (void)symlink(buf, NODENAME);
 206 }
 207
 208 static void do_symlink2(void *buf, int fd, int writable)
 209 {
 210         /* the system call just has to fail gracefully */
 211         (void)symlink(NODENAME, buf);
 212 }
 213
 214 static void do_open(void *buf, int fd, int writable)
 215 {
 216         int r;
 217         /* the system call just has to fail gracefully */
 218         r = open(buf, O_RDONLY);
 219         if(r >= 0) close(r);
 220 }
 221
 222 static void do_select1(void *buf, int fd, int writable)
 223 {
 224         struct timeval timeout = { 0, 200000 }; /* 0.2 sec */
 225         /* the system call just has to fail gracefully */
 226         (void)select(1, buf, NULL, NULL, &timeout);
 227 }
 228
 229 static void do_select2(void *buf, int fd, int writable)
 230 {
 231         struct timeval timeout = { 0, 200000 }; /* 1 sec */
 232         /* the system call just has to fail gracefully */
 233         (void)select(1, NULL, buf, NULL, &timeout);
 234 }
 235
 236 static void do_select3(void *buf, int fd, int writable)
 237 {
 238         struct timeval timeout = { 0, 200000 }; /* 1 sec */
 239         /* the system call just has to fail gracefully */
 240         (void)select(1, NULL, NULL, buf, &timeout);
 241 }
 242
 243 static void fillfile(int fd, int size)
 244 {
 245         char *buf = malloc(size);
 246
 247         if(size < 1 || size % PAGE_SIZE || !buf) { e(1); }
 248         memset(buf, 'A', size);
 249         buf[50] = '\0'; /* so it can be used as a filename arg */
 250         buf[size-1] = '\0';
 251         if(write(fd, buf, size) != size) { e(2); }
 252         if(lseek(fd, SEEK_SET, 0) < 0) { e(3); }
 253         free(buf);
 254 }
 255
 256 static void make_buffers(int size,
 257         int *ret_fd_rw, int *ret_fd_ro,
 258         void **filebuf_rw, void **filebuf_ro, void **anonbuf)
 259 {
 260         char fn_rw[] = "testfile_rw.XXXXXX", fn_ro[] = "testfile_ro.XXXXXX";
 261         *ret_fd_rw = mkstemp(fn_rw);
 262         *ret_fd_ro = mkstemp(fn_ro);
 263
 264         if(size < 1 || size % PAGE_SIZE) { e(2); }
 265         if(*ret_fd_rw < 0) { e(1); }
 266         if(*ret_fd_ro < 0) { e(1); }
 267         fillfile(*ret_fd_rw, size);
 268         fillfile(*ret_fd_ro, size);
 269         if(fcntl(*ret_fd_rw, F_FLUSH_FS_CACHE) < 0) { e(4); }
 270         if(fcntl(*ret_fd_ro, F_FLUSH_FS_CACHE) < 0) { e(4); }
 271
 272         if((*filebuf_rw = mmap(0, size, PROT_READ | PROT_WRITE,
 273                 MAP_PRIVATE | MAP_FILE, *ret_fd_rw, 0)) == MAP_FAILED) {
 274                 e(5);
 275                 quit();
 276         }
 277
 278         if((*filebuf_ro = mmap(0, size, PROT_READ,
 279                 MAP_PRIVATE | MAP_FILE, *ret_fd_ro, 0)) == MAP_FAILED) {
 280                 e(5);
 281                 quit();
 282         }
 283
 284         if((*anonbuf = mmap(0, size, PROT_READ | PROT_WRITE,
 285                 MAP_PRIVATE | MAP_ANON, -1, 0)) == MAP_FAILED) {
 286                 e(6);
 287                 quit();
 288         }
 289
 290         if(unlink(fn_rw) < 0) { e(12); }
 291         if(unlink(fn_ro) < 0) { e(12); }
 292 }
 293
 294 static void forget_buffers(void *buf1, void *buf2, void *buf3, int fd1, int fd2, int size)
 295 {
 296         if(munmap(buf1, size) < 0) { e(1); }
 297         if(munmap(buf2, size) < 0) { e(2); }
 298         if(munmap(buf3, size) < 0) { e(2); }
 299         if(fcntl(fd1, F_FLUSH_FS_CACHE) < 0) { e(3); }
 300         if(fcntl(fd2, F_FLUSH_FS_CACHE) < 0) { e(3); }
 301         if(close(fd1) < 0) { e(4); }
 302         if(close(fd2) < 0) { e(4); }
 303 }
 304
 305 #define NEXPERIMENTS 12
 306 struct {
 307         void (*do_operation)(void * buf, int fd, int writable);
 308 } experiments[NEXPERIMENTS] = {
 309         { do_read },
 310         { do_write },
 311         { do_stat },
 312         { do_getdents },
 313         { do_readlink1 },
 314         { do_readlink2 },
 315         { do_symlink1 },
 316         { do_symlink2 },
 317         { do_open, },
 318         { do_select1 },
 319         { do_select2 },
 320         { do_select3 },
 321 };
 322
 323 static void test_memory_types_vs_operations(void)
 324 {
 325 #define NFDS 4
 326 #define BUFSIZE (10 * PAGE_SIZE)
 327         int exp, fds[NFDS];
 328         int f = 0, size = BUFSIZE;
 329
 330         /* open some test fd's */
 331 #define OPEN(fn, mode) { assert(f >= 0 && f < NFDS); \
 332         fds[f] = open(fn, mode); if(fds[f] < 0) { e(2); } f++; }
 333         OPEN("regular", O_RDWR | O_CREAT);
 334         OPEN(".", O_RDONLY);
 335         OPEN("/dev/ram", O_RDWR);
 336         OPEN("/dev/zero", O_RDWR);
 337
 338         /* make sure the regular file has plenty of size to play with */
 339         fillfile(fds[0], BUFSIZE);
 340
 341         /* and the ramdisk too */
 342         if(ioctl(fds[2], MIOCRAMSIZE, &size) < 0) { e(3); }
 343
 344         for(exp = 0; exp < NEXPERIMENTS; exp++) {
 345                 for(f = 0; f < NFDS; f++) {
 346                         void *anonmem, *filemem_rw, *filemem_ro;
 347                         int buffd_rw, buffd_ro;
 348
 349                         make_buffers(BUFSIZE, &buffd_rw, &buffd_ro,
 350                                 &filemem_rw, &filemem_ro, &anonmem);
 351
 352                         if(lseek(fds[f], 0, SEEK_SET) != 0) { e(10); }
 353                         experiments[exp].do_operation(anonmem, fds[f], 1);
 354
 355                         if(lseek(fds[f], 0, SEEK_SET) != 0) { e(11); }
 356                         experiments[exp].do_operation(filemem_rw, fds[f], 1);
 357
 358                         if(lseek(fds[f], 0, SEEK_SET) != 0) { e(12); }
 359                         experiments[exp].do_operation(filemem_ro, fds[f], 0);
 360
 361                         forget_buffers(filemem_rw, filemem_ro, anonmem, buffd_rw, buffd_ro, BUFSIZE);
 362                 }
 363         }
 364 }
 365
 366 static void basic_regression(void)
 367 {
 368         int fd, fd1, fd2;
 369         ssize_t rb, wr;
 370         char buf[PAGE_SIZE*2];
 371         void *block, *block1, *block2;
 372 #define BLOCKSIZE (PAGE_SIZE*10)
 373         block = mmap(0, BLOCKSIZE, PROT_READ | PROT_WRITE,
 374                 MAP_PRIVATE | MAP_ANON, -1, 0);
 375
 376         if(block == MAP_FAILED) { e(1); }
 377
 378         memset(block, 0, BLOCKSIZE);
 379
 380         /* shrink from bottom */
 381         munmap(block, PAGE_SIZE);
 382
 383         /* Next test: use a system call write() to access a block of
 384          * unavailable file-mapped memory.
 385          *
 386          * This is a thorny corner case to make succeed transparently
 387          * because
 388          *   (1) it is a filesystem that is doing the memory access
 389          *       (copy from the constblock1 range in this process to the
 390          *       FS) but is also the FS needed to satisfy the range if it
 391          *       isn't in the cache.
 392          *   (2) there are two separate memory regions involved, requiring
 393          *       separate VFS requests from VM to properly satisfy, requiring
 394          *       some complex state to be kept.
 395          */
 396
 397         fd1 = open("../testsh1", O_RDONLY);
 398         if (fd1 < 0) fd1 = open("../testsh1.sh", O_RDONLY);
 399         fd2 = open("../testsh2", O_RDONLY);
 400         if (fd2 < 0) fd2 = open("../testsh2.sh", O_RDONLY);
 401         if(fd1 < 0 || fd2 < 0) { e(2); }
 402
 403         /* just check that we can't mmap() a file writable */
 404         if(mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FILE, fd1, 0) != MAP_FAILED) {
 405                 e(1);
 406         }
 407
 408         /* check that we can mmap() a file MAP_SHARED readonly */
 409         if(mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED | MAP_FILE, fd1, 0) == MAP_FAILED) {
 410                 e(1);
 411         }
 412
 413         /* clear cache of files before mmap so pages won't be present already */
 414         if(fcntl(fd1, F_FLUSH_FS_CACHE) < 0) { e(1); }
 415         if(fcntl(fd2, F_FLUSH_FS_CACHE) < 0) { e(1); }
 416
 417 #define LOCATION1 (void *) 0x90000000
 418 #define LOCATION2 ((void *)((char *)LOCATION1 + PAGE_SIZE))
 419         block1 = mmap(LOCATION1, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd1, 0);
 420         if(block1 == MAP_FAILED) { e(4); }
 421         if(block1 != LOCATION1) { e(5); }
 422
 423         block2 = mmap(LOCATION2, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd2, 0);
 424         if(block2 == MAP_FAILED) { e(10); }
 425         if(block2 != LOCATION2) { e(11); }
 426
 427         unlink("testfile");
 428         fd = open("testfile", O_CREAT | O_RDWR);
 429         if(fd < 0) { e(15); }
 430
 431         /* write() using the mmap()ped memory as buffer */
 432
 433         if((wr=write(fd, LOCATION1, sizeof(buf))) != sizeof(buf)) {
 434                 fprintf(stderr, "wrote %zd bytes instead of %zd\n",
 435                         wr, sizeof(buf));
 436                 e(20);
 437                 quit();
 438         }
 439
 440         /* verify written contents */
 441
 442         if((rb=pread(fd, buf, sizeof(buf), 0)) != sizeof(buf)) {
 443                 if(rb < 0) perror("pread");
 444                 fprintf(stderr, "wrote %zd bytes\n", wr);
 445                 fprintf(stderr, "read %zd bytes instead of %zd\n",
 446                         rb, sizeof(buf));
 447                 e(21);
 448                 quit();
 449         }
 450
 451         if(memcmp(buf, LOCATION1, sizeof(buf))) {
 452                 e(22);
 453                 quit();
 454         }
 455
 456         close(fd);
 457         close(fd1);
 458         close(fd2);
 459
 460 }
 461
 462 /*
 463  * Test mmap on none-dev file systems - file systems that do not have a buffer
 464  * cache and therefore have to fake mmap support.  We use procfs as target.
 465  * The idea is that while we succeed in mapping in /proc/uptime, we also get
 466  * a new uptime value every time we map in the page -- VM must not cache it.
 467  */
 468 static void
 469 nonedev_regression(void)
 470 {
 471         int fd, fd2;
 472         char *buf;
 473         unsigned long uptime1, uptime2, uptime3;
 474
 475         subtest++;
 476
 477         if ((fd = open(_PATH_PROC "uptime", O_RDONLY)) < 0) e(1);
 478
 479         buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0);
 480         if (buf == MAP_FAILED) e(2);
 481
 482         if (buf[4095] != 0) e(3);
 483
 484         if ((uptime1 = atoi(buf)) == 0) e(4);
 485
 486         if (munmap(buf, 4096) != 0) e(5);
 487
 488         sleep(2);
 489
 490         buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE,
 491             fd, 0);
 492         if (buf == MAP_FAILED) e(6);
 493
 494         if (buf[4095] != 0) e(7);
 495
 496         if ((uptime2 = atoi(buf)) == 0) e(8);
 497
 498         if (uptime1 == uptime2) e(9);
 499
 500         if (munmap(buf, 4096) != 0) e(10);
 501
 502         sleep(2);
 503
 504         buf = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
 505         if (buf == MAP_FAILED) e(11);
 506
 507         if (buf[4095] != 0) e(12);
 508
 509         if ((uptime3 = atoi(buf)) == 0) e(13);
 510
 511         if (uptime1 == uptime3) e(14);
 512         if (uptime2 == uptime3) e(15);
 513
 514         if (munmap(buf, 4096) != 0) e(16);
 515
 516         /* Also test page faults not incurred by the process itself. */
 517         if ((fd2 = open("testfile", O_CREAT | O_TRUNC | O_WRONLY)) < 0) e(17);
 518
 519         if (unlink("testfile") != 0) e(18);
 520
 521         buf = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0);
 522         if (buf == MAP_FAILED) e(19);
 523
 524         if (write(fd2, buf, 10) != 10) e(20);
 525
 526         if (munmap(buf, 4096) != 0) e(21);
 527
 528         close(fd2);
 529         close(fd);
 530 }
 531
/*
 * Regression test for a nasty memory-mapped file corruption bug, which is not
 * easy to reproduce but, before being solved, did occur in practice every once
 * in a while.  The executive summary is that through stale inode associations,
 * VM could end up using an old block to satisfy a memory mapping.
 *
 * This subtest relies on a number of assumptions regarding allocation and
 * reuse of inode numbers and blocks.  These assumptions hold for MFS but
 * possibly no other file system.  However, if the subtest's assumptions are
 * not met, it will simply succeed.
 *
 * The exact order of the file operations below is what sets up the scenario;
 * do not reorder them.  All failures are reported as e(0) under subtest 1.
 */
static void
corruption_regression(void)
{
        char *ptr, *buf;
        struct statvfs sf;
        struct stat st;
        size_t block_size;
        off_t size;
        int fd, fd2;

        subtest = 1;

        /* All writes below are sized in units of the file system block. */
        if (statvfs(".", &sf) != 0) e(0);
        block_size = sf.f_bsize;

        /* Scratch buffer of two blocks, refilled for each file below. */
        if ((buf = malloc(block_size * 2)) == NULL) e(0);

        /*
         * We first need a file that is just large enough that it requires the
         * allocation of a metadata block - an indirect block - when more data
         * is written to it.  This is fileA.  We keep it open throughout the
         * test so we can unlink it immediately.
         */
        if ((fd = open("fileA", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
                e(0);
        if (unlink("fileA") != 0) e(0);

        /*
         * Write to fileA until its next block requires the allocation of an
         * additional metadata block - an indirect block.
         */
        size = 0;
        memset(buf, 'A', block_size);
        do {
                /*
                 * Repeatedly write an extra block, until the file consists of
                 * more blocks than just the file data.
                 */
                if (write(fd, buf, block_size) != block_size) e(0);
                size += block_size;
                if (size >= block_size * 64) {
                        /*
                         * It doesn't look like this is going to work.
                         * Skip this subtest altogether.
                         */
                        if (close(fd) != 0) e(0);
                        free(buf);

                        return;
                }
                if (fstat(fd, &st) != 0) e(0);
        } while (st.st_blocks * 512 == size);

        /* Once we get there, go one step back by truncating by one block. */
        size -= block_size; /* for MFS, size will end up being 7*block_size */
        if (ftruncate(fd, size) != 0) e(0);

        /*
         * Create a first file, fileB, and write two blocks to it.  FileB's
         * blocks are going to end up in the secondary VM cache, associated to
         * fileB's inode number (and two different offsets within the file).
         * The block cache does not know about files getting deleted, so we can
         * unlink fileB immediately after creating it.  So far so good.
         */
        if ((fd2 = open("fileB", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
                e(0);
        if (unlink("fileB") != 0) e(0);
        memset(buf, 'B', block_size * 2);
        if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
        if (close(fd2) != 0) e(0);

        /*
         * Write one extra block to fileA, hoping that this causes allocation
         * of a metadata block as well.  This is why we tried to get fileA to
         * the point that one more block would also require the allocation of a
         * metadata block.  Our intent is to recycle the blocks that we just
         * allocated and freed for fileB.  As of writing, for the metadata
         * block, this will *not* break the association with fileB's inode,
         * which by itself is not a problem, yet crucial to reproducing
         * the actual problem a bit later.  Note that the test does not rely on
         * whether the file system allocates the data block or the metadata
         * block first, although it does need reverse deallocation (see below).
         */
        memset(buf, 'A', block_size);
        if (write(fd, buf, block_size) != block_size) e(0);

        /*
         * Create a new file, fileC, which recycles the inode number of fileB,
         * but uses two new blocks to store its data.  These new blocks will
         * get associated to the fileB inode number, and one of them will
         * thereby eclipse (but not remove) the association of fileA's metadata
         * block to the inode of fileB.
         */
        if ((fd2 = open("fileC", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
                e(0);
        if (unlink("fileC") != 0) e(0);
        memset(buf, 'C', block_size * 2);
        if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
        if (close(fd2) != 0) e(0);

        /*
         * Free up the extra fileA blocks for reallocation, in particular
         * including the metadata block.  Again, this will not affect the
         * contents of the VM cache in any way.  FileA's metadata block remains
         * cached in VM, with the inode association for fileB's block.
         */
        if (ftruncate(fd, size) != 0) e(0);

        /*
         * Now create yet one more file, fileD, which also recycles the inode
         * number of fileB and fileC.  Write two blocks to it; these blocks
         * should recycle the blocks we just freed.  One of these is fileA's
         * just-freed metadata block, for which the new inode association will
         * be equal to the inode association it had already (as long as blocks
         * are freed in reverse order of their allocation, which happens to be
         * the case for MFS).  As a result, the block is not updated in the VM
         * cache, and VM will therefore continue to see the inode association
         * for the corresponding block of fileC which is still in the VM cache.
         */
        if ((fd2 = open("fileD", O_CREAT | O_TRUNC | O_RDWR, 0600)) == -1)
                e(0);
        memset(buf, 'D', block_size * 2);
        if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);

        /*
         * NOTE(review): the flags carry neither MAP_SHARED nor MAP_PRIVATE,
         * one of which POSIX requires - confirm that this is intended and
         * accepted by the target system.
         */
        ptr = mmap(NULL, block_size * 2, PROT_READ, MAP_FILE, fd2, 0);
        if (ptr == MAP_FAILED) e(0);

        /*
         * Finally, we can test the issue.  Since fileC's block is still the
         * block for which VM has the corresponding inode association, VM will
         * now find and map in fileC's block, instead of fileD's block.  The
         * result is that we get a memory-mapped area with stale contents,
         * different from those of the underlying file.
         */
        if (memcmp(buf, ptr, block_size * 2)) e(0);

        /* Clean up. */
        if (munmap(ptr, block_size * 2) != 0) e(0);

        /* Unlike the other files, fileD is unlinked only at this point. */
        if (close(fd2) != 0) e(0);
        if (unlink("fileD") != 0) e(0);

        if (close(fd) != 0) e(0);

        free(buf);
}
 689
 690 /*
 691  * Test mmap on file holes.  Holes are a tricky case with the current VM
 692  * implementation.  There are two main issues.  First, whenever a file data
 693  * block is freed, VM has to know about this, or it will later blindly map in
 694  * the old data.  This, file systems explicitly tell VM (through libminixfs)
 695  * whenever a block is freed, upon which VM cache forgets the block.  Second,
 696  * blocks are accessed primarily by a <dev,dev_off> pair and only additionally
 697  * by a <ino,ino_off> pair.  Holes have no meaningful value for the first pair,
 698  * but do need to be registered in VM with the second pair, or accessing them
 699  * will generate a segmentation fault.  Thus, file systems explicitly tell VM
 700  * (through libminixfs) when a hole is being peeked; libminixfs currently fakes
 701  * a device offset to make this work.
 702  */
 703 static void
 704 hole_regression(void)
 705 {
 706         struct statvfs st;
 707         size_t block_size;
 708         char *buf;
 709         int fd;
 710
 711         if (statvfs(".", &st) < 0) e(1);
 712
 713         block_size = st.f_bsize;
 714
 715         if ((buf = malloc(block_size)) == NULL) e(2);
 716
 717         if ((fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR)) < 0) e(3);
 718
 719         if (unlink("testfile") != 0) e(4);
 720
 721         /*
 722          * We perform the test twice, in a not-so-perfect attempt to test the
 723          * two aspects independently.  The first part immediately creates a
 724          * hole, and is supposed to fail only if reporting holes to VM does not
 725          * work.  However, it may also fail if a page for a previous file with
 726          * the same inode number as "testfile" is still in the VM cache.
 727          */
 728         memset(buf, 12, block_size);
 729
 730         if (write(fd, buf, block_size) != block_size) e(5);
 731
 732         if (lseek(fd, block_size * 2, SEEK_CUR) != block_size * 3) e(6);
 733
 734         memset(buf, 78, block_size);
 735
 736         if (write(fd, buf, block_size) != block_size) e(7);
 737
 738         free(buf);
 739
 740         if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
 741             fd, 0)) == MAP_FAILED) e(8);
 742
 743         if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(9);
 744         if (buf[1 * block_size] !=  0 || buf[2 * block_size - 1] !=  0) e(10);
 745         if (buf[2 * block_size] !=  0 || buf[3 * block_size - 1] !=  0) e(11);
 746         if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(12);
 747
 748         if (munmap(buf, 4 * block_size) != 0) e(13);
 749
 750         /*
 751          * The second part first creates file content and only turns part of it
 752          * into a file hole, thus ensuring that VM has previously cached pages
 753          * for the blocks that are freed.  The test will fail if VM keeps the
 754          * pages around in its cache.
 755          */
 756         if ((buf = malloc(block_size)) == NULL) e(14);
 757
 758         if (lseek(fd, block_size, SEEK_SET) != block_size) e(15);
 759
 760         memset(buf, 34, block_size);
 761
 762         if (write(fd, buf, block_size) != block_size) e(16);
 763
 764         memset(buf, 56, block_size);
 765
 766         if (write(fd, buf, block_size) != block_size) e(17);
 767
 768         if (ftruncate(fd, block_size) != 0) e(18);
 769
 770         if (lseek(fd, block_size * 3, SEEK_SET) != block_size * 3) e(19);
 771
 772         memset(buf, 78, block_size);
 773
 774         if (write(fd, buf, block_size) != block_size) e(20);
 775
 776         free(buf);
 777
 778         if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
 779             fd, 0)) == MAP_FAILED) e(21);
 780
 781         if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(22);
 782         if (buf[1 * block_size] !=  0 || buf[2 * block_size - 1] !=  0) e(23);
 783         if (buf[2 * block_size] !=  0 || buf[3 * block_size - 1] !=  0) e(24);
 784         if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(25);
 785
 786         if (munmap(buf, 4 * block_size) != 0) e(26);
 787
 788         close(fd);
 789 }
 790
 791 /*
 792  * Test that soft faults during file system I/O do not cause functions to
 793  * return partial I/O results.
 794  *
 795  * We refer to the faults that are caused internally within the operating
 796  * system as a result of the deadlock mitigation described at the top of this
 797  * file, as a particular class of "soft faults".  Such soft faults may occur in
 798  * the middle of an I/O operation, and general I/O semantics dictate that upon
 799  * partial success, the partial success is returned (and *not* an error).  As a
 800  * result, these soft faults, if not handled as special cases, may cause even
 801  * common file system operations such as read(2) on a regular file to return
 802  * fewer bytes than requested.  Such unexpected short reads are typically not
 803  * handled well by userland, and the OS must prevent them from occurring if it
 804  * can.  Note that read(2) is the most problematic, but certainly not the only,
 805  * case where this problem can occur.
 806  *
 807  * Unfortunately, several file system services are not following the proper
 808  * general I/O semantics - and this includes MFS.  Therefore, for now, we have
 809  * to test this case using block device I/O, which does do the right thing.
 810  * In this test we hope that the root file system is mounted on a block device
 811  * usable for (read-only!) testing purposes.
 812  */
 813 static void
 814 softfault_partial(void)
 815 {
 816         struct statvfs stf;
 817         struct stat st;
 818         char *buf, *buf2;
 819         ssize_t size;
 820         int fd;
 821
 822         if (statvfs("/", &stf) != 0) e(0);
 823
 824         /*
 825          * If the root file system is not mounted off a block device, or if we
 826          * cannot open that device ourselves, simply skip this subtest.
 827          */
 828         if (stat(stf.f_mntfromname, &st) != 0 || !S_ISBLK(st.st_mode))
 829                 return; /* skip subtest */
 830
 831         if ((fd = open(stf.f_mntfromname, O_RDONLY)) == -1)
 832                 return; /* skip subtest */
 833
 834         /*
 835          * See if we can read in the first two full blocks, or two pages worth
 836          * of data, whichever is larger.  If that fails, there is no point in
 837          * continuing the test.
 838          */
 839         size = MAX(stf.f_bsize, PAGE_SIZE) * 2;
 840
 841         if ((buf = mmap(NULL, size, PROT_READ | PROT_READ,
 842             MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) e(0);
 843
 844         if (read(fd, buf, size) != size) {
 845                 munmap(buf, size);
 846                 close(fd);
 847                 return; /* skip subtest */
 848         }
 849
 850         lseek(fd, 0, SEEK_SET);
 851
 852         /*
 853          * Now attempt a read to a partially faulted-in buffer.  The first time
 854          * around, the I/O transfer will generate a fault and return partial
 855          * success.  In that case, the entire I/O transfer should be retried
 856          * after faulting in the missing page(s), thus resulting in the read
 857          * succeeding in full.
 858          */
 859         if ((buf2 = mmap(NULL, size, PROT_READ | PROT_READ,
 860             MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) e(0);
 861         buf2[0] = '\0'; /* fault in the first page */
 862
 863         if (read(fd, buf2, size) != size) e(0);
 864
 865         /* The result should be correct, too. */
 866         if (memcmp(buf, buf2, size)) e(0);
 867
 868         /* Clean up. */
 869         munmap(buf2, size);
 870         munmap(buf, size);
 871
 872         close(fd);
 873 }
 874
 875 int
 876 main(int argc, char *argv[])
 877 {
 878         int i, iter = 2;
 879
 880         start(74);
 881
 882         basic_regression();
 883
 884         nonedev_regression();
 885
 886         /*
 887          * Any inode or block allocation happening concurrently with this
 888          * subtest will make the subtest succeed without testing the actual
 889          * issue.  Thus, repeat the subtest a fair number of times.
 890          */
 891         for (i = 0; i < 10; i++)
 892                 corruption_regression();
 893
 894         hole_regression();
 895
 896         test_memory_types_vs_operations();
 897
 898         softfault_partial();
 899
 900         makefiles(MAXFILES);
 901
 902         cachequiet(!bigflag);
 903         if(bigflag) iter = 3;
 904
 905         /* Try various combinations working set sizes
 906          * and block sizes in order to specifically
 907          * target the primary cache, then primary+secondary
 908          * cache, then primary+secondary cache+secondary
 909          * cache eviction.
 910          */
 911
 912         if(dotest(PAGE_SIZE,    100, iter)) e(5);
 913         if(dotest(PAGE_SIZE*2,  100, iter)) e(2);
 914         if(dotest(PAGE_SIZE*3,  100, iter)) e(3);
 915         if(dotest(PAGE_SIZE,  20000, iter)) e(5);
 916
 917         if(bigflag) {
 918                 u32_t totalmem, freemem, cachedmem;
 919                 if(dotest(PAGE_SIZE,  150000, iter)) e(5);
 920                 getmem(&totalmem, &freemem, &cachedmem);
 921                 if(dotest(PAGE_SIZE,  totalmem*1.5, iter)) e(6);
 922         }
 923
 924         quit();
 925
 926         return 0;
 927 }
 928