sys/kern/kern_physio.c

   1 /*      $NetBSD: kern_physio.c,v 1.90 2009/05/18 21:12:33 ad Exp $      */
   2
   3 /*-
   4  * Copyright (c) 1982, 1986, 1990, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
  37  */
  38
  39 /*-
  40  * Copyright (c) 1994 Christopher G. Demetriou
  41  *
  42  * Redistribution and use in source and binary forms, with or without
  43  * modification, are permitted provided that the following conditions
  44  * are met:
  45  * 1. Redistributions of source code must retain the above copyright
  46  *    notice, this list of conditions and the following disclaimer.
  47  * 2. Redistributions in binary form must reproduce the above copyright
  48  *    notice, this list of conditions and the following disclaimer in the
  49  *    documentation and/or other materials provided with the distribution.
  50  * 3. All advertising materials mentioning features or use of this software
  51  *    must display the following acknowledgement:
  52  *      This product includes software developed by the University of
  53  *      California, Berkeley and its contributors.
  54  * 4. Neither the name of the University nor the names of its contributors
  55  *    may be used to endorse or promote products derived from this software
  56  *    without specific prior written permission.
  57  *
  58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  68  * SUCH DAMAGE.
  69  *
  70  *      @(#)kern_physio.c       8.1 (Berkeley) 6/10/93
  71  */
  72
  73 #include <sys/cdefs.h>
  74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.90 2009/05/18 21:12:33 ad Exp $");
  75
  76 #include <sys/param.h>
  77 #include <sys/systm.h>
  78 #include <sys/buf.h>
  79 #include <sys/proc.h>
  80 #include <sys/once.h>
  81 #include <sys/workqueue.h>
  82 #include <sys/kmem.h>
  83
  84 #include <uvm/uvm_extern.h>
  85
  86 ONCE_DECL(physio_initialized);
  87 struct workqueue *physio_workqueue;
  88
  89 int physio_concurrency = 16;
  90
  91 /* #define      PHYSIO_DEBUG */
  92 #if defined(PHYSIO_DEBUG)
  93 #define DPRINTF(a)      printf a
  94 #else /* defined(PHYSIO_DEBUG) */
  95 #define DPRINTF(a)      /* nothing */
  96 #endif /* defined(PHYSIO_DEBUG) */
  97
  98 struct physio_stat {
  99         int ps_running;
 100         int ps_error;
 101         int ps_failed;
 102         off_t ps_endoffset;
 103         buf_t *ps_orig_bp;
 104         kmutex_t ps_lock;
 105         kcondvar_t ps_cv;
 106 };
 107
 108 static void
 109 physio_done(struct work *wk, void *dummy)
 110 {
 111         struct buf *bp = (void *)wk;
 112         size_t todo = bp->b_bufsize;
 113         size_t done = bp->b_bcount - bp->b_resid;
 114         struct physio_stat *ps = bp->b_private;
 115         bool is_iobuf;
 116
 117         KASSERT(&bp->b_work == wk);
 118         KASSERT(bp->b_bcount <= todo);
 119         KASSERT(bp->b_resid <= bp->b_bcount);
 120         KASSERT((bp->b_flags & B_PHYS) != 0);
 121         KASSERT(dummy == NULL);
 122
 123         vunmapbuf(bp, todo);
 124         uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
 125
 126         mutex_enter(&ps->ps_lock);
 127         is_iobuf = (bp != ps->ps_orig_bp);
 128         if (__predict_false(done != todo)) {
 129                 off_t endoffset = dbtob(bp->b_blkno) + done;
 130
 131                 /*
 132                  * we got an error or hit EOM.
 133                  *
 134                  * we only care about the first one.
 135                  * ie. the one at the lowest offset.
 136                  */
 137
 138                 KASSERT(ps->ps_endoffset != endoffset);
 139                 DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
 140                     ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
 141                     __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
 142                     bp->b_blkno, bp->b_bcount, bp->b_flags));
 143
 144                 if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
 145                         DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
 146                             " -> %" PRIu64 "\n",
 147                             __func__, ps,
 148                             ps->ps_error, bp->b_error,
 149                             ps->ps_endoffset, endoffset));
 150
 151                         ps->ps_endoffset = endoffset;
 152                         ps->ps_error = bp->b_error;
 153                 }
 154                 ps->ps_failed++;
 155         } else {
 156                 KASSERT(bp->b_error == 0);
 157         }
 158
 159         ps->ps_running--;
 160         cv_signal(&ps->ps_cv);
 161         mutex_exit(&ps->ps_lock);
 162
 163         if (is_iobuf)
 164                 putiobuf(bp);
 165 }
 166
 167 static void
 168 physio_biodone(struct buf *bp)
 169 {
 170 #if defined(DIAGNOSTIC)
 171         struct physio_stat *ps = bp->b_private;
 172         size_t todo = bp->b_bufsize;
 173         size_t done = bp->b_bcount - bp->b_resid;
 174
 175         KASSERT(ps->ps_running > 0);
 176         KASSERT(bp->b_bcount <= todo);
 177         KASSERT(bp->b_resid <= bp->b_bcount);
 178         if (done == todo)
 179                 KASSERT(bp->b_error == 0);
 180 #endif /* defined(DIAGNOSTIC) */
 181
 182         workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
 183 }
 184
 185 static void
 186 physio_wait(struct physio_stat *ps, int n)
 187 {
 188
 189         KASSERT(mutex_owned(&ps->ps_lock));
 190
 191         while (ps->ps_running > n)
 192                 cv_wait(&ps->ps_cv, &ps->ps_lock);
 193 }
 194
 195 static int
 196 physio_init(void)
 197 {
 198         int error;
 199
 200         KASSERT(physio_workqueue == NULL);
 201
 202         error = workqueue_create(&physio_workqueue, "physiod",
 203             physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
 204
 205         return error;
 206 }
 207
 208 /*
 209  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
 210  * from the raw device to user buffers, and bypasses the buffer cache.
 211  */
 212 int
 213 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
 214     void (*min_phys)(struct buf *), struct uio *uio)
 215 {
 216         struct iovec *iovp;
 217         struct lwp *l = curlwp;
 218         struct proc *p = l->l_proc;
 219         int i, error;
 220         struct buf *bp = NULL;
 221         struct physio_stat *ps;
 222         int concurrency = physio_concurrency - 1;
 223
 224         error = RUN_ONCE(&physio_initialized, physio_init);
 225         if (__predict_false(error != 0)) {
 226                 return error;
 227         }
 228
 229         DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
 230             __func__, uio->uio_offset, uio->uio_resid));
 231
 232         flags &= B_READ | B_WRITE;
 233
 234         if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL)
 235                 return ENOMEM;
 236         /* ps->ps_running = 0; */
 237         /* ps->ps_error = 0; */
 238         /* ps->ps_failed = 0; */
 239         ps->ps_orig_bp = obp;
 240         ps->ps_endoffset = -1;
 241         mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
 242         cv_init(&ps->ps_cv, "physio");
 243
 244         /* Make sure we have a buffer, creating one if necessary. */
 245         if (obp != NULL) {
 246                 mutex_enter(&bufcache_lock);
 247                 /* Mark it busy, so nobody else will use it. */
 248                 while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
 249                         ;
 250                 mutex_exit(&bufcache_lock);
 251                 concurrency = 0; /* see "XXXkludge" comment below */
 252         }
 253
 254         for (i = 0; i < uio->uio_iovcnt; i++) {
 255                 bool sync = true;
 256
 257                 iovp = &uio->uio_iov[i];
 258                 while (iovp->iov_len > 0) {
 259                         size_t todo;
 260                         vaddr_t endp;
 261
 262                         mutex_enter(&ps->ps_lock);
 263                         if (ps->ps_failed != 0) {
 264                                 goto done_locked;
 265                         }
 266                         physio_wait(ps, sync ? 0 : concurrency);
 267                         mutex_exit(&ps->ps_lock);
 268                         if (obp != NULL) {
 269                                 /*
 270                                  * XXXkludge
 271                                  * some drivers use "obp" as an identifier.
 272                                  */
 273                                 bp = obp;
 274                         } else {
 275                                 bp = getiobuf(NULL, true);
 276                                 bp->b_cflags = BC_BUSY;
 277                         }
 278                         bp->b_dev = dev;
 279                         bp->b_proc = p;
 280                         bp->b_private = ps;
 281
 282                         /*
 283                          * Mrk the buffer busy for physical I/O.  Also set
 284                          * B_PHYS because it's an I/O to user memory, and
 285                          * B_RAW because B_RAW is to be "set by physio for
 286                          * raw transfers".
 287                          */
 288                         bp->b_oflags = 0;
 289                         bp->b_cflags = BC_BUSY;
 290                         bp->b_flags = flags | B_PHYS | B_RAW;
 291                         bp->b_iodone = physio_biodone;
 292
 293                         /* Set up the buffer for a maximum-sized transfer. */
 294                         bp->b_blkno = btodb(uio->uio_offset);
 295                         if (dbtob(bp->b_blkno) != uio->uio_offset) {
 296                                 error = EINVAL;
 297                                 goto done;
 298                         }
 299                         bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
 300                         bp->b_data = iovp->iov_base;
 301
 302                         /*
 303                          * Call minphys to bound the transfer size,
 304                          * and remember the amount of data to transfer,
 305                          * for later comparison.
 306                          */
 307                         (*min_phys)(bp);
 308                         todo = bp->b_bufsize = bp->b_bcount;
 309 #if defined(DIAGNOSTIC)
 310                         if (todo > MAXPHYS)
 311                                 panic("todo(%zu) > MAXPHYS; minphys broken",
 312                                     todo);
 313 #endif /* defined(DIAGNOSTIC) */
 314
 315                         sync = false;
 316                         endp = (vaddr_t)bp->b_data + todo;
 317                         if (trunc_page(endp) != endp) {
 318                                 /*
 319                                  * Following requests can overlap.
 320                                  * note that uvm_vslock does round_page.
 321                                  */
 322                                 sync = true;
 323                         }
 324
 325                         /*
 326                          * Lock the part of the user address space involved
 327                          * in the transfer.  Beware vmapbuf(); it clobbers
 328                          * b_data and saves it in b_saveaddr.  However,
 329                          * vunmapbuf() restores it.
 330                          */
 331                         error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
 332                             (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
 333                         if (error) {
 334                                 goto done;
 335                         }
 336                         vmapbuf(bp, todo);
 337
 338                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
 339
 340                         mutex_enter(&ps->ps_lock);
 341                         ps->ps_running++;
 342                         mutex_exit(&ps->ps_lock);
 343
 344                         /* Call strategy to start the transfer. */
 345                         (*strategy)(bp);
 346                         bp = NULL;
 347
 348                         iovp->iov_len -= todo;
 349                         iovp->iov_base = (char *)iovp->iov_base + todo;
 350                         uio->uio_offset += todo;
 351                         uio->uio_resid -= todo;
 352                 }
 353         }
 354
 355 done:
 356         mutex_enter(&ps->ps_lock);
 357 done_locked:
 358         physio_wait(ps, 0);
 359         mutex_exit(&ps->ps_lock);
 360
 361         if (ps->ps_failed != 0) {
 362                 off_t delta;
 363
 364                 delta = uio->uio_offset - ps->ps_endoffset;
 365                 KASSERT(delta > 0);
 366                 uio->uio_resid += delta;
 367                 /* uio->uio_offset = ps->ps_endoffset; */
 368         } else {
 369                 KASSERT(ps->ps_endoffset == -1);
 370         }
 371         if (bp != NULL && bp != obp) {
 372                 putiobuf(bp);
 373         }
 374         if (error == 0) {
 375                 error = ps->ps_error;
 376         }
 377         mutex_destroy(&ps->ps_lock);
 378         cv_destroy(&ps->ps_cv);
 379         kmem_free(ps, sizeof(*ps));
 380
 381         /*
 382          * Clean up the state of the buffer.  Remember if somebody wants
 383          * it, so we can wake them up below.  Also, if we had to steal it,
 384          * give it back.
 385          */
 386         if (obp != NULL) {
 387                 KASSERT((obp->b_cflags & BC_BUSY) != 0);
 388
 389                 /*
 390                  * If another process is waiting for the raw I/O buffer,
 391                  * wake up processes waiting to do physical I/O;
 392                  */
 393                 mutex_enter(&bufcache_lock);
 394                 obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
 395                 obp->b_flags &= ~(B_PHYS | B_RAW);
 396                 obp->b_iodone = NULL;
 397                 cv_broadcast(&obp->b_busy);
 398                 mutex_exit(&bufcache_lock);
 399         }
 400
 401         DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
 402             __func__, uio->uio_offset, uio->uio_resid));
 403
 404         return error;
 405 }
 406
 407 /*
 408  * A minphys() routine is called by physio() to adjust the size of each
 409  * I/O transfer before the latter is passed to the strategy routine.
 410  *
 411  * This minphys() is a default that must be called to enforce limits
 412  * that are applicable to all devices, because of limitations in the
 413  * kernel or the hardware platform.
 414  */
 415 void
 416 minphys(struct buf *bp)
 417 {
 418
 419         if (bp->b_bcount > MAXPHYS)
 420                 bp->b_bcount = MAXPHYS;
 421 }