/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

#include "thr_uberdata.h"
#include <sys/param.h>
static int _aio_hash_insert(aio_result_t *, aio_req_t *);
static aio_req_t *_aio_req_get(aio_worker_t *);
static void _aio_req_add(aio_req_t *, aio_worker_t **, int);
static void _aio_req_del(aio_worker_t *, aio_req_t *, int);
static void _aio_work_done(aio_worker_t *);
static void _aio_enq_doneq(aio_req_t *);

extern void _aio_lio_free(aio_lio_t *);

extern int __fdsync(int, int);
extern int __fcntl(int, int, ...);
extern int _port_dispatch(int, int, int, int, uintptr_t, void *);

static int _aio_fsync_del(aio_worker_t *, aio_req_t *);
static void _aiodone(aio_req_t *, ssize_t, int);
static void _aio_cancel_work(aio_worker_t *, int, int *, int *);
static void _aio_finish_request(aio_worker_t *, ssize_t, int);
/*
 * switch for kernel async I/O
 */
int _kaio_ok = 0;               /* 0 = disabled, 1 = on, -1 = error */

/*
 * Key for thread-specific data
 */
pthread_key_t _aio_key;

/*
 * Array for determining whether or not a file supports kaio.
 * Initialized in _kaio_init().
 */
uint32_t *_kaio_supported = NULL;

/*
 * workers for read/write requests
 * (__aio_mutex lock protects circular linked list of workers)
 */
aio_worker_t *__workers_rw;     /* circular list of AIO workers */
aio_worker_t *__nextworker_rw;  /* next worker in list of workers */
int __rw_workerscnt;            /* number of read/write workers */

/*
 * worker for notification requests.
 */
aio_worker_t *__workers_no;     /* circular list of AIO workers */
aio_worker_t *__nextworker_no;  /* next worker in list of workers */
int __no_workerscnt;            /* number of notification workers */

aio_req_t *_aio_done_tail;      /* list of done requests */
aio_req_t *_aio_done_head;

mutex_t __aio_initlock = DEFAULTMUTEX;  /* makes aio initialization atomic */
cond_t __aio_initcv = DEFAULTCV;
int __aio_initbusy = 0;

mutex_t __aio_mutex = DEFAULTMUTEX;     /* protects counts, and linked lists */
cond_t _aio_iowait_cv = DEFAULTCV;      /* wait for userland I/Os */

pid_t __pid = (pid_t)-1;        /* initialize as invalid pid */
int _sigio_enabled = 0;         /* when set, send SIGIO signal */

aio_hash_t *_aio_hash;

aio_req_t *_aio_doneq;          /* doubly linked done queue list */

int _aio_donecnt = 0;
int _aio_waitncnt = 0;          /* # of requests for aio_waitn */
int _aio_doneq_cnt = 0;
int _aio_outstand_cnt = 0;      /* # of outstanding requests */
int _kaio_outstand_cnt = 0;     /* # of outstanding kaio requests */
int _aio_req_done_cnt = 0;      /* req. done but not in "done queue" */
int _aio_kernel_suspend = 0;    /* active kernel kaio calls */
int _aio_suscv_cnt = 0;         /* aio_suspend calls waiting on cv's */

int _max_workers = 256;         /* max number of workers permitted */
int _min_workers = 4;           /* min number of workers */
int _minworkload = 2;           /* min number of requests in q */
int _aio_worker_cnt = 0;        /* number of workers to do requests */
int __uaio_ok = 0;              /* AIO has been enabled */
sigset_t _worker_set;           /* worker's signal mask */

int _aiowait_flag = 0;          /* when set, aiowait() is in progress */
int _aio_flags = 0;             /* see asyncio.h defines for */

aio_worker_t *_kaiowp = NULL;   /* points to kaio cleanup thread */

int hz;                         /* clock ticks per second */
_kaio_supported_init(void)

    if (_kaio_supported != NULL)    /* already initialized */

    size = MAX_KAIO_FDARRAY_SIZE * sizeof (uint32_t);
    ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
    if (ptr == MAP_FAILED)

    _kaio_supported = ptr;
/*
 * The aio subsystem is initialized when an AIO request is made.
 * Constants are initialized, such as the maximum number of workers
 * that the subsystem can create and the minimum number of workers
 * permitted before imposing some restrictions.  Also, some
 * workers are created.
 */

    lmutex_lock(&__aio_initlock);
    (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
    while (__aio_initbusy)
        (void) cond_wait(&__aio_initcv, &__aio_initlock);
    (void) pthread_setcancelstate(cancel_state, NULL);
    if (__uaio_ok) {    /* already initialized */
        lmutex_unlock(&__aio_initlock);

    lmutex_unlock(&__aio_initlock);

    hz = (int)sysconf(_SC_CLK_TCK);

    setup_cancelsig(SIGAIOCANCEL);

    if (_kaio_supported_init() != 0)

    /*
     * Allocate and initialize the hash table.
     * Do this only once, even if __uaio_init() is called twice.
     */
    if (_aio_hash == NULL) {
        /* LINTED pointer cast */
        _aio_hash = mmap(NULL,
            HASHSZ * sizeof (aio_hash_t), PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
        if ((void *)_aio_hash == MAP_FAILED) {

        for (i = 0; i < HASHSZ; i++)
            (void) mutex_init(&_aio_hash[i].hash_lock,

    /*
     * Initialize worker's signal mask to only catch SIGAIOCANCEL.
     */
    (void) sigfillset(&_worker_set);
    (void) sigdelset(&_worker_set, SIGAIOCANCEL);

    /*
     * Create one worker to send asynchronous notifications.
     * Do this only once, even if __uaio_init() is called twice.
     */
    if (__no_workerscnt == 0 &&
        (_aio_create_worker(NULL, AIONOTIFY) != 0)) {

    /*
     * Create the minimum number of read/write workers.
     * And later check whether at least one worker is created;
     * lwp_create() calls could fail because of segkp exhaustion.
     */
    for (i = 0; i < _min_workers; i++)
        (void) _aio_create_worker(NULL, AIOREAD);
    if (__rw_workerscnt == 0) {

    lmutex_lock(&__aio_initlock);

    (void) cond_broadcast(&__aio_initcv);
    lmutex_unlock(&__aio_initlock);
/*
 * Called from close() before actually performing the real _close().
 */

    if (fd < 0)    /* avoid cancelling everything */

    /*
     * Cancel all outstanding aio requests for this file descriptor.
     */
    (void) aiocancel_all(fd);

    /*
     * If we have allocated the bit array, clear the bit for this file.
     * The next open may re-use this file descriptor and the new file
     * may have different kaio() behaviour.
     */
    if (_kaio_supported != NULL)
        CLEAR_KAIO_SUPPORTED(fd);
/*
 * special kaio cleanup thread sits in a loop in the
 * kernel waiting for pending kaio requests to complete.
 */
_kaio_cleanup_thread(void *arg)

    if (pthread_setspecific(_aio_key, arg) != 0)
        aio_panic("_kaio_cleanup_thread, pthread_setspecific()");
    (void) _kaio(AIOSTART);

    lmutex_lock(&__aio_initlock);
    (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
    while (__aio_initbusy)
        (void) cond_wait(&__aio_initcv, &__aio_initlock);
    (void) pthread_setcancelstate(cancel_state, NULL);
    if (_kaio_ok) {    /* already initialized */
        lmutex_unlock(&__aio_initlock);

    lmutex_unlock(&__aio_initlock);

    if (_kaio_supported_init() != 0)

    else if ((_kaiowp = _aio_worker_alloc()) == NULL)

    else if ((error = (int)_kaio(AIOINIT)) == 0) {
        (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
        error = thr_create(NULL, AIOSTKSIZE, _kaio_cleanup_thread,
            _kaiowp, THR_DAEMON, &_kaiowp->work_tid);
        (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);

    if (error && _kaiowp != NULL) {
        _aio_worker_free(_kaiowp);

    lmutex_lock(&__aio_initlock);

    (void) cond_broadcast(&__aio_initcv);
    lmutex_unlock(&__aio_initlock);
aioread(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)

    return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOREAD));

aiowrite(int fd, caddr_t buf, int bufsz, off_t offset, int whence,
    aio_result_t *resultp)

    return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOWRITE));

aioread64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)

    return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAREAD64));

aiowrite64(int fd, caddr_t buf, int bufsz, off64_t offset, int whence,
    aio_result_t *resultp)

    return (_aiorw(fd, buf, bufsz, offset, whence, resultp, AIOAWRITE64));

#endif  /* !defined(_LP64) */
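
/*
 * Illustrative usage sketch (not part of this library): how an application
 * typically drives the Solaris-style aioread()/aiowait() interface that the
 * wrappers above implement.  The function and variable names below
 * (example_aioread, donep) are hypothetical; the calls themselves are the
 * documented aioread(3AIO)/aiowait(3AIO) interfaces.
 */
#if 0
#include <sys/asynch.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>

static void
example_aioread(int fd)
{
    static char buf[8192];
    aio_result_t result;
    aio_result_t *donep;

    /* Queue an asynchronous read of the first 8K of the file. */
    if (aioread(fd, buf, sizeof (buf), (off_t)0, SEEK_SET, &result) == -1) {
        perror("aioread");
        return;
    }

    /* Block until some outstanding request completes (NULL: no timeout). */
    donep = aiowait(NULL);
    if (donep != NULL && donep != (aio_result_t *)-1)
        (void) printf("read %ld bytes, errno %d\n",
            (long)donep->aio_return, donep->aio_errno);
}
#endif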
_aiorw(int fd, caddr_t buf, int bufsz, offset_t offset, int whence,
    aio_result_t *resultp, int mode)

    struct stat64 stat64;

        if ((loffset = llseek(fd, 0, SEEK_CUR)) == -1)

        if (fstat64(fd, &stat64) == -1)

        loffset = offset + stat64.st_size;

    /* initialize kaio */

    /*
     * _aio_do_request() needs the original request code (mode) to be able
     * to choose the appropriate 32/64 bit function.  All other functions
     * only require the difference between READ and WRITE (umode).
     */
    if (mode == AIOAREAD64 || mode == AIOAWRITE64)
        umode = mode - AIOAREAD64;

    /*
     * Try kernel aio first.
     * If errno is ENOTSUP/EBADFD, fall back to the thread implementation.
     */
    if (_kaio_ok > 0 && KAIO_SUPPORTED(fd)) {
        resultp->aio_errno = 0;
        sig_mutex_lock(&__aio_mutex);
        _kaio_outstand_cnt++;
        sig_mutex_unlock(&__aio_mutex);
        kerr = (int)_kaio(((resultp->aio_return == AIO_INPROGRESS) ?
            (umode | AIO_POLL_BIT) : umode),
            fd, buf, bufsz, loffset, resultp);

        sig_mutex_lock(&__aio_mutex);
        _kaio_outstand_cnt--;
        sig_mutex_unlock(&__aio_mutex);
        if (errno != ENOTSUP && errno != EBADFD)

        SET_KAIO_NOT_SUPPORTED(fd);

    if (!__uaio_ok && __uaio_init() == -1)

    if ((reqp = _aio_req_alloc()) == NULL) {

    /*
     * _aio_do_request() checks reqp->req_op to differentiate
     * between 32 and 64 bit access.
     */

    reqp->req_resultp = resultp;
    ap = &reqp->req_args;

    ap->offset = loffset;

    if (_aio_hash_insert(resultp, reqp) != 0) {

    /*
     * _aio_req_add() only needs the difference between READ and
     * WRITE to choose the right worker queue.
     */
    _aio_req_add(reqp, &__nextworker_rw, umode);
aiocancel(aio_result_t *resultp)

    sig_mutex_lock(&__aio_mutex);
    reqp = _aio_hash_find(resultp);

        if (_aio_outstand_cnt == _aio_req_done_cnt)

        aiowp = reqp->req_worker;
        sig_mutex_lock(&aiowp->work_qlock1);
        (void) _aio_cancel_req(aiowp, reqp, &canceled, &done);
        sig_mutex_unlock(&aiowp->work_qlock1);

            if (_aio_outstand_cnt == 0 ||
                _aio_outstand_cnt == _aio_req_done_cnt)

    sig_mutex_unlock(&__aio_mutex);

_aiowait_cleanup(void *arg)

    sig_mutex_lock(&__aio_mutex);

    sig_mutex_unlock(&__aio_mutex);
/*
 * This must be asynch safe and cancel safe
 */
aiowait(struct timeval *uwait)

    aio_result_t *uresultp;
    aio_result_t *kresultp;
    aio_result_t *resultp;

    struct timeval twait;
    struct timeval *wait = NULL;

        /*
         * Check for a valid specified wait time.
         * If it is invalid, fail the call right away.
         */
        if (uwait->tv_sec < 0 || uwait->tv_usec < 0 ||
            uwait->tv_usec >= MICROSEC) {

            return ((aio_result_t *)-1);

        if (uwait->tv_sec > 0 || uwait->tv_usec > 0) {
            hrtend = gethrtime() +
                (hrtime_t)uwait->tv_sec * NANOSEC +
                (hrtime_t)uwait->tv_usec * (NANOSEC / MICROSEC);

            sig_mutex_lock(&__aio_mutex);
            if (_kaio_outstand_cnt == 0) {
                kresultp = (aio_result_t *)-1;

                kresultp = (aio_result_t *)_kaio(AIOWAIT,
                    (struct timeval *)-1, 1);
                if (kresultp != (aio_result_t *)-1 &&
                    kresultp != (aio_result_t *)1) {
                    _kaio_outstand_cnt--;
                    sig_mutex_unlock(&__aio_mutex);

            uresultp = _aio_req_done();
            sig_mutex_unlock(&__aio_mutex);
            if (uresultp != NULL &&
                uresultp != (aio_result_t *)-1) {

            if (uresultp == (aio_result_t *)-1 &&
                kresultp == (aio_result_t *)-1) {

                return ((aio_result_t *)-1);

        sig_mutex_lock(&__aio_mutex);
        uresultp = _aio_req_done();
        if (uresultp != NULL && uresultp != (aio_result_t *)-1) {
            sig_mutex_unlock(&__aio_mutex);

        dontblock = (uresultp == (aio_result_t *)-1);
        if (dontblock && _kaio_outstand_cnt == 0) {
            kresultp = (aio_result_t *)-1;

            sig_mutex_unlock(&__aio_mutex);
            pthread_cleanup_push(_aiowait_cleanup, NULL);

            kresultp = (aio_result_t *)_kaio(AIOWAIT,

            pthread_cleanup_pop(0);
            sig_mutex_lock(&__aio_mutex);

        sig_mutex_unlock(&__aio_mutex);
        if (kresultp == (aio_result_t *)1) {
            /* aiowait() awakened by an aionotify() */

        } else if (kresultp != NULL &&
            kresultp != (aio_result_t *)-1) {

            sig_mutex_lock(&__aio_mutex);
            _kaio_outstand_cnt--;
            sig_mutex_unlock(&__aio_mutex);

        } else if (kresultp == (aio_result_t *)-1 &&
            kaio_errno == EINVAL &&
            uresultp == (aio_result_t *)-1) {

            resultp = (aio_result_t *)-1;

        } else if (kresultp == (aio_result_t *)-1 &&
            kaio_errno == EINTR) {

            resultp = (aio_result_t *)-1;

        } else if (timedwait) {
            hres = hrtend - gethrtime();

                /* time is up; return */

                /*
                 * Some time left.  Round up the remaining time
                 * in nanoseconds to microsec.  Retry the call.
                 */
                hres += (NANOSEC / MICROSEC) - 1;
                wait->tv_sec = hres / NANOSEC;
                    (hres % NANOSEC) / (NANOSEC / MICROSEC);

            ASSERT(kresultp == NULL && uresultp == NULL);
/*
 * _aio_get_timedelta calculates the remaining time and stores the result
 * into timespec_t *wait.
 */
_aio_get_timedelta(timespec_t *end, timespec_t *wait)

    (void) gettimeofday(&cur, NULL);
    curtime.tv_sec = cur.tv_sec;
    curtime.tv_nsec = cur.tv_usec * 1000;    /* convert us to ns */

    if (end->tv_sec >= curtime.tv_sec) {
        wait->tv_sec = end->tv_sec - curtime.tv_sec;
        if (end->tv_nsec >= curtime.tv_nsec) {
            wait->tv_nsec = end->tv_nsec - curtime.tv_nsec;
            if (wait->tv_sec == 0 && wait->tv_nsec == 0)
                ret = -1;    /* timer expired */

            if (end->tv_sec > curtime.tv_sec) {

                wait->tv_nsec = NANOSEC -
                    (curtime.tv_nsec - end->tv_nsec);

                ret = -1;    /* timer expired */
/*
 * If closing by file descriptor: we will simply cancel all the outstanding
 * aio's and return.  Those aio's in question will have either noticed the
 * cancellation notice before, during, or after initiating io.
 */
aiocancel_all(int fd)

    aio_req_t **reqpp, *last;

    sig_mutex_lock(&__aio_mutex);

    if (_aio_outstand_cnt == 0) {
        sig_mutex_unlock(&__aio_mutex);
        return (AIO_ALLDONE);

    /*
     * Cancel requests from the read/write workers' queues.
     */
    first = __nextworker_rw;

        _aio_cancel_work(next, fd, &canceled, &done);
    } while ((next = next->work_forw) != first);

    /*
     * Finally, check if there are requests on the done queue that
     * should be canceled.
     */

        reqpp = &_aio_done_tail;
        last = _aio_done_tail;
        while ((reqp = *reqpp) != NULL) {
            if (cancelall || reqp->req_args.fd == fd) {
                *reqpp = reqp->req_next;

                    last = reqp->req_next;

                if (_aio_done_head == reqp) {
                    /* this should be the last req in list */
                    _aio_done_head = last;

                _aio_set_result(reqp, -1, ECANCELED);
                (void) _aio_hash_del(reqp->req_resultp);

                reqpp = &reqp->req_next;

            ASSERT(_aio_donecnt == 0);
            _aio_done_head = NULL;

    sig_mutex_unlock(&__aio_mutex);

    if (canceled && done == 0)
        return (AIO_CANCELED);
    else if (done && canceled == 0)
        return (AIO_ALLDONE);
    else if ((canceled + done == 0) && KAIO_SUPPORTED(fd))
        return ((int)_kaio(AIOCANCEL, fd, NULL));
    return (AIO_NOTCANCELED);
/*
 * Cancel requests from a given work queue.  If the file descriptor
 * parameter, fd, is non-negative, then only cancel those requests
 * in this queue that are to this file descriptor.  If the fd
 * parameter is -1, then cancel all requests.
 */
_aio_cancel_work(aio_worker_t *aiowp, int fd, int *canceled, int *done)

    sig_mutex_lock(&aiowp->work_qlock1);
    /*
     * Cancel queued requests first.
     */
    reqp = aiowp->work_tail1;
    while (reqp != NULL) {
        if (fd < 0 || reqp->req_args.fd == fd) {
            if (_aio_cancel_req(aiowp, reqp, canceled, done)) {
                /*
                 * Caller's locks were dropped.
                 * reqp is invalid; start traversing
                 * the list from the beginning again.
                 */
                reqp = aiowp->work_tail1;

            reqp = reqp->req_next;

    /*
     * Since the queued requests have been canceled, there can
     * only be one in-progress request that should be canceled.
     */
    if ((reqp = aiowp->work_req) != NULL &&
        (fd < 0 || reqp->req_args.fd == fd))
        (void) _aio_cancel_req(aiowp, reqp, canceled, done);
    sig_mutex_unlock(&aiowp->work_qlock1);
/*
 * Cancel a request.  Return 1 if the caller's locks were temporarily
 * dropped, otherwise return 0.
 */
_aio_cancel_req(aio_worker_t *aiowp, aio_req_t *reqp, int *canceled, int *done)

    int ostate = reqp->req_state;

    ASSERT(MUTEX_HELD(&__aio_mutex));
    ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
    if (ostate == AIO_REQ_CANCELED)

    if (ostate == AIO_REQ_DONE && !POSIX_AIO(reqp) &&
        aiowp->work_prev1 == reqp) {
        ASSERT(aiowp->work_done1 != 0);
        /*
         * If not on the done queue yet, just mark it CANCELED;
         * _aio_work_done() will do the necessary clean up.
         * This is required to ensure that aiocancel_all() cancels
         * all the outstanding requests, including this one which
         * is not yet on the done queue but has been marked done.
         */
        _aio_set_result(reqp, -1, ECANCELED);
        (void) _aio_hash_del(reqp->req_resultp);
        reqp->req_state = AIO_REQ_CANCELED;

    if (ostate == AIO_REQ_DONE || ostate == AIO_REQ_DONEQ) {

    if (reqp->req_op == AIOFSYNC && reqp != aiowp->work_req) {
        ASSERT(POSIX_AIO(reqp));
        /* Cancel the queued aio_fsync() request */
        if (!reqp->req_head->lio_canned) {
            reqp->req_head->lio_canned = 1;

    reqp->req_state = AIO_REQ_CANCELED;
    _aio_req_del(aiowp, reqp, ostate);
    (void) _aio_hash_del(reqp->req_resultp);

    if (reqp == aiowp->work_req) {
        ASSERT(ostate == AIO_REQ_INPROGRESS);
        /*
         * Set the result values now, before _aiodone() is called.
         * We do this because the application can expect aio_return
         * and aio_errno to be set to -1 and ECANCELED, respectively,
         * immediately after a successful return from aiocancel().
         */
        _aio_set_result(reqp, -1, ECANCELED);
        (void) thr_kill(aiowp->work_tid, SIGAIOCANCEL);

    if (!POSIX_AIO(reqp)) {

        _aio_set_result(reqp, -1, ECANCELED);

    sig_mutex_unlock(&aiowp->work_qlock1);
    sig_mutex_unlock(&__aio_mutex);
    _aiodone(reqp, -1, ECANCELED);
    sig_mutex_lock(&__aio_mutex);
    sig_mutex_lock(&aiowp->work_qlock1);
_aio_create_worker(aio_req_t *reqp, int mode)

    aio_worker_t *aiowp, **workers, **nextworker;

    void *(*func)(void *);

    /*
     * Put the new worker thread in the right queue.
     */

        workers = &__workers_rw;
        nextworker = &__nextworker_rw;
        aio_workerscnt = &__rw_workerscnt;
        func = _aio_do_request;

        workers = &__workers_no;
        nextworker = &__nextworker_no;
        func = _aio_do_notify;
        aio_workerscnt = &__no_workerscnt;

        aio_panic("_aio_create_worker: invalid mode");

    if ((aiowp = _aio_worker_alloc()) == NULL)

        reqp->req_state = AIO_REQ_QUEUED;
        reqp->req_worker = aiowp;
        aiowp->work_head1 = reqp;
        aiowp->work_tail1 = reqp;
        aiowp->work_next1 = reqp;
        aiowp->work_count1 = 1;
        aiowp->work_minload1 = 1;

    (void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
    error = thr_create(NULL, AIOSTKSIZE, func, aiowp,
        THR_DAEMON | THR_SUSPENDED, &aiowp->work_tid);
    (void) pthread_sigmask(SIG_SETMASK, &oset, NULL);

        reqp->req_worker = NULL;

        _aio_worker_free(aiowp);

    lmutex_lock(&__aio_mutex);

    if (*workers == NULL) {
        aiowp->work_forw = aiowp;
        aiowp->work_backw = aiowp;

        aiowp->work_backw = (*workers)->work_backw;
        aiowp->work_forw = (*workers);
        (*workers)->work_backw->work_forw = aiowp;
        (*workers)->work_backw = aiowp;

    lmutex_unlock(&__aio_mutex);

    (void) thr_continue(aiowp->work_tid);
/*
 * This is the worker's main routine.
 * The task of this function is to execute all queued requests;
 * once the last pending request is executed this function will block
 * in _aio_idle().  A new incoming request must wake up this thread.
 * Every worker has its own work queue.  The queue lock is required
 * to synchronize the addition of new requests for this worker or
 * cancellation of pending/running requests.
 *
 * Cancellation scenarios:
 * The cancellation of a request is being done asynchronously using
 * _aio_cancel_req() from another thread context.
 * A queued request can be cancelled in different manners:
 * a) request is queued but not "in progress" or "done" (AIO_REQ_QUEUED):
 *    - lock the queue -> remove the request -> unlock the queue
 *    - this function/thread does not detect this cancellation process
 * b) request is in progress (AIO_REQ_INPROGRESS):
 *    - this function first allows the cancellation of the running
 *      request with the flag "work_cancel_flg=1"
 *      see _aio_req_get() -> _aio_cancel_on()
 *      During this phase, it is allowed to interrupt the worker
 *      thread running the request (this thread) using the SIGAIOCANCEL
 *      signal.
 *      Once this thread returns from the kernel (because the request
 *      is just done), then it must disable a possible cancellation
 *      and proceed to finish the request.  To disable the cancellation
 *      this thread must use _aio_cancel_off() to set "work_cancel_flg=0".
 * c) request is already done (AIO_REQ_DONE || AIO_REQ_DONEQ):
 *      same procedure as in a)
 *
 * This thread uses sigsetjmp() to define the position in the code where
 * it wishes to continue working in the case that a SIGAIOCANCEL signal
 * is detected.
 * Normally this thread should get the cancellation signal during the
 * kernel phase (reading or writing).  In that case the signal handler
 * aiosigcancelhndlr() is activated using the worker thread context,
 * which again will use the siglongjmp() function to break the standard
 * code flow and jump to the "sigsetjmp" position, provided that
 * "work_cancel_flg" is set to "1".
 * Because the "work_cancel_flg" is only manipulated by this worker
 * thread and it can only run on one CPU at a given time, it is not
 * necessary to protect that flag with the queue lock.
 * Returning from the kernel (read or write system call) we must
 * first disable the use of the SIGAIOCANCEL signal and accordingly
 * the use of the siglongjmp() function to prevent a possible deadlock:
 * - It can happen that this worker thread returns from the kernel and
 *   blocks in "work_qlock1",
 * - then a second thread cancels the apparently "in progress" request
 *   and sends the SIGAIOCANCEL signal to the worker thread,
 * - the worker thread gets assigned the "work_qlock1" and will return,
 * - the kernel detects the pending signal and activates the signal
 *   handler,
 * - if the "work_cancel_flg" is still set then the signal handler
 *   should use siglongjmp() to cancel the "in progress" request and
 *   it would try to acquire the same work_qlock1 in _aio_req_get()
 *   for a second time => deadlock.
 * To avoid that situation we disable the cancellation of the request
 * in progress BEFORE we try to acquire the work_qlock1.
 * In that case the signal handler will not call siglongjmp() and the
 * worker thread will continue running the standard code flow.
 * Then this thread must check the AIO_REQ_CANCELED flag to emulate
 * a potentially required siglongjmp(), freeing the work_qlock1 and
 * avoiding a deadlock.
 */
_aio_do_request(void *arglist)

    aio_worker_t *aiowp = (aio_worker_t *)arglist;
    ulwp_t *self = curthread;
    struct aio_args *arg;
    aio_req_t *reqp;        /* current AIO request */

    if (pthread_setspecific(_aio_key, aiowp) != 0)
        aio_panic("_aio_do_request, pthread_setspecific()");
    (void) pthread_sigmask(SIG_SETMASK, &_worker_set, NULL);
    ASSERT(aiowp->work_req == NULL);

    /*
     * We resume here when an operation is cancelled.
     * On first entry, aiowp->work_req == NULL, so all
     * we do is block SIGAIOCANCEL.
     */
    (void) sigsetjmp(aiowp->work_jmp_buf, 0);
    ASSERT(self->ul_sigdefer == 0);

    sigoff(self);    /* block SIGAIOCANCEL */
    if (aiowp->work_req != NULL)
        _aio_finish_request(aiowp, -1, ECANCELED);

        /*
         * Put completed requests on aio_done_list.  This has
         * to be done as part of the main loop to ensure that
         * we don't artificially starve any aiowait'ers.
         */
        if (aiowp->work_done1)
            _aio_work_done(aiowp);

        /* consume any deferred SIGAIOCANCEL signal here */

        while ((reqp = _aio_req_get(aiowp)) == NULL) {
            if (_aio_idle(aiowp) != 0)

        arg = &reqp->req_args;
        ASSERT(reqp->req_state == AIO_REQ_INPROGRESS ||
            reqp->req_state == AIO_REQ_CANCELED);

        switch (reqp->req_op) {

            sigon(self);    /* unblock SIGAIOCANCEL */
            retval = pread(arg->fd, arg->buf,
                arg->bufsz, arg->offset);

                if (errno == ESPIPE) {
                    retval = read(arg->fd,
                        arg->buf, arg->bufsz);

            sigoff(self);    /* block SIGAIOCANCEL */

            /*
             * The SUSv3 POSIX spec for aio_write() states:
             *    If O_APPEND is set for the file descriptor,
             *    write operations append to the file in the
             *    same order as the calls were made.
             * but, somewhat inconsistently, it requires pwrite()
             * to ignore the O_APPEND setting.  So we have to use
             * fcntl() to get the open modes and call write() for
             * the O_APPEND case.
             */
            append = (__fcntl(arg->fd, F_GETFL) & O_APPEND);
            sigon(self);    /* unblock SIGAIOCANCEL */

                write(arg->fd, arg->buf, arg->bufsz) :
                pwrite(arg->fd, arg->buf, arg->bufsz,

                if (errno == ESPIPE) {
                    retval = write(arg->fd,
                        arg->buf, arg->bufsz);

            sigoff(self);    /* block SIGAIOCANCEL */

            sigon(self);    /* unblock SIGAIOCANCEL */
            retval = pread64(arg->fd, arg->buf,
                arg->bufsz, arg->offset);

                if (errno == ESPIPE) {
                    retval = read(arg->fd,
                        arg->buf, arg->bufsz);

            sigoff(self);    /* block SIGAIOCANCEL */

            /*
             * The SUSv3 POSIX spec for aio_write() states:
             *    If O_APPEND is set for the file descriptor,
             *    write operations append to the file in the
             *    same order as the calls were made.
             * but, somewhat inconsistently, it requires pwrite()
             * to ignore the O_APPEND setting.  So we have to use
             * fcntl() to get the open modes and call write() for
             * the O_APPEND case.
             */
            append = (__fcntl(arg->fd, F_GETFL) & O_APPEND);
            sigon(self);    /* unblock SIGAIOCANCEL */

                write(arg->fd, arg->buf, arg->bufsz) :
                pwrite64(arg->fd, arg->buf, arg->bufsz,

                if (errno == ESPIPE) {
                    retval = write(arg->fd,
                        arg->buf, arg->bufsz);

            sigoff(self);    /* block SIGAIOCANCEL */

#endif  /* !defined(_LP64) */

            if (_aio_fsync_del(aiowp, reqp))

            ASSERT(reqp->req_head == NULL);

            /*
             * All writes for this fsync request are now
             * acknowledged.  Now make these writes visible
             * and put the final request into the hash table.
             */
            if (reqp->req_state == AIO_REQ_CANCELED) {

            } else if (arg->offset == O_SYNC) {
                if ((retval = __fdsync(arg->fd, O_SYNC)) == -1)

                if ((retval = __fdsync(arg->fd, O_DSYNC)) == -1)

            if (_aio_hash_insert(reqp->req_resultp, reqp) != 0)
                aio_panic("_aio_do_request(): AIOFSYNC: "
                    "request already in hash table");

            aio_panic("_aio_do_request, bad op");

        _aio_finish_request(aiowp, retval, error);
/*
 * Perform the tail processing for _aio_do_request().
 * The in-progress request may or may not have been cancelled.
 */
_aio_finish_request(aio_worker_t *aiowp, ssize_t retval, int error)

    sig_mutex_lock(&aiowp->work_qlock1);
    if ((reqp = aiowp->work_req) == NULL)
        sig_mutex_unlock(&aiowp->work_qlock1);

        aiowp->work_req = NULL;
        if (reqp->req_state == AIO_REQ_CANCELED) {

        if (!POSIX_AIO(reqp)) {

            if (reqp->req_state == AIO_REQ_INPROGRESS) {
                reqp->req_state = AIO_REQ_DONE;
                _aio_set_result(reqp, retval, error);

            sig_mutex_unlock(&aiowp->work_qlock1);
            sig_mutex_lock(&__aio_mutex);
            /*
             * If it was canceled, this request will not be
             * added to done list.  Just free it.
             */
            if (error == ECANCELED) {
                _aio_outstand_cnt--;
                _aio_req_free(reqp);

                _aio_req_done_cnt++;

            /*
             * Notify any thread that may have blocked
             * because it saw an outstanding request.
             */

            if (_aio_outstand_cnt == 0 && _aiowait_flag) {

            sig_mutex_unlock(&__aio_mutex);

                (void) _kaio(AIONOTIFY);

            if (reqp->req_state == AIO_REQ_INPROGRESS)
                reqp->req_state = AIO_REQ_DONE;
            sig_mutex_unlock(&aiowp->work_qlock1);
            _aiodone(reqp, retval, error);
_aio_req_mark_done(aio_req_t *reqp)

    if (reqp->req_largefile)
        ((aiocb64_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;

        ((aiocb_t *)reqp->req_aiocbp)->aio_state = USERAIO_DONE;

/*
 * Sleep for 'ticks' clock ticks to give somebody else a chance to run,
 * hopefully to consume one of our queued signals.
 */
_aio_delay(int ticks)

    (void) usleep(ticks * (MICROSEC / hz));
/*
 * Actually send the notifications.
 * We could block indefinitely here if the application
 * is not listening for the signal or port notifications.
 */
send_notification(notif_param_t *npp)

    extern int __sigqueue(pid_t pid, int signo,
        /* const union sigval */ void *value, int si_code, int block);

        (void) __sigqueue(__pid, npp->np_signo, npp->np_user,

    else if (npp->np_port >= 0)
        (void) _port_dispatch(npp->np_port, 0, PORT_SOURCE_AIO,
            npp->np_event, npp->np_object, npp->np_user);

    if (npp->np_lio_signo)
        (void) __sigqueue(__pid, npp->np_lio_signo, npp->np_lio_user,

    else if (npp->np_lio_port >= 0)
        (void) _port_dispatch(npp->np_lio_port, 0, PORT_SOURCE_AIO,
            npp->np_lio_event, npp->np_lio_object, npp->np_lio_user);
/*
 * Asynchronous notification worker.
 */
_aio_do_notify(void *arg)

    aio_worker_t *aiowp = (aio_worker_t *)arg;

    /*
     * This isn't really necessary.  All signals are blocked.
     */
    if (pthread_setspecific(_aio_key, aiowp) != 0)
        aio_panic("_aio_do_notify, pthread_setspecific()");

    /*
     * Notifications are never cancelled.
     * All signals remain blocked, forever.
     */

        while ((reqp = _aio_req_get(aiowp)) == NULL) {
            if (_aio_idle(aiowp) != 0)
                aio_panic("_aio_do_notify: _aio_idle() failed");

        send_notification(&reqp->req_notify);
        _aio_req_free(reqp);
/*
 * Do the completion semantics for a request that was either canceled
 * by _aio_cancel_req() or was completed by _aio_do_request().
 */
_aiodone(aio_req_t *reqp, ssize_t retval, int error)

    aio_result_t *resultp = reqp->req_resultp;

    /*
     * We call _aiodone() only for Posix I/O.
     */
    ASSERT(POSIX_AIO(reqp));

    np.np_lio_signo = 0;
    np.np_lio_port = -1;

    switch (reqp->req_sigevent.sigev_notify) {

        aio_panic("_aiodone: improper sigev_notify");

    /*
     * Figure out the notification parameters while holding __aio_mutex.
     * Actually perform the notifications after dropping __aio_mutex.
     * This allows us to sleep for a long time (if the notifications
     * incur delays) without impeding other async I/O operations.
     */

    sig_mutex_lock(&__aio_mutex);

        if ((np.np_signo = reqp->req_sigevent.sigev_signo) != 0)

        np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;
    } else if (sigev_thread | sigev_port) {
        if ((np.np_port = reqp->req_sigevent.sigev_signo) >= 0)

        np.np_event = reqp->req_op;
        if (np.np_event == AIOFSYNC && reqp->req_largefile)
            np.np_event = AIOFSYNC64;
        np.np_object = (uintptr_t)reqp->req_aiocbp;
        np.np_user = reqp->req_sigevent.sigev_value.sival_ptr;

    if (resultp->aio_errno == EINPROGRESS)
        _aio_set_result(reqp, retval, error);

    _aio_outstand_cnt--;

    head = reqp->req_head;
    reqp->req_head = NULL;

        _aio_enq_doneq(reqp);

    (void) _aio_hash_del(resultp);
    _aio_req_mark_done(reqp);

    _aio_waitn_wakeup();

    /*
     * __aio_waitn() sets AIO_WAIT_INPROGRESS and
     * __aio_suspend() increments "_aio_kernel_suspend"
     * when they are waiting in the kernel for completed I/Os.
     *
     * _kaio(AIONOTIFY) awakes the corresponding function
     * in the kernel; then the corresponding __aio_waitn() or
     * __aio_suspend() function could reap the recently
     * completed I/Os (_aiodone()).
     */
    if ((_aio_flags & AIO_WAIT_INPROGRESS) || _aio_kernel_suspend > 0)
        (void) _kaio(AIONOTIFY);

    sig_mutex_unlock(&__aio_mutex);

        /*
         * If all the lio requests have completed,
         * prepare to notify the waiting thread.
         */
        sig_mutex_lock(&head->lio_mutex);
        ASSERT(head->lio_refcnt == head->lio_nent);
        if (head->lio_refcnt == 1) {

            if (head->lio_mode == LIO_WAIT) {
                if ((waiting = head->lio_waiting) != 0)
                    (void) cond_signal(&head->lio_cond_cv);
            } else if (head->lio_port < 0) {    /* none or signal */
                if ((np.np_lio_signo = head->lio_signo) != 0)

                np.np_lio_user = head->lio_sigval.sival_ptr;
            } else {    /* thread or port */

                np.np_lio_port = head->lio_port;
                np.np_lio_event = head->lio_event;
                    (uintptr_t)head->lio_sigevent;
                np.np_lio_user = head->lio_sigval.sival_ptr;

            head->lio_nent = head->lio_refcnt = 0;
            sig_mutex_unlock(&head->lio_mutex);

                _aio_lio_free(head);

            sig_mutex_unlock(&head->lio_mutex);

    /*
     * The request is completed; now perform the notifications.
     */

            /*
             * We usually put the request on the notification
             * queue because we don't want to block and delay
             * other operations behind us in the work queue.
             * Also we must never block on a cancel notification
             * because we are being called from an application
             * thread in this case and that could lead to deadlock
             * if no other thread is receiving notifications.
             */
            reqp->req_notify = np;
            reqp->req_op = AIONOTIFY;
            _aio_req_add(reqp, &__workers_no, AIONOTIFY);

            /*
             * We already put the request on the done queue,
             * so we can't queue it to the notification queue.
             * Just do the notification directly.
             */
            send_notification(&np);

        _aio_req_free(reqp);
/*
 * Delete fsync requests from list head until there is
 * only one left.  Return 0 when there is only one,
 * otherwise return a non-zero value.
 */
_aio_fsync_del(aio_worker_t *aiowp, aio_req_t *reqp)

    aio_lio_t *head = reqp->req_head;

    ASSERT(reqp == aiowp->work_req);
    sig_mutex_lock(&aiowp->work_qlock1);
    sig_mutex_lock(&head->lio_mutex);
    if (head->lio_refcnt > 1) {

        aiowp->work_req = NULL;
        sig_mutex_unlock(&head->lio_mutex);
        sig_mutex_unlock(&aiowp->work_qlock1);
        sig_mutex_lock(&__aio_mutex);
        _aio_outstand_cnt--;
        _aio_waitn_wakeup();
        sig_mutex_unlock(&__aio_mutex);
        _aio_req_free(reqp);

    ASSERT(head->lio_nent == 1 && head->lio_refcnt == 1);
    reqp->req_head = NULL;
    if (head->lio_canned)
        reqp->req_state = AIO_REQ_CANCELED;
    if (head->lio_mode == LIO_DESTROY) {
        aiowp->work_req = NULL;

    sig_mutex_unlock(&head->lio_mutex);
    sig_mutex_unlock(&aiowp->work_qlock1);

        _aio_lio_free(head);

        _aio_req_free(reqp);
/*
 * A worker is set idle when its work queue is empty.
 * The worker checks again that it has no more work
 * and then goes to sleep waiting for more work.
 */
_aio_idle(aio_worker_t *aiowp)

    sig_mutex_lock(&aiowp->work_qlock1);
    if (aiowp->work_count1 == 0) {
        ASSERT(aiowp->work_minload1 == 0);
        aiowp->work_idleflg = 1;
        /*
         * A cancellation handler is not needed here.
         * aio worker threads are never cancelled via pthread_cancel().
         */
        error = sig_cond_wait(&aiowp->work_idle_cv,
            &aiowp->work_qlock1);
        /*
         * The idle flag is normally cleared before worker is awakened
         * by aio_req_add().  On error (EINTR), we clear it ourself.
         */

            aiowp->work_idleflg = 0;

    sig_mutex_unlock(&aiowp->work_qlock1);
/*
 * A worker's completed AIO requests are placed onto a global
 * done queue.  The application is only sent a SIGIO signal if
 * the process has a handler enabled and it is not waiting via
 * aiowait().
 */
_aio_work_done(aio_worker_t *aiowp)

    sig_mutex_lock(&__aio_mutex);
    sig_mutex_lock(&aiowp->work_qlock1);
    reqp = aiowp->work_prev1;
    reqp->req_next = NULL;
    aiowp->work_done1 = 0;
    aiowp->work_tail1 = aiowp->work_next1;
    if (aiowp->work_tail1 == NULL)
        aiowp->work_head1 = NULL;
    aiowp->work_prev1 = NULL;
    _aio_outstand_cnt--;
    _aio_req_done_cnt--;
    if (reqp->req_state == AIO_REQ_CANCELED) {
        /*
         * Request got cancelled after it was marked done.  This can
         * happen because _aio_finish_request() marks it AIO_REQ_DONE
         * and drops all locks.  Don't add the request to the done
         * queue and just discard it.
         */
        sig_mutex_unlock(&aiowp->work_qlock1);
        _aio_req_free(reqp);
        if (_aio_outstand_cnt == 0 && _aiowait_flag) {
            sig_mutex_unlock(&__aio_mutex);
            (void) _kaio(AIONOTIFY);

            sig_mutex_unlock(&__aio_mutex);

    sig_mutex_unlock(&aiowp->work_qlock1);

    ASSERT(_aio_donecnt > 0 &&
        _aio_outstand_cnt >= 0 &&
        _aio_req_done_cnt >= 0);
    ASSERT(reqp != NULL);

    if (_aio_done_tail == NULL) {
        _aio_done_head = _aio_done_tail = reqp;

        _aio_done_head->req_next = reqp;
        _aio_done_head = reqp;

    if (_aiowait_flag) {
        sig_mutex_unlock(&__aio_mutex);
        (void) _kaio(AIONOTIFY);

        sig_mutex_unlock(&__aio_mutex);

            (void) kill(__pid, SIGIO);
/*
 * The done queue consists of AIO requests that are in either the
 * AIO_REQ_DONE or AIO_REQ_CANCELED state.  Requests that were cancelled
 * are discarded.  If the done queue is empty then NULL is returned.
 * Otherwise the address of a done aio_result_t is returned.
 */

    aio_result_t *resultp;

    ASSERT(MUTEX_HELD(&__aio_mutex));

    if ((reqp = _aio_done_tail) != NULL) {
        if ((_aio_done_tail = reqp->req_next) == NULL)
            _aio_done_head = NULL;
        ASSERT(_aio_donecnt > 0);

        (void) _aio_hash_del(reqp->req_resultp);
        resultp = reqp->req_resultp;
        ASSERT(reqp->req_state == AIO_REQ_DONE);
        _aio_req_free(reqp);

    /* is queue empty? */
    if (reqp == NULL && _aio_outstand_cnt == 0) {
        return ((aio_result_t *)-1);
/*
 * Set the return and errno values for the application's use.
 *
 * For the Posix interfaces, we must set the return value first followed
 * by the errno value because the Posix interfaces allow for a change
 * in the errno value from EINPROGRESS to something else to signal
 * the completion of the asynchronous request.
 *
 * The opposite is true for the Solaris interfaces.  These allow for
 * a change in the return value from AIO_INPROGRESS to something else
 * to signal the completion of the asynchronous request.
 */
_aio_set_result(aio_req_t *reqp, ssize_t retval, int error)

    aio_result_t *resultp = reqp->req_resultp;

    if (POSIX_AIO(reqp)) {
        resultp->aio_return = retval;

        resultp->aio_errno = error;

        resultp->aio_errno = error;

        resultp->aio_return = retval;
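
/*
 * Illustrative sketch (not part of this library) of why the store order in
 * _aio_set_result() matters: a caller polling for completion reads the
 * "completion" field first and only then trusts the other one, so the
 * library must update them in the opposite order.  The helper names below
 * are hypothetical; "result" is an aio_result_t being polled.
 */
#if 0
#include <errno.h>
#include <sys/asynch.h>

static int
posix_poll(aio_result_t *result)
{
    /*
     * POSIX callers treat a change of aio_errno away from EINPROGRESS
     * as "done", so aio_return must already hold its final value.
     */
    if (result->aio_errno == EINPROGRESS)
        return (0);        /* still pending */
    return (1);            /* aio_return is now valid */
}

static int
solaris_poll(aio_result_t *result)
{
    /*
     * Solaris callers treat a change of aio_return away from
     * AIO_INPROGRESS as "done", so aio_errno must be stored first.
     */
    if (result->aio_return == AIO_INPROGRESS)
        return (0);        /* still pending */
    return (1);            /* aio_errno is now valid */
}
#endif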
/*
 * Add an AIO request onto the next work queue.
 * A circular list of workers is used to choose the next worker.
 */
_aio_req_add(aio_req_t *reqp, aio_worker_t **nextworker, int mode)

    ulwp_t *self = curthread;
    aio_worker_t *aiowp;
    aio_worker_t *first;
    int load_bal_flg = 1;

    ASSERT(reqp->req_state != AIO_REQ_DONEQ);
    reqp->req_next = NULL;
    /*
     * Try to acquire the next worker's work queue.  If it is locked,
     * then search the list of workers until a queue is found unlocked,
     * or until the list is completely traversed at which point another
     * worker will be created.
     */
    sigoff(self);    /* defer SIGIO */
    sig_mutex_lock(&__aio_mutex);
    first = aiowp = *nextworker;
    if (mode != AIONOTIFY)
        _aio_outstand_cnt++;
    sig_mutex_unlock(&__aio_mutex);

        /* try to find an idle worker */

            if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {
                if (aiowp->work_idleflg) {

                sig_mutex_unlock(&aiowp->work_qlock1);

        } while ((aiowp = aiowp->work_forw) != first);

            aiowp->work_minload1++;

        /* try to acquire some worker's queue lock */

            if (sig_mutex_trylock(&aiowp->work_qlock1) == 0) {

        } while ((aiowp = aiowp->work_forw) != first);

        /*
         * Create more workers when the workers appear overloaded.
         * Either all the workers are busy draining their queues
         * or no worker's queue lock could be acquired.
         */

            if (_aio_worker_cnt < _max_workers) {
                if (_aio_create_worker(reqp, mode))
                    aio_panic("_aio_req_add: add worker");
                sigon(self);    /* reenable SIGIO */

            /*
             * No worker available and we have created
             * _max_workers, keep going through the
             * list slowly until we get a lock
             */
            while (sig_mutex_trylock(&aiowp->work_qlock1) != 0) {
                /*
                 * give someone else a chance
                 */

                aiowp = aiowp->work_forw;

        ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
        if (_aio_worker_cnt < _max_workers &&
            aiowp->work_minload1 >= _minworkload) {
            sig_mutex_unlock(&aiowp->work_qlock1);
            sig_mutex_lock(&__aio_mutex);
            *nextworker = aiowp->work_forw;
            sig_mutex_unlock(&__aio_mutex);
            if (_aio_create_worker(reqp, mode))
                aio_panic("aio_req_add: add worker");
            sigon(self);    /* reenable SIGIO */

        aiowp->work_minload1++;

        sig_mutex_lock(&aiowp->work_qlock1);

        aio_panic("_aio_req_add: invalid mode");

    /*
     * Put request onto worker's work queue.
     */
    if (aiowp->work_tail1 == NULL) {
        ASSERT(aiowp->work_count1 == 0);
        aiowp->work_tail1 = reqp;
        aiowp->work_next1 = reqp;

        aiowp->work_head1->req_next = reqp;
        if (aiowp->work_next1 == NULL)
            aiowp->work_next1 = reqp;

    reqp->req_state = AIO_REQ_QUEUED;
    reqp->req_worker = aiowp;
    aiowp->work_head1 = reqp;
    /*
     * Awaken worker if it is not currently active.
     */
    if (aiowp->work_count1++ == 0 && aiowp->work_idleflg) {
        aiowp->work_idleflg = 0;
        (void) cond_signal(&aiowp->work_idle_cv);

    sig_mutex_unlock(&aiowp->work_qlock1);

        sig_mutex_lock(&__aio_mutex);
        *nextworker = aiowp->work_forw;
        sig_mutex_unlock(&__aio_mutex);

    sigon(self);    /* reenable SIGIO */
/*
 * Get an AIO request for a specified worker.
 * If the work queue is empty, return NULL.
 */
_aio_req_get(aio_worker_t *aiowp)

    sig_mutex_lock(&aiowp->work_qlock1);
    if ((reqp = aiowp->work_next1) != NULL) {
        /*
         * Remove a POSIX request from the queue; the
         * request queue is a singly linked list
         * with a previous pointer.  The request is
         * removed by updating the previous pointer.
         *
         * Non-posix requests are left on the queue
         * to eventually be placed on the done queue.
         */
        if (POSIX_AIO(reqp)) {
            if (aiowp->work_prev1 == NULL) {
                aiowp->work_tail1 = reqp->req_next;
                if (aiowp->work_tail1 == NULL)
                    aiowp->work_head1 = NULL;

                aiowp->work_prev1->req_next = reqp->req_next;
                if (aiowp->work_head1 == reqp)
                    aiowp->work_head1 = reqp->req_next;

            aiowp->work_prev1 = reqp;
            ASSERT(aiowp->work_done1 >= 0);
            aiowp->work_done1++;

        ASSERT(reqp != reqp->req_next);
        aiowp->work_next1 = reqp->req_next;
        ASSERT(aiowp->work_count1 >= 1);
        aiowp->work_count1--;
        switch (reqp->req_op) {

            ASSERT(aiowp->work_minload1 > 0);
            aiowp->work_minload1--;

        reqp->req_state = AIO_REQ_INPROGRESS;

    aiowp->work_req = reqp;
    ASSERT(reqp != NULL || aiowp->work_count1 == 0);
    sig_mutex_unlock(&aiowp->work_qlock1);
_aio_req_del(aio_worker_t *aiowp, aio_req_t *reqp, int ostate)

    ASSERT(aiowp != NULL);
    ASSERT(MUTEX_HELD(&aiowp->work_qlock1));
    if (POSIX_AIO(reqp)) {
        if (ostate != AIO_REQ_QUEUED)

    last = &aiowp->work_tail1;
    lastrp = aiowp->work_tail1;
    ASSERT(ostate == AIO_REQ_QUEUED || ostate == AIO_REQ_INPROGRESS);
    while ((next = *last) != NULL) {

            *last = next->req_next;
            if (aiowp->work_next1 == next)
                aiowp->work_next1 = next->req_next;

            /*
             * If this is the first request on the queue, move
             * the lastrp pointer forward.
             */

                lastrp = next->req_next;

            /*
             * If this request is pointed to by work_head1, then
             * make work_head1 point to the last request that is
             * present on the queue.
             */
            if (aiowp->work_head1 == next)
                aiowp->work_head1 = lastrp;

            /*
             * work_prev1 is used only in the non-POSIX case and it
             * points to the current AIO_REQ_INPROGRESS request.
             * If work_prev1 points to this request which is being
             * deleted, make work_prev1 NULL and set work_done1
             *
             * A worker thread can be processing only one request
             */
            if (aiowp->work_prev1 == next) {
                ASSERT(ostate == AIO_REQ_INPROGRESS &&
                    !POSIX_AIO(reqp) && aiowp->work_done1 > 0);
                aiowp->work_prev1 = NULL;
                aiowp->work_done1--;

            if (ostate == AIO_REQ_QUEUED) {
                ASSERT(aiowp->work_count1 >= 1);
                aiowp->work_count1--;
                ASSERT(aiowp->work_minload1 >= 1);
                aiowp->work_minload1--;

        last = &next->req_next;
_aio_enq_doneq(aio_req_t *reqp)

    if (_aio_doneq == NULL) {

        reqp->req_next = reqp->req_prev = reqp;

        reqp->req_next = _aio_doneq;
        reqp->req_prev = _aio_doneq->req_prev;
        _aio_doneq->req_prev->req_next = reqp;
        _aio_doneq->req_prev = reqp;

    reqp->req_state = AIO_REQ_DONEQ;
/*
 * caller owns the _aio_mutex
 */
_aio_req_remove(aio_req_t *reqp)

    if (reqp && reqp->req_state != AIO_REQ_DONEQ)

        /* request in done queue */
        if (_aio_doneq == reqp)
            _aio_doneq = reqp->req_next;
        if (_aio_doneq == reqp) {
            /* only one request on queue */

            aio_req_t *tmp = reqp->req_next;
            reqp->req_prev->req_next = tmp;
            tmp->req_prev = reqp->req_prev;

    } else if ((reqp = _aio_doneq) != NULL) {
        if (reqp == reqp->req_next) {
            /* only one request on queue */

            reqp->req_prev->req_next = _aio_doneq = reqp->req_next;
            _aio_doneq->req_prev = reqp->req_prev;

        reqp->req_next = reqp->req_prev = reqp;
        reqp->req_state = AIO_REQ_DONE;
/*
 * An AIO request is identified by an aio_result_t pointer.  The library
 * maps this aio_result_t pointer to its internal representation using a
 * hash table.  This function adds an aio_result_t pointer to the hash table.
 */
_aio_hash_insert(aio_result_t *resultp, aio_req_t *reqp)

    hashp = _aio_hash + AIOHASH(resultp);
    lmutex_lock(&hashp->hash_lock);
    prev = &hashp->hash_ptr;
    while ((next = *prev) != NULL) {
        if (resultp == next->req_resultp) {
            lmutex_unlock(&hashp->hash_lock);

        prev = &next->req_link;

    ASSERT(reqp->req_link == NULL);
    lmutex_unlock(&hashp->hash_lock);

/*
 * Remove an entry from the hash table.
 */
_aio_hash_del(aio_result_t *resultp)

    aio_req_t *next = NULL;

    if (_aio_hash != NULL) {
        hashp = _aio_hash + AIOHASH(resultp);
        lmutex_lock(&hashp->hash_lock);
        prev = &hashp->hash_ptr;
        while ((next = *prev) != NULL) {
            if (resultp == next->req_resultp) {
                *prev = next->req_link;
                next->req_link = NULL;

            prev = &next->req_link;

        lmutex_unlock(&hashp->hash_lock);

/*
 * find an entry in the hash table
 */
_aio_hash_find(aio_result_t *resultp)

    aio_req_t *next = NULL;

    if (_aio_hash != NULL) {
        hashp = _aio_hash + AIOHASH(resultp);
        lmutex_lock(&hashp->hash_lock);
        prev = &hashp->hash_ptr;
        while ((next = *prev) != NULL) {
            if (resultp == next->req_resultp)

            prev = &next->req_link;

        lmutex_unlock(&hashp->hash_lock);
/*
 * AIO interface for POSIX
 */
_aio_rw(aiocb_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,

    if (aiocbp == NULL) {

    /* initialize kaio */

        aiocbp->aio_state = NOCHECK;

    /*
     * If we have been called because a list I/O
     * kaio() failed, we don't want to repeat the
     * system call.
     */

    if (flg & AIO_KAIO) {
        /*
         * Try kernel aio first.
         * If errno is ENOTSUP/EBADFD,
         * fall back to the thread implementation.
         */
        if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
            aiocbp->aio_resultp.aio_errno = EINPROGRESS;
            aiocbp->aio_state = CHECK;
            kerr = (int)_kaio(mode, aiocbp);

            if (errno != ENOTSUP && errno != EBADFD) {
                aiocbp->aio_resultp.aio_errno = errno;
                aiocbp->aio_resultp.aio_return = -1;
                aiocbp->aio_state = NOCHECK;

            if (errno == EBADFD)
                SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);

    aiocbp->aio_resultp.aio_errno = EINPROGRESS;
    aiocbp->aio_state = USERAIO;

    if (!__uaio_ok && __uaio_init() == -1)

    if ((reqp = _aio_req_alloc()) == NULL) {

    /*
     * If an LIO request, add the list head to the aio request
     */
    reqp->req_head = lio_head;
    reqp->req_type = AIO_POSIX_REQ;
    reqp->req_op = mode;
    reqp->req_largefile = 0;

    if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
        reqp->req_sigevent.sigev_notify = SIGEV_NONE;
    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
        reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
        reqp->req_sigevent.sigev_signo =
            aiocbp->aio_sigevent.sigev_signo;
        reqp->req_sigevent.sigev_value.sival_ptr =
            aiocbp->aio_sigevent.sigev_value.sival_ptr;
    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
        port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
        reqp->req_sigevent.sigev_notify = SIGEV_PORT;
        /*
         * Reuse the sigevent structure to contain the port number
         * and the user value.  Same for SIGEV_THREAD, below.
         */
        reqp->req_sigevent.sigev_signo =

        reqp->req_sigevent.sigev_value.sival_ptr =

    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
        reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
        /*
         * The sigevent structure contains the port number
         * and the user value.  Same for SIGEV_PORT, above.
         */
        reqp->req_sigevent.sigev_signo =
            aiocbp->aio_sigevent.sigev_signo;
        reqp->req_sigevent.sigev_value.sival_ptr =
            aiocbp->aio_sigevent.sigev_value.sival_ptr;

    reqp->req_resultp = &aiocbp->aio_resultp;
    reqp->req_aiocbp = aiocbp;
    ap = &reqp->req_args;
    ap->fd = aiocbp->aio_fildes;
    ap->buf = (caddr_t)aiocbp->aio_buf;
    ap->bufsz = aiocbp->aio_nbytes;
    ap->offset = aiocbp->aio_offset;

    if ((flg & AIO_NO_DUPS) &&
        _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
        aio_panic("_aio_rw(): request already in hash table");
        _aio_req_free(reqp);

    _aio_req_add(reqp, nextworker, mode);
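
/*
 * Illustrative usage sketch (not part of this library): the standard POSIX
 * aio_read()/aio_error()/aio_return() sequence that ultimately funnels into
 * _aio_rw() above.  The helper name example_posix_read is hypothetical; the
 * busy-wait is only for illustration, real callers would use aio_suspend()
 * or a sigevent notification.
 */
#if 0
#include <aio.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

static void
example_posix_read(int fd)
{
    static char buf[4096];
    struct aiocb cb;
    int err;

    (void) memset(&cb, 0, sizeof (cb));
    cb.aio_fildes = fd;
    cb.aio_buf = buf;
    cb.aio_nbytes = sizeof (buf);
    cb.aio_offset = 0;
    cb.aio_sigevent.sigev_notify = SIGEV_NONE;

    if (aio_read(&cb) != 0) {
        perror("aio_read");
        return;
    }
    while ((err = aio_error(&cb)) == EINPROGRESS)
        ;    /* poll until the request leaves EINPROGRESS */
    (void) printf("aio_error %d, aio_return %ld\n",
        err, (long)aio_return(&cb));
}
#endif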
/*
 * 64-bit AIO interface for POSIX
 */
_aio_rw64(aiocb64_t *aiocbp, aio_lio_t *lio_head, aio_worker_t **nextworker,

    if (aiocbp == NULL) {

    /* initialize kaio */

        aiocbp->aio_state = NOCHECK;

    /*
     * If we have been called because a list I/O
     * kaio() failed, we don't want to repeat the
     * system call.
     */

    if (flg & AIO_KAIO) {
        /*
         * Try kernel aio first.
         * If errno is ENOTSUP/EBADFD,
         * fall back to the thread implementation.
         */
        if (_kaio_ok > 0 && KAIO_SUPPORTED(aiocbp->aio_fildes)) {
            aiocbp->aio_resultp.aio_errno = EINPROGRESS;
            aiocbp->aio_state = CHECK;
            kerr = (int)_kaio(mode, aiocbp);

            if (errno != ENOTSUP && errno != EBADFD) {
                aiocbp->aio_resultp.aio_errno = errno;
                aiocbp->aio_resultp.aio_return = -1;
                aiocbp->aio_state = NOCHECK;

            if (errno == EBADFD)
                SET_KAIO_NOT_SUPPORTED(aiocbp->aio_fildes);

    aiocbp->aio_resultp.aio_errno = EINPROGRESS;
    aiocbp->aio_state = USERAIO;

    if (!__uaio_ok && __uaio_init() == -1)

    if ((reqp = _aio_req_alloc()) == NULL) {

    /*
     * If an LIO request, add the list head to the aio request
     */
    reqp->req_head = lio_head;
    reqp->req_type = AIO_POSIX_REQ;
    reqp->req_op = mode;
    reqp->req_largefile = 1;

    if (aiocbp->aio_sigevent.sigev_notify == SIGEV_NONE) {
        reqp->req_sigevent.sigev_notify = SIGEV_NONE;
    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
        reqp->req_sigevent.sigev_notify = SIGEV_SIGNAL;
        reqp->req_sigevent.sigev_signo =
            aiocbp->aio_sigevent.sigev_signo;
        reqp->req_sigevent.sigev_value.sival_ptr =
            aiocbp->aio_sigevent.sigev_value.sival_ptr;
    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_PORT) {
        port_notify_t *pn = aiocbp->aio_sigevent.sigev_value.sival_ptr;
        reqp->req_sigevent.sigev_notify = SIGEV_PORT;
        reqp->req_sigevent.sigev_signo =

        reqp->req_sigevent.sigev_value.sival_ptr =

    } else if (aiocbp->aio_sigevent.sigev_notify == SIGEV_THREAD) {
        reqp->req_sigevent.sigev_notify = SIGEV_THREAD;
        reqp->req_sigevent.sigev_signo =
            aiocbp->aio_sigevent.sigev_signo;
        reqp->req_sigevent.sigev_value.sival_ptr =
            aiocbp->aio_sigevent.sigev_value.sival_ptr;

    reqp->req_resultp = &aiocbp->aio_resultp;
    reqp->req_aiocbp = aiocbp;
    ap = &reqp->req_args;
    ap->fd = aiocbp->aio_fildes;
    ap->buf = (caddr_t)aiocbp->aio_buf;
    ap->bufsz = aiocbp->aio_nbytes;
    ap->offset = aiocbp->aio_offset;

    if ((flg & AIO_NO_DUPS) &&
        _aio_hash_insert(&aiocbp->aio_resultp, reqp) != 0) {
        aio_panic("_aio_rw64(): request already in hash table");
        _aio_req_free(reqp);

    _aio_req_add(reqp, nextworker, mode);

#endif  /* !defined(_LP64) */