1 /* $NetBSD: sys_pipe.c,v 1.126 2009/12/15 18:35:18 dsl Exp $ */
4 * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Paul Kranenburg, and by Andrew Doran.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 1996 John S. Dyson
34 * All rights reserved.
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
39 * 1. Redistributions of source code must retain the above copyright
40 * notice immediately at the beginning of the file, without modification,
41 * this list of conditions, and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. Absolutely no warranty of function or purpose is made by the author
47 * 4. Modifications may be freely made to this file if the above conditions
52 * This file contains a high-performance replacement for the socket-based
53 * pipes scheme originally used. It does not support all features of
54 * sockets, but does do everything that pipes normally do.
56 * This code has two modes of operation, a small write mode and a large
57 * write mode. The small write mode acts like conventional pipes with
58 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
59 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
60 * and PIPE_SIZE in size it is mapped read-only into the kernel address space
61 * using the UVM page loan facility from where the receiving process can copy
62 * the data directly from the pages in the sending process.
64 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
65 * happen for small transfers so that the system will not spend all of
66 * its time context switching. PIPE_SIZE is constrained by the
67 * amount of kernel virtual memory.
70 #include <sys/cdefs.h>
71 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.126 2009/12/15 18:35:18 dsl Exp $");
73 #include <sys/param.h>
74 #include <sys/systm.h>
76 #include <sys/fcntl.h>
78 #include <sys/filedesc.h>
79 #include <sys/filio.h>
80 #include <sys/kernel.h>
81 #include <sys/ttycom.h>
84 #include <sys/signalvar.h>
85 #include <sys/vnode.h>
87 #include <sys/select.h>
88 #include <sys/mount.h>
89 #include <sys/syscallargs.h>
90 #include <sys/sysctl.h>
91 #include <sys/kauth.h>
92 #include <sys/atomic.h>
98 * Use this to disable direct I/O and decrease the code size:
99 * #define PIPE_NODIRECT
102 /* XXX Disabled for now; rare hangs switching between direct/buffered */
103 #define PIPE_NODIRECT
/*
 * Forward declarations of the file-operation handlers for pipe
 * descriptors.  Each of these is installed in the pipeops table that
 * follows.
 */
105 static int pipe_read(file_t
*, off_t
*, struct uio
*, kauth_cred_t
, int);
106 static int pipe_write(file_t
*, off_t
*, struct uio
*, kauth_cred_t
, int);
107 static int pipe_close(file_t
*);
108 static int pipe_poll(file_t
*, int);
109 static int pipe_kqfilter(file_t
*, struct knote
*);
110 static int pipe_stat(file_t
*, struct stat
*);
111 static int pipe_ioctl(file_t
*, u_long
, void *);
112 static void pipe_restart(file_t
*);
/*
 * File-operations table for pipe descriptors.  sys_pipe() stores the
 * address of this table in f_ops of both the read and the write file.
 * Every slot points at a pipe_* handler from this file except fo_fcntl,
 * which uses fnullop_fcntl.
 * NOTE(review): the closing brace of the initializer is not visible in
 * this excerpt.
 */
114 static const struct fileops pipeops
= {
115 .fo_read
= pipe_read
,
116 .fo_write
= pipe_write
,
117 .fo_ioctl
= pipe_ioctl
,
118 .fo_fcntl
= fnullop_fcntl
,
119 .fo_poll
= pipe_poll
,
120 .fo_stat
= pipe_stat
,
121 .fo_close
= pipe_close
,
122 .fo_kqfilter
= pipe_kqfilter
,
123 .fo_restart
= pipe_restart
,
127 * Default pipe buffer size(s), this can be kind-of large now because pipe
128 * space is pageable. The pipe code will try to maintain locality of
129 * reference for performance reasons, so small amounts of outstanding I/O
130 * will not wipe the cache.
/*
 * Buffer-size thresholds, KVA limits and usage counters.
 *
 * MINPIPESIZE is the fill level below which pipe_read() wakes writers.
 * maxpipekva, limitpipekva and maxbigpipes are registered read-write in
 * the kern.pipe sysctl setup at the end of this file.  nbigpipe and
 * amountpipekva are running counters, also registered there, and are
 * only modified through atomic_*() calls elsewhere in the file.
 */
132 #define MINPIPESIZE (PIPE_SIZE / 3)
133 #define MAXPIPESIZE (2 * PIPE_SIZE / 3)
136 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
137 * is there so that on large systems, we don't exhaust it.
139 #define MAXPIPEKVA (8 * 1024 * 1024)
140 static u_int maxpipekva
= MAXPIPEKVA
;
143 * Limit for direct transfers, we cannot, of course limit
144 * the amount of kva for pipes in general though.
146 #define LIMITPIPEKVA (16 * 1024 * 1024)
147 static u_int limitpipekva
= LIMITPIPEKVA
;
150 * Limit the number of "big" pipes
152 #define LIMITBIGPIPES 32
153 static u_int maxbigpipes
= LIMITBIGPIPES
;
154 static u_int nbigpipe
= 0;
157 * Amount of KVA consumed by pipe buffers.
159 static u_int amountpipekva
= 0;
/*
 * Prototypes for the internal helpers, followed by the two pool caches
 * that struct pipe objects are allocated from (one for the write side,
 * one for the read side).
 *
 * The pipe_direct_write and pipe_loan_* prototypes are guarded by
 * #ifndef PIPE_NODIRECT; PIPE_NODIRECT is defined near the top of this
 * file, so those declarations are compiled out as the file stands.
 * NOTE(review): the #endif that closes the first guard is not visible in
 * this excerpt.
 */
161 static void pipeclose(struct pipe
*);
162 static void pipe_free_kmem(struct pipe
*);
163 static int pipe_create(struct pipe
**, pool_cache_t
);
164 static int pipelock(struct pipe
*, int);
165 static inline void pipeunlock(struct pipe
*);
166 static void pipeselwakeup(struct pipe
*, struct pipe
*, int);
167 #ifndef PIPE_NODIRECT
168 static int pipe_direct_write(file_t
*, struct pipe
*, struct uio
*);
170 static int pipespace(struct pipe
*, int);
171 static int pipe_ctor(void *, void *, int);
172 static void pipe_dtor(void *, void *);
174 #ifndef PIPE_NODIRECT
175 static int pipe_loan_alloc(struct pipe
*, int);
176 static void pipe_loan_free(struct pipe
*);
177 #endif /* PIPE_NODIRECT */
179 static pool_cache_t pipe_wr_cache
;
180 static pool_cache_t pipe_rd_cache
;
/*
 * Creates the two pool caches used for struct pipe.  Both use pipe_ctor
 * and pipe_dtor; the only difference is the final argument handed to the
 * constructor and destructor: NULL for the write-side cache and
 * (void *)1 for the read-side cache.
 * NOTE(review): the header of the enclosing function is not visible in
 * this excerpt; presumably this is the body of the subsystem init
 * routine -- confirm against the full file.
 */
186 /* Writer side is not automatically allocated KVA. */
187 pipe_wr_cache
= pool_cache_init(sizeof(struct pipe
), 0, 0, 0, "pipewr",
188 NULL
, IPL_NONE
, pipe_ctor
, pipe_dtor
, NULL
);
189 KASSERT(pipe_wr_cache
!= NULL
);
191 /* Reader side gets preallocated KVA. */
192 pipe_rd_cache
= pool_cache_init(sizeof(struct pipe
), 0, 0, 0, "piperd",
193 NULL
, IPL_NONE
, pipe_ctor
, pipe_dtor
, (void *)1);
194 KASSERT(pipe_rd_cache
!= NULL
);
/*
 * Pool-cache constructor for struct pipe.
 *
 * Zeroes the object, initialises the four condition variables
 * (pipe_rcv, pipe_wcv, pipe_draincv, pipe_lkcv) and the select record,
 * and sets the initial state to PIPE_SIGNALR.
 *
 * The preallocation step reserves PIPE_SIZE bytes of pageable kernel VA,
 * stores the address in pipe_kmem and adds PIPE_SIZE to amountpipekva.
 * NOTE(review): the condition guarding the preallocation, the local
 * declarations and the return statement are not visible in this
 * excerpt.  Presumably the guard tests arg, which is NULL for the
 * write-side cache and non-NULL for the read-side cache -- confirm.
 */
198 pipe_ctor(void *arg
, void *obj
, int flags
)
205 memset(pipe
, 0, sizeof(struct pipe
));
207 /* Preallocate space. */
208 va
= uvm_km_alloc(kernel_map
, PIPE_SIZE
, 0,
209 UVM_KMF_PAGEABLE
| UVM_KMF_WAITVA
);
211 pipe
->pipe_kmem
= va
;
212 atomic_add_int(&amountpipekva
, PIPE_SIZE
);
214 cv_init(&pipe
->pipe_rcv
, "piperd");
215 cv_init(&pipe
->pipe_wcv
, "pipewr");
216 cv_init(&pipe
->pipe_draincv
, "pipedrain");
217 cv_init(&pipe
->pipe_lkcv
, "pipelk");
218 selinit(&pipe
->pipe_sel
);
219 pipe
->pipe_state
= PIPE_SIGNALR
;
/*
 * Pool-cache destructor for struct pipe; undoes pipe_ctor.
 *
 * Destroys the four condition variables and the select record.  If the
 * object owns preallocated KVA (pipe_kmem != 0) that range is returned
 * to kernel_map and PIPE_SIZE is subtracted from amountpipekva.
 * NOTE(review): the final argument of the uvm_km_free() call and the
 * closing braces are not visible in this excerpt.
 */
225 pipe_dtor(void *arg
, void *obj
)
231 cv_destroy(&pipe
->pipe_rcv
);
232 cv_destroy(&pipe
->pipe_wcv
);
233 cv_destroy(&pipe
->pipe_draincv
);
234 cv_destroy(&pipe
->pipe_lkcv
);
235 seldestroy(&pipe
->pipe_sel
);
236 if (pipe
->pipe_kmem
!= 0) {
237 uvm_km_free(kernel_map
, pipe
->pipe_kmem
, PIPE_SIZE
,
239 atomic_add_int(&amountpipekva
, -PIPE_SIZE
);
244 * The pipe system call for the DTYPE_PIPE type of pipes
/*
 * pipe(2) system call entry point.
 *
 * Visible steps:
 *  - create the read end from pipe_rd_cache and the write end from
 *    pipe_wr_cache via pipe_create();
 *  - allocate one mutex object and share it between both ends, taking an
 *    extra hold for the second reference;
 *  - allocate two file structures, mark each DTYPE_PIPE, attach the
 *    matching struct pipe as f_data and install pipeops;
 *  - cross-link the two ends through pipe_peer;
 *  - publish both descriptors with fd_affix().
 * The trailing fd_abort() on rf belongs to an error path.
 * NOTE(review): the error checks, the f_flag and retval assignments, the
 * cleanup labels and the return statements are not visible in this
 * excerpt.
 */
247 sys_pipe(struct lwp
*l
, const void *v
, register_t
*retval
)
249 struct pipe
*rpipe
, *wpipe
;
255 rpipe
= wpipe
= NULL
;
256 if (pipe_create(&rpipe
, pipe_rd_cache
) ||
257 pipe_create(&wpipe
, pipe_wr_cache
)) {
261 rpipe
->pipe_lock
= mutex_obj_alloc(MUTEX_DEFAULT
, IPL_NONE
);
262 wpipe
->pipe_lock
= rpipe
->pipe_lock
;
263 mutex_obj_hold(wpipe
->pipe_lock
);
265 error
= fd_allocfile(&rf
, &fd
);
270 rf
->f_type
= DTYPE_PIPE
;
271 rf
->f_data
= (void *)rpipe
;
272 rf
->f_ops
= &pipeops
;
274 error
= fd_allocfile(&wf
, &fd
);
279 wf
->f_type
= DTYPE_PIPE
;
280 wf
->f_data
= (void *)wpipe
;
281 wf
->f_ops
= &pipeops
;
283 rpipe
->pipe_peer
= wpipe
;
284 wpipe
->pipe_peer
= rpipe
;
286 fd_affix(p
, rf
, (int)retval
[0]);
287 fd_affix(p
, wf
, (int)retval
[1]);
290 fd_abort(p
, rf
, (int)retval
[0]);
299 * Allocate kva for pipe circular buffer, the space is pageable
300 * This routine will 'realloc' the size of a pipe safely, if it fails
301 * it will retain the old buffer.
302 * If it fails it will return ENOMEM.
/*
 * Give the pipe a circular buffer of the requested size.
 *
 * When size equals PIPE_SIZE and the pipe already owns preallocated KVA
 * (pipe_kmem), that range is reused.  The other visible path allocates
 * round_page(size) bytes of pageable KVA from kernel_map and adds size
 * to amountpipekva.  The previous buffer is then released with
 * pipe_free_kmem() and the buffer descriptor is reset to empty
 * (in = out = cnt = 0).
 * NOTE(review): the else keyword, the allocation-failure check and the
 * return statements are not visible in this excerpt.
 */
305 pipespace(struct pipe
*pipe
, int size
)
310 * Allocate pageable virtual address space. Physical memory is
311 * allocated on demand.
313 if (size
== PIPE_SIZE
&& pipe
->pipe_kmem
!= 0) {
314 buffer
= (void *)pipe
->pipe_kmem
;
316 buffer
= (void *)uvm_km_alloc(kernel_map
, round_page(size
),
317 0, UVM_KMF_PAGEABLE
);
320 atomic_add_int(&amountpipekva
, size
);
323 /* free old resources if we're resizing */
324 pipe_free_kmem(pipe
);
325 pipe
->pipe_buffer
.buffer
= buffer
;
326 pipe
->pipe_buffer
.size
= size
;
327 pipe
->pipe_buffer
.in
= 0;
328 pipe
->pipe_buffer
.out
= 0;
329 pipe
->pipe_buffer
.cnt
= 0;
334 * Initialize and allocate VM and memory for pipe.
/*
 * Allocate and initialise one end of a pipe from the given pool cache.
 *
 * The object is obtained with PR_WAITOK.  All three timestamps are set
 * to the current time and pipe_lock is cleared; the caller installs the
 * shared mutex afterwards.  An object from pipe_rd_cache gets a
 * PIPE_SIZE buffer via pipespace(); the other visible path leaves the
 * buffer descriptor empty (NULL buffer, zero size and counters).
 * NOTE(review): the else keyword, the store through pipep and the
 * return statement are not visible in this excerpt.
 */
337 pipe_create(struct pipe
**pipep
, pool_cache_t cache
)
342 pipe
= pool_cache_get(cache
, PR_WAITOK
);
343 KASSERT(pipe
!= NULL
);
346 getnanotime(&pipe
->pipe_btime
);
347 pipe
->pipe_atime
= pipe
->pipe_mtime
= pipe
->pipe_btime
;
348 pipe
->pipe_lock
= NULL
;
349 if (cache
== pipe_rd_cache
) {
350 error
= pipespace(pipe
, PIPE_SIZE
);
352 pipe
->pipe_buffer
.buffer
= NULL
;
353 pipe
->pipe_buffer
.size
= 0;
354 pipe
->pipe_buffer
.in
= 0;
355 pipe
->pipe_buffer
.out
= 0;
356 pipe
->pipe_buffer
.cnt
= 0;
362 * Lock a pipe for I/O, blocking other access.
363 * Called with the pipe mutex (pipe_lock) held.
/*
 * Take the long-term I/O lock, which is the PIPE_LOCKFL bit in
 * pipe_state.
 *
 * The caller must hold pipe_lock (asserted).  While PIPE_LOCKFL is set
 * the function records interest with PIPE_LWANT and sleeps on
 * pipe_lkcv; once the bit is clear it sets PIPE_LOCKFL itself.
 * Two sleep calls are visible: cv_wait_sig(), whose result is stored in
 * error, and cv_wait().
 * NOTE(review): the branch choosing between the two sleeps and the
 * return statements are not visible in this excerpt; presumably a
 * nonzero catch selects the interruptible sleep -- confirm.
 */
366 pipelock(struct pipe
*pipe
, int catch)
370 KASSERT(mutex_owned(pipe
->pipe_lock
));
372 while (pipe
->pipe_state
& PIPE_LOCKFL
) {
373 pipe
->pipe_state
|= PIPE_LWANT
;
375 error
= cv_wait_sig(&pipe
->pipe_lkcv
, pipe
->pipe_lock
);
379 cv_wait(&pipe
->pipe_lkcv
, pipe
->pipe_lock
);
382 pipe
->pipe_state
|= PIPE_LOCKFL
;
388 * unlock a pipe I/O lock
/*
 * Drop the long-term I/O lock taken by pipelock().
 *
 * Asserts that PIPE_LOCKFL is currently set, clears it, and if a waiter
 * announced itself through PIPE_LWANT clears that bit too and wakes all
 * sleepers on pipe_lkcv.
 */
391 pipeunlock(struct pipe
*pipe
)
394 KASSERT(pipe
->pipe_state
& PIPE_LOCKFL
);
396 pipe
->pipe_state
&= ~PIPE_LOCKFL
;
397 if (pipe
->pipe_state
& PIPE_LWANT
) {
398 pipe
->pipe_state
&= ~PIPE_LWANT
;
399 cv_broadcast(&pipe
->pipe_lkcv
);
404 * Select/poll wakeup. This also sends SIGIO to peer connected to
405 * 'sigpipe' side of pipe.
/*
 * Notify select/poll/kqueue waiters on selp and, when asynchronous I/O
 * is enabled on sigp, deliver SIGIO.
 *
 * selp  - pipe whose pipe_sel record is notified via selnotify()
 * sigp  - pipe whose pipe_pgid receives SIGIO; nothing is sent when it
 *         is NULL or does not have PIPE_ASYNC set
 * code  - siginfo code; it is mapped to a poll event mask in band
 *
 * NOTE(review): the switch on code and most of its cases are not visible
 * in this excerpt.  Only the POLLIN|POLLRDNORM and POLLOUT|POLLWRNORM
 * assignments and the diagnostic printf can be seen.
 */
408 pipeselwakeup(struct pipe
*selp
, struct pipe
*sigp
, int code
)
414 band
= POLLIN
|POLLRDNORM
;
417 band
= POLLOUT
|POLLWRNORM
;
428 printf("bad siginfo code %d in pipe notification.\n", code
);
433 selnotify(&selp
->pipe_sel
, band
, NOTE_SUBMIT
);
435 if (sigp
== NULL
|| (sigp
->pipe_state
& PIPE_ASYNC
) == 0)
438 fownsignal(sigp
->pipe_pgid
, SIGIO
, code
, band
, selp
);
/*
 * fo_read handler: copy data out of the pipe into the caller's uio.
 *
 * Outline of the visible logic:
 *  - take the long-term lock with pipelock(rpipe, 1);
 *  - while the uio still wants data, copy from the circular buffer with
 *    uiomove(), limited by the contiguous span (size - out) and by
 *    uio_resid;
 *  - stop on PIPE_EOF and do not sleep when FNONBLOCK is set;
 *  - otherwise wake the write side, then sleep on pipe_rcv and remember
 *    the pipe state in wakeup_state so a PIPE_RESTART can be noticed;
 *  - afterwards refresh pipe_atime; once pipe_busy is zero clear
 *    PIPE_RESTART and wake pipe_draincv; wake writers on pipe_wcv when
 *    the fill level is below MINPIPESIZE;
 *  - finally, if at least PIPE_BUF bytes are free and either data was
 *    consumed or PIPE_SIGNALR is set, notify pollers with POLL_OUT and
 *    clear PIPE_SIGNALR.
 *
 * The section under #ifndef PIPE_NODIRECT serves a pending direct write
 * from the loaned mapping; PIPE_NODIRECT is defined near the top of the
 * file, so that section is compiled out as the file stands.
 * NOTE(review): many lines (mutex enter and exit, pipe_busy updates, the
 * buffer index and counter updates, error assignments, break and return
 * statements, #endif lines) are not visible in this excerpt.
 */
442 pipe_read(file_t
*fp
, off_t
*offset
, struct uio
*uio
, kauth_cred_t cred
,
445 struct pipe
*rpipe
= (struct pipe
*) fp
->f_data
;
446 struct pipebuf
*bp
= &rpipe
->pipe_buffer
;
447 kmutex_t
*lock
= rpipe
->pipe_lock
;
452 unsigned int wakeup_state
= 0;
459 error
= pipelock(rpipe
, 1);
463 while (uio
->uio_resid
) {
465 * Normal pipe buffer receive.
468 size
= bp
->size
- bp
->out
;
471 if (size
> uio
->uio_resid
)
472 size
= uio
->uio_resid
;
475 error
= uiomove((char *)bp
->buffer
+ bp
->out
, size
, uio
);
481 if (bp
->out
>= bp
->size
)
487 * If there is no more to read in the pipe, reset
488 * its pointers to the beginning. This improves
499 #ifndef PIPE_NODIRECT
500 if ((rpipe
->pipe_state
& PIPE_DIRECTR
) != 0) {
502 * Direct copy, bypassing a kernel buffer.
507 KASSERT(rpipe
->pipe_state
& PIPE_DIRECTW
);
509 size
= rpipe
->pipe_map
.cnt
;
510 if (size
> uio
->uio_resid
)
511 size
= uio
->uio_resid
;
513 va
= (char *)rpipe
->pipe_map
.kva
+ rpipe
->pipe_map
.pos
;
514 gen
= rpipe
->pipe_map
.egen
;
518 * Consume emap and read the data from loaned pages.
520 uvm_emap_consume(gen
);
521 error
= uiomove(va
, size
, uio
);
527 rpipe
->pipe_map
.pos
+= size
;
528 rpipe
->pipe_map
.cnt
-= size
;
529 if (rpipe
->pipe_map
.cnt
== 0) {
530 rpipe
->pipe_state
&= ~PIPE_DIRECTR
;
531 cv_broadcast(&rpipe
->pipe_wcv
);
537 * Break if some data was read.
543 * Detect EOF condition.
544 * Read returns 0 on EOF, no need to set error.
546 if (rpipe
->pipe_state
& PIPE_EOF
)
550 * Don't block on non-blocking I/O.
552 if (fp
->f_flag
& FNONBLOCK
) {
558 * Unlock the pipe buffer for our remaining processing.
559 * We will either break out with an error or we will
560 * sleep and relock to loop.
565 * Re-check to see if more direct writes are pending.
567 if ((rpipe
->pipe_state
& PIPE_DIRECTR
) != 0)
570 #if 1 /* XXX (dsl) I'm sure these aren't needed here ... */
572 * We want to read more, wake up select/poll.
574 pipeselwakeup(rpipe
, rpipe
->pipe_peer
, POLL_OUT
);
577 * If the "write-side" is blocked, wake it up now.
579 cv_broadcast(&rpipe
->pipe_wcv
);
582 if (wakeup_state
& PIPE_RESTART
) {
587 /* Now wait until the pipe is filled */
588 error
= cv_wait_sig(&rpipe
->pipe_rcv
, lock
);
591 wakeup_state
= rpipe
->pipe_state
;
596 getnanotime(&rpipe
->pipe_atime
);
601 if (rpipe
->pipe_busy
== 0) {
602 rpipe
->pipe_state
&= ~PIPE_RESTART
;
603 cv_broadcast(&rpipe
->pipe_draincv
);
605 if (bp
->cnt
< MINPIPESIZE
) {
606 cv_broadcast(&rpipe
->pipe_wcv
);
610 * If anything was read off the buffer, signal to the writer it's
611 * possible to write more data. Also send signal if we are here for the
612 * first time after last write.
614 if ((bp
->size
- bp
->cnt
) >= PIPE_BUF
615 && (ocnt
!= bp
->cnt
|| (rpipe
->pipe_state
& PIPE_SIGNALR
))) {
616 pipeselwakeup(rpipe
, rpipe
->pipe_peer
, POLL_OUT
);
617 rpipe
->pipe_state
&= ~PIPE_SIGNALR
;
624 #ifndef PIPE_NODIRECT
626 * Allocate structure for loan transfer.
/*
 * Reserve kernel VA and a page-pointer array for a direct (page-loan)
 * write of npages pages.
 *
 * amountpipekva is charged for the mapping length before the VA-only
 * allocation and the charge is reverted if the allocation returns 0.
 * On success npages is recorded in pipe_map and an array of npages
 * struct vm_page pointers is obtained with kmem_alloc().
 * This function is only compiled when PIPE_NODIRECT is undefined;
 * PIPE_NODIRECT is defined near the top of the file.
 * NOTE(review): the kmem_alloc() flag argument and the return statements
 * are not visible in this excerpt.
 */
629 pipe_loan_alloc(struct pipe
*wpipe
, int npages
)
633 len
= (vsize_t
)npages
<< PAGE_SHIFT
;
634 atomic_add_int(&amountpipekva
, len
);
635 wpipe
->pipe_map
.kva
= uvm_km_alloc(kernel_map
, len
, 0,
636 UVM_KMF_VAONLY
| UVM_KMF_WAITVA
);
637 if (wpipe
->pipe_map
.kva
== 0) {
638 atomic_add_int(&amountpipekva
, -len
);
642 wpipe
->pipe_map
.npages
= npages
;
643 wpipe
->pipe_map
.pgs
= kmem_alloc(npages
* sizeof(struct vm_page
*),
649 * Free resources allocated for loan transfer.
/*
 * Release what pipe_loan_alloc() set up.
 *
 * The mapping length is recomputed from pipe_map.npages.  The emap
 * entries are removed, the VA-only range is freed and pipe_map.kva is
 * cleared, the charge is subtracted from amountpipekva, and the
 * page-pointer array is freed and its pointer set to NULL.
 * Only compiled when PIPE_NODIRECT is undefined.
 */
652 pipe_loan_free(struct pipe
*wpipe
)
656 len
= (vsize_t
)wpipe
->pipe_map
.npages
<< PAGE_SHIFT
;
657 uvm_emap_remove(wpipe
->pipe_map
.kva
, len
); /* XXX */
658 uvm_km_free(kernel_map
, wpipe
->pipe_map
.kva
, len
, UVM_KMF_VAONLY
);
659 wpipe
->pipe_map
.kva
= 0;
660 atomic_add_int(&amountpipekva
, -len
);
661 kmem_free(wpipe
->pipe_map
.pgs
,
662 wpipe
->pipe_map
.npages
* sizeof(struct vm_page
*));
663 wpipe
->pipe_map
.pgs
= NULL
;
667 * NetBSD direct write, using uvm_loan() mechanism.
668 * This implements the pipe buffer write mechanism. Note that only
669 * a direct write OR a normal pipe write can be pending at any given time.
670 * If there are any characters in the pipe buffer, the direct write will
671 * be deferred until the receiving process grabs all of the bytes from
672 * the pipe buffer. Then the direct mapping write is set-up.
674 * Called with the long-term pipe lock held.
/*
 * Direct write: lend the writer's pages to the kernel so the reader can
 * copy straight out of them.
 *
 * Outline of the visible logic:
 *  - compute the page-aligned span of the first iovec and cap it at
 *    PIPE_DIRECT_CHUNK;
 *  - free the existing loan mapping if it is too small, then allocate
 *    one with pipe_loan_alloc() when none is present;
 *  - loan the pages with uvm_loan(); on failure the mapping is freed and
 *    ENOMEM is returned so the caller falls back to a buffered write;
 *  - enter the pages into the mapping, record the emap generation, set
 *    pipe_map.pos and pipe_map.cnt, and set PIPE_DIRECTW;
 *  - sleep on pipe_wcv until the buffered data has drained, set
 *    PIPE_DIRECTR, then sleep again until the reader clears it;
 *  - clear both direct flags, retake the long-term lock, unloan the
 *    pages, and free the mapping on error or when amountpipekva exceeds
 *    maxpipekva;
 *  - on the path that follows the POLL_ERR notification, compare
 *    pipe_map.cnt with bcnt to tell whether anything was consumed, then
 *    advance the uio by the bytes transferred.
 *
 * Only compiled when PIPE_NODIRECT is undefined; PIPE_NODIRECT is
 * defined near the top of the file.
 * NOTE(review): the computation of blen and bpos, several branch
 * keywords, the error assignments and the return statements are not
 * visible in this excerpt.
 */
677 pipe_direct_write(file_t
*fp
, struct pipe
*wpipe
, struct uio
*uio
)
679 struct vm_page
**pgs
;
680 vaddr_t bbase
, base
, bend
;
684 kmutex_t
*lock
= wpipe
->pipe_lock
;
686 KASSERT(mutex_owned(wpipe
->pipe_lock
));
687 KASSERT(wpipe
->pipe_map
.cnt
== 0);
692 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
693 * not aligned to PAGE_SIZE.
695 bbase
= (vaddr_t
)uio
->uio_iov
->iov_base
;
696 base
= trunc_page(bbase
);
697 bend
= round_page(bbase
+ uio
->uio_iov
->iov_len
);
701 if (blen
> PIPE_DIRECT_CHUNK
) {
702 blen
= PIPE_DIRECT_CHUNK
;
704 bcnt
= PIPE_DIRECT_CHUNK
- bpos
;
706 bcnt
= uio
->uio_iov
->iov_len
;
708 npages
= blen
>> PAGE_SHIFT
;
711 * Free the old kva if we need more pages than we have
714 if (wpipe
->pipe_map
.kva
!= 0 && npages
> wpipe
->pipe_map
.npages
)
715 pipe_loan_free(wpipe
);
717 /* Allocate new kva. */
718 if (wpipe
->pipe_map
.kva
== 0) {
719 error
= pipe_loan_alloc(wpipe
, npages
);
726 /* Loan the write buffer memory from writer process */
727 pgs
= wpipe
->pipe_map
.pgs
;
728 error
= uvm_loan(&uio
->uio_vmspace
->vm_map
, base
, blen
,
729 pgs
, UVM_LOAN_TOPAGE
);
731 pipe_loan_free(wpipe
);
733 return (ENOMEM
); /* so that caller fallback to ordinary write */
736 /* Enter the loaned pages to KVA, produce new emap generation number. */
737 uvm_emap_enter(wpipe
->pipe_map
.kva
, pgs
, npages
);
738 wpipe
->pipe_map
.egen
= uvm_emap_produce();
740 /* Now we can put the pipe in direct write mode */
741 wpipe
->pipe_map
.pos
= bpos
;
742 wpipe
->pipe_map
.cnt
= bcnt
;
745 * But before we can let someone do a direct read, we
746 * have to wait until the pipe is drained. Release the
747 * pipe lock while we wait.
750 wpipe
->pipe_state
|= PIPE_DIRECTW
;
753 while (error
== 0 && wpipe
->pipe_buffer
.cnt
> 0) {
754 cv_broadcast(&wpipe
->pipe_rcv
);
755 error
= cv_wait_sig(&wpipe
->pipe_wcv
, lock
);
756 if (error
== 0 && wpipe
->pipe_state
& PIPE_EOF
)
760 /* Pipe is drained; next read will come off the direct buffer */
761 wpipe
->pipe_state
|= PIPE_DIRECTR
;
763 /* Wait until the reader is done */
764 while (error
== 0 && (wpipe
->pipe_state
& PIPE_DIRECTR
)) {
765 cv_broadcast(&wpipe
->pipe_rcv
);
766 pipeselwakeup(wpipe
, wpipe
, POLL_IN
);
767 error
= cv_wait_sig(&wpipe
->pipe_wcv
, lock
);
768 if (error
== 0 && wpipe
->pipe_state
& PIPE_EOF
)
772 /* Take pipe out of direct write mode */
773 wpipe
->pipe_state
&= ~(PIPE_DIRECTW
| PIPE_DIRECTR
);
775 /* Acquire the pipe lock and cleanup */
776 (void)pipelock(wpipe
, 0);
780 /* XXX: uvm_emap_remove */
781 uvm_unloan(pgs
, npages
, UVM_LOAN_TOPAGE
);
783 if (error
|| amountpipekva
> maxpipekva
)
784 pipe_loan_free(wpipe
);
788 pipeselwakeup(wpipe
, wpipe
, POLL_ERR
);
791 * If nothing was read from what we offered, return error
792 * straight on. Otherwise update uio resid first. Caller
793 * will deal with the error condition, returning short
794 * write, error, or restarting the write(2) as appropriate.
796 if (wpipe
->pipe_map
.cnt
== bcnt
) {
797 wpipe
->pipe_map
.cnt
= 0;
798 cv_broadcast(&wpipe
->pipe_wcv
);
802 bcnt
-= wpipe
->pipe_map
.cnt
;
805 uio
->uio_resid
-= bcnt
;
806 /* uio_offset not updated, not set/used for write(2) */
807 uio
->uio_iov
->iov_base
= (char *)uio
->uio_iov
->iov_base
+ bcnt
;
808 uio
->uio_iov
->iov_len
-= bcnt
;
809 if (uio
->uio_iov
->iov_len
== 0) {
814 wpipe
->pipe_map
.cnt
= 0;
817 #endif /* !PIPE_NODIRECT */
/*
 * fo_write handler: copy data from the caller's uio into the peer's
 * buffer.
 *
 * Naming: rpipe is the pipe attached to this file (fp->f_data) and wpipe
 * is its peer, which is the end whose buffer receives the data.
 *
 * Outline of the visible logic:
 *  - bail out if the peer is gone or has PIPE_EOF set;
 *  - take the long-term lock with pipelock(wpipe, 1);
 *  - opportunistically grow the buffer to BIG_PIPE_SIZE when the request
 *    exceeds PIPE_SIZE, the big-pipe limit is not reached, the buffer is
 *    not already big and it is empty;
 *  - loop while data remains: compute free space, keep writes of at most
 *    PIPE_BUF bytes atomic, copy in one or two uiomove() segments to
 *    handle wraparound, and wake readers on pipe_rcv;
 *  - when there is no room: stop for FNONBLOCK, otherwise notify
 *    pollers, sleep on pipe_wcv, retake the long-term lock and re-check
 *    PIPE_EOF and PIPE_RESTART;
 *  - afterwards: once pipe_busy is zero clear PIPE_RESTART and wake
 *    pipe_draincv, wake readers, suppress EPIPE when all data was
 *    transferred, refresh pipe_mtime, notify pollers with POLL_IN and
 *    set PIPE_SIGNALR so the next read signals the writer.
 *
 * The direct-write sections are under #ifndef PIPE_NODIRECT and are
 * compiled out because PIPE_NODIRECT is defined near the top of the
 * file.
 * NOTE(review): mutex enter and exit, pipe_busy updates, the buffer
 * index and counter updates, error assignments, break and return
 * statements and several #endif lines are not visible in this excerpt.
 */
820 pipe_write(file_t
*fp
, off_t
*offset
, struct uio
*uio
, kauth_cred_t cred
,
823 struct pipe
*wpipe
, *rpipe
;
827 unsigned int wakeup_state
= 0;
829 /* We want to write to our peer */
830 rpipe
= (struct pipe
*) fp
->f_data
;
831 lock
= rpipe
->pipe_lock
;
835 wpipe
= rpipe
->pipe_peer
;
838 * Detect loss of pipe read side, issue SIGPIPE if lost.
840 if (wpipe
== NULL
|| (wpipe
->pipe_state
& PIPE_EOF
) != 0) {
846 /* Acquire the long-term pipe lock */
847 if ((error
= pipelock(wpipe
, 1)) != 0) {
849 if (wpipe
->pipe_busy
== 0) {
850 wpipe
->pipe_state
&= ~PIPE_RESTART
;
851 cv_broadcast(&wpipe
->pipe_draincv
);
857 bp
= &wpipe
->pipe_buffer
;
860 * If it is advantageous to resize the pipe buffer, do so.
862 if ((uio
->uio_resid
> PIPE_SIZE
) &&
863 (nbigpipe
< maxbigpipes
) &&
864 #ifndef PIPE_NODIRECT
865 (wpipe
->pipe_state
& PIPE_DIRECTW
) == 0 &&
867 (bp
->size
<= PIPE_SIZE
) && (bp
->cnt
== 0)) {
869 if (pipespace(wpipe
, BIG_PIPE_SIZE
) == 0)
870 atomic_inc_uint(&nbigpipe
);
873 while (uio
->uio_resid
) {
876 #ifndef PIPE_NODIRECT
878 * Pipe buffered writes cannot be coincidental with
879 * direct writes. Also, only one direct write can be
880 * in progress at any one time. We wait until the currently
881 * executing direct write is completed before continuing.
883 * We break out if a signal occurs or the reader goes away.
885 while (error
== 0 && wpipe
->pipe_state
& PIPE_DIRECTW
) {
886 cv_broadcast(&wpipe
->pipe_rcv
);
888 error
= cv_wait_sig(&wpipe
->pipe_wcv
, lock
);
889 (void)pipelock(wpipe
, 0);
890 if (wpipe
->pipe_state
& PIPE_EOF
)
897 * If the transfer is large, we can gain performance if
898 * we do process-to-process copies directly.
899 * If the write is non-blocking, we don't use the
900 * direct write mechanism.
902 * The direct write mechanism will detect the reader going
905 if ((uio
->uio_iov
->iov_len
>= PIPE_MINDIRECT
) &&
906 (fp
->f_flag
& FNONBLOCK
) == 0 &&
907 (wpipe
->pipe_map
.kva
|| (amountpipekva
< limitpipekva
))) {
908 error
= pipe_direct_write(fp
, wpipe
, uio
);
911 * Break out if error occurred, unless it's ENOMEM.
912 * ENOMEM means we failed to allocate some resources
913 * for direct write, so we just fallback to ordinary
914 * write. If the direct write was successful,
915 * process rest of data via ordinary write.
923 #endif /* PIPE_NODIRECT */
925 space
= bp
->size
- bp
->cnt
;
927 /* Writes of size <= PIPE_BUF must be atomic. */
928 if ((space
< uio
->uio_resid
) && (uio
->uio_resid
<= PIPE_BUF
))
932 int size
; /* Transfer size */
933 int segsize
; /* first segment to transfer */
936 * Transfer size is minimum of uio transfer
937 * and free space in pipe buffer.
939 if (space
> uio
->uio_resid
)
940 size
= uio
->uio_resid
;
944 * First segment to transfer is minimum of
945 * transfer size and contiguous space in
946 * pipe buffer. If first segment to transfer
947 * is less than the transfer size, we've got
948 * a wraparound in the buffer.
950 segsize
= bp
->size
- bp
->in
;
954 /* Transfer first segment */
956 error
= uiomove((char *)bp
->buffer
+ bp
->in
, segsize
,
959 if (error
== 0 && segsize
< size
) {
961 * Transfer remaining part now, to
962 * support atomic writes. Wraparound
965 KASSERT(bp
->in
+ segsize
== bp
->size
);
966 error
= uiomove(bp
->buffer
,
967 size
- segsize
, uio
);
974 if (bp
->in
>= bp
->size
) {
975 KASSERT(bp
->in
== size
- segsize
+ bp
->size
);
976 bp
->in
= size
- segsize
;
980 KASSERT(bp
->cnt
<= bp
->size
);
984 * If the "read-side" has been blocked, wake it up now.
986 cv_broadcast(&wpipe
->pipe_rcv
);
989 * Don't block on non-blocking I/O.
991 if (fp
->f_flag
& FNONBLOCK
) {
997 * We have no more space and have something to offer,
998 * wake up select/poll.
1001 pipeselwakeup(wpipe
, wpipe
, POLL_IN
);
1003 if (wakeup_state
& PIPE_RESTART
) {
1009 error
= cv_wait_sig(&wpipe
->pipe_wcv
, lock
);
1010 (void)pipelock(wpipe
, 0);
1014 * If read side wants to go away, we just issue a signal
1017 if (wpipe
->pipe_state
& PIPE_EOF
) {
1021 wakeup_state
= wpipe
->pipe_state
;
1026 if (wpipe
->pipe_busy
== 0) {
1027 wpipe
->pipe_state
&= ~PIPE_RESTART
;
1028 cv_broadcast(&wpipe
->pipe_draincv
);
1031 cv_broadcast(&wpipe
->pipe_rcv
);
1035 * Don't return EPIPE if I/O was successful
1037 if (error
== EPIPE
&& bp
->cnt
== 0 && uio
->uio_resid
== 0)
1041 getnanotime(&wpipe
->pipe_mtime
);
1044 * We have something to offer, wake up select/poll.
1045 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1046 * is only done synchronously), so check only wpipe->pipe_buffer.cnt
1049 pipeselwakeup(wpipe
, wpipe
, POLL_IN
);
1052 * Arrange for next read(2) to do a signal.
1054 wpipe
->pipe_state
|= PIPE_SIGNALR
;
1062 * We implement a very minimal set of ioctls for compatibility with sockets.
/*
 * fo_ioctl handler.
 *
 * Visible behaviour:
 *  - one request sets or clears PIPE_ASYNC in pipe_state;
 *  - one request reports the byte count of this pipe (pipe_buffer.cnt,
 *    or pipe_map.cnt while a direct write is pending);
 *  - two requests switch to the peer first and report either its byte
 *    count or its free space (size minus cnt);
 *  - ownership requests are forwarded to fsetown() and fgetown() on
 *    pipe_pgid;
 *  - the final visible return is EPASSTHROUGH.
 * The pipe_map branches are under #ifndef PIPE_NODIRECT and are compiled
 * out because PIPE_NODIRECT is defined near the top of the file.
 *
 * NOTE(review): in both places where pipe is replaced by
 * pipe->pipe_peer, the result is dereferenced with no NULL check visible
 * in this excerpt, whereas pipe_stat() and pipe_poll() do test the peer
 * for NULL.  The peer pointer is cleared by pipeclose(), so confirm
 * against the full file that a closed peer cannot reach these
 * dereferences, and that the peer is loaded with pipe_lock held.
 * NOTE(review): the switch statement, the case labels, the mutex calls
 * and several return statements are not visible in this excerpt.
 */
1065 pipe_ioctl(file_t
*fp
, u_long cmd
, void *data
)
1067 struct pipe
*pipe
= fp
->f_data
;
1068 kmutex_t
*lock
= pipe
->pipe_lock
;
1078 pipe
->pipe_state
|= PIPE_ASYNC
;
1080 pipe
->pipe_state
&= ~PIPE_ASYNC
;
1087 #ifndef PIPE_NODIRECT
1088 if (pipe
->pipe_state
& PIPE_DIRECTW
)
1089 *(int *)data
= pipe
->pipe_map
.cnt
;
1092 *(int *)data
= pipe
->pipe_buffer
.cnt
;
1097 /* Look at other side */
1098 pipe
= pipe
->pipe_peer
;
1100 #ifndef PIPE_NODIRECT
1101 if (pipe
->pipe_state
& PIPE_DIRECTW
)
1102 *(int *)data
= pipe
->pipe_map
.cnt
;
1105 *(int *)data
= pipe
->pipe_buffer
.cnt
;
1110 /* Look at other side */
1111 pipe
= pipe
->pipe_peer
;
1113 #ifndef PIPE_NODIRECT
1115 * If we're in direct-mode, we don't really have a
1116 * send queue, and any other write will block. Thus
1117 * zero seems like the best answer.
1119 if (pipe
->pipe_state
& PIPE_DIRECTW
)
1123 *(int *)data
= pipe
->pipe_buffer
.size
-
1124 pipe
->pipe_buffer
.cnt
;
1130 return fsetown(&pipe
->pipe_pgid
, cmd
, data
);
1134 return fgetown(pipe
->pipe_pgid
, cmd
, data
);
1137 return (EPASSTHROUGH
);
/*
 * fo_poll handler.
 *
 * All checks run under pipe_lock.
 *  - Readable: the local buffer holds data, a direct read is pending
 *    (only when PIPE_NODIRECT is undefined), or PIPE_EOF is set locally.
 *  - Writable: the peer has PIPE_EOF set, or the peer has at least
 *    PIPE_BUF bytes free and no direct write in progress.  A second
 *    assignment of the POLLOUT bits is also visible ahead of that test.
 *  - EOF state from both ends is accumulated in eof, and a test on
 *    wpipe == NULL || eof follows.
 *  - The requested directions are registered with selrecord() on the
 *    matching end.
 * NOTE(review): the local declarations, the condition guarding the first
 * POLLOUT assignment, the statement controlled by the NULL-or-eof test,
 * the condition guarding the selrecord() calls and the return statement
 * are not visible in this excerpt.
 */
1141 pipe_poll(file_t
*fp
, int events
)
1143 struct pipe
*rpipe
= fp
->f_data
;
1148 mutex_enter(rpipe
->pipe_lock
);
1149 wpipe
= rpipe
->pipe_peer
;
1151 if (events
& (POLLIN
| POLLRDNORM
))
1152 if ((rpipe
->pipe_buffer
.cnt
> 0) ||
1153 #ifndef PIPE_NODIRECT
1154 (rpipe
->pipe_state
& PIPE_DIRECTR
) ||
1156 (rpipe
->pipe_state
& PIPE_EOF
))
1157 revents
|= events
& (POLLIN
| POLLRDNORM
);
1159 eof
|= (rpipe
->pipe_state
& PIPE_EOF
);
1162 revents
|= events
& (POLLOUT
| POLLWRNORM
);
1164 if (events
& (POLLOUT
| POLLWRNORM
))
1165 if ((wpipe
->pipe_state
& PIPE_EOF
) || (
1166 #ifndef PIPE_NODIRECT
1167 (wpipe
->pipe_state
& PIPE_DIRECTW
) == 0 &&
1169 (wpipe
->pipe_buffer
.size
- wpipe
->pipe_buffer
.cnt
) >= PIPE_BUF
))
1170 revents
|= events
& (POLLOUT
| POLLWRNORM
);
1172 eof
|= (wpipe
->pipe_state
& PIPE_EOF
);
1175 if (wpipe
== NULL
|| eof
)
1179 if (events
& (POLLIN
| POLLRDNORM
))
1180 selrecord(curlwp
, &rpipe
->pipe_sel
);
1182 if (events
& (POLLOUT
| POLLWRNORM
))
1183 selrecord(curlwp
, &wpipe
->pipe_sel
);
1185 mutex_exit(rpipe
->pipe_lock
);
/*
 * fo_stat handler: fill *ub for a pipe descriptor.
 *
 * Runs under pipe_lock.  The stat buffer is zeroed first, then:
 *  - st_mode is S_IFIFO with owner read and write permission;
 *  - st_blksize is this end's buffer size, falling back to the peer's
 *    buffer size when this end has none and a peer exists;
 *  - st_size is the number of buffered bytes and st_blocks is 1 when
 *    that is nonzero, else 0;
 *  - access, modification and birth times come from the pipe, and the
 *    change time is set equal to the birth time;
 *  - st_uid and st_gid are the effective ids of the file credential.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1191 pipe_stat(file_t
*fp
, struct stat
*ub
)
1193 struct pipe
*pipe
= fp
->f_data
;
1195 mutex_enter(pipe
->pipe_lock
);
1196 memset(ub
, 0, sizeof(*ub
));
1197 ub
->st_mode
= S_IFIFO
| S_IRUSR
| S_IWUSR
;
1198 ub
->st_blksize
= pipe
->pipe_buffer
.size
;
1199 if (ub
->st_blksize
== 0 && pipe
->pipe_peer
)
1200 ub
->st_blksize
= pipe
->pipe_peer
->pipe_buffer
.size
;
1201 ub
->st_size
= pipe
->pipe_buffer
.cnt
;
1202 ub
->st_blocks
= (ub
->st_size
) ? 1 : 0;
1203 ub
->st_atimespec
= pipe
->pipe_atime
;
1204 ub
->st_mtimespec
= pipe
->pipe_mtime
;
1205 ub
->st_ctimespec
= ub
->st_birthtimespec
= pipe
->pipe_btime
;
1206 ub
->st_uid
= kauth_cred_geteuid(fp
->f_cred
);
1207 ub
->st_gid
= kauth_cred_getegid(fp
->f_cred
);
1210 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1211 * XXX (st_dev, st_ino) should be unique.
1213 mutex_exit(pipe
->pipe_lock
);
/*
 * fo_close handler.  Only the signature and the lookup of the pipe from
 * fp->f_data are visible in this excerpt.
 * NOTE(review): the rest of the body is not visible; presumably it
 * detaches f_data and hands the pipe to pipeclose() -- confirm against
 * the full file.
 */
1218 pipe_close(file_t
*fp
)
1220 struct pipe
*pipe
= fp
->f_data
;
/*
 * fo_restart handler: kick sleepers out of blocking reads and writes.
 *
 * Under pipe_lock, sets PIPE_RESTART in pipe_state and broadcasts on
 * both pipe_rcv and pipe_wcv.  pipe_read() and pipe_write() test
 * PIPE_RESTART in the state they sampled after waking.
 */
1228 pipe_restart(file_t
*fp
)
1230 struct pipe
*pipe
= fp
->f_data
;
1233 * Unblock blocked reads/writes in order to allow close() to complete.
1234 * System calls return ERESTART so that the fd is revalidated.
1235 * (Partial writes return the transfer length.)
1237 mutex_enter(pipe
->pipe_lock
);
1238 pipe
->pipe_state
|= PIPE_RESTART
;
1239 /* Wakeup both cvs, maybe we only need one, but maybe there are some
1240 * other paths where wakeup is needed, and it saves deciding which! */
1241 cv_broadcast(&pipe
->pipe_rcv
);
1242 cv_broadcast(&pipe
->pipe_wcv
);
1243 mutex_exit(pipe
->pipe_lock
);
/*
 * Release the kernel memory attached to a pipe.
 *
 * If a buffer is present: nbigpipe is decremented when the buffer is
 * larger than PIPE_SIZE; a buffer that is not the preallocated pipe_kmem
 * range is freed back to kernel_map and its size subtracted from
 * amountpipekva; the buffer pointer is then cleared.  The preallocated
 * range itself is not freed here (pipe_dtor does that).
 * When PIPE_NODIRECT is undefined, an existing loan mapping is also
 * freed and the pipe_map fields are reset.
 */
1247 pipe_free_kmem(struct pipe
*pipe
)
1250 if (pipe
->pipe_buffer
.buffer
!= NULL
) {
1251 if (pipe
->pipe_buffer
.size
> PIPE_SIZE
) {
1252 atomic_dec_uint(&nbigpipe
);
1254 if (pipe
->pipe_buffer
.buffer
!= (void *)pipe
->pipe_kmem
) {
1255 uvm_km_free(kernel_map
,
1256 (vaddr_t
)pipe
->pipe_buffer
.buffer
,
1257 pipe
->pipe_buffer
.size
, UVM_KMF_PAGEABLE
);
1258 atomic_add_int(&amountpipekva
,
1259 -pipe
->pipe_buffer
.size
);
1261 pipe
->pipe_buffer
.buffer
= NULL
;
1263 #ifndef PIPE_NODIRECT
1264 if (pipe
->pipe_map
.kva
!= 0) {
1265 pipe_loan_free(pipe
);
1266 pipe
->pipe_map
.cnt
= 0;
1267 pipe
->pipe_map
.kva
= 0;
1268 pipe
->pipe_map
.pos
= 0;
1269 pipe
->pipe_map
.npages
= 0;
1271 #endif /* !PIPE_NODIRECT */
1275 * Shutdown the pipe.
/*
 * Tear down one end of a pipe and return it to its pool cache.
 *
 * Visible steps:
 *  - assert that all four condition variables are valid;
 *  - a path that jumps straight to free_resources, used when creation
 *    failed;
 *  - notify pollers with POLL_HUP, set PIPE_EOF and, while pipe_busy is
 *    nonzero, wake pipe_wcv and sleep on pipe_draincv (the result of
 *    cv_wait_sig is ignored);
 *  - if a peer exists: notify its pollers with POLL_HUP, set PIPE_EOF on
 *    it, wake its pipe_rcv and clear its pipe_peer;
 *  - reset the knote list of this end, assert the long-term lock is not
 *    held, and release one reference on the shared mutex object;
 *  - reset pipe_pgid and pipe_state, free the buffer memory, and return
 *    the object to pipe_rd_cache when it owns preallocated KVA
 *    (pipe_kmem != 0), otherwise to pipe_wr_cache.
 * NOTE(review): the NULL-lock test, the mutex enter and exit calls, the
 * free_resources label and the else keyword are not visible in this
 * excerpt.
 */
1278 pipeclose(struct pipe
*pipe
)
1286 KASSERT(cv_is_valid(&pipe
->pipe_rcv
));
1287 KASSERT(cv_is_valid(&pipe
->pipe_wcv
));
1288 KASSERT(cv_is_valid(&pipe
->pipe_draincv
));
1289 KASSERT(cv_is_valid(&pipe
->pipe_lkcv
));
1291 lock
= pipe
->pipe_lock
;
1293 /* Must have failed during create */
1294 goto free_resources
;
1297 pipeselwakeup(pipe
, pipe
, POLL_HUP
);
1300 * If the other side is blocked, wake it up saying that
1301 * we want to close it down.
1303 pipe
->pipe_state
|= PIPE_EOF
;
1304 if (pipe
->pipe_busy
) {
1305 while (pipe
->pipe_busy
) {
1306 cv_broadcast(&pipe
->pipe_wcv
);
1307 cv_wait_sig(&pipe
->pipe_draincv
, lock
);
1312 * Disconnect from peer.
1314 if ((ppipe
= pipe
->pipe_peer
) != NULL
) {
1315 pipeselwakeup(ppipe
, ppipe
, POLL_HUP
);
1316 ppipe
->pipe_state
|= PIPE_EOF
;
1317 cv_broadcast(&ppipe
->pipe_rcv
);
1318 ppipe
->pipe_peer
= NULL
;
1322 * Any knote objects still left in the list are
1323 * the one attached by peer. Since no one will
1324 * traverse this list, we just clear it.
1326 SLIST_INIT(&pipe
->pipe_sel
.sel_klist
);
1328 KASSERT((pipe
->pipe_state
& PIPE_LOCKFL
) == 0);
1330 mutex_obj_free(lock
);
1336 pipe
->pipe_pgid
= 0;
1337 pipe
->pipe_state
= PIPE_SIGNALR
;
1338 pipe_free_kmem(pipe
);
1339 if (pipe
->pipe_kmem
!= 0) {
1340 pool_cache_put(pipe_rd_cache
, pipe
);
1342 pool_cache_put(pipe_wr_cache
, pipe
);
/*
 * kqueue detach hook shared by the read and write filters.
 *
 * Looks up the pipe behind the knote's file.  For one filter type the
 * peer is used instead, because that is where pipe_kqfilter() attached
 * the knote.  The knote, whose kn_hook must equal the selected pipe, is
 * then removed from that pipe's sel_klist.
 * NOTE(review): the case labels, the peer NULL check, the mutex calls
 * and the return statements are not visible in this excerpt.
 */
1347 filt_pipedetach(struct knote
*kn
)
1352 pipe
= ((file_t
*)kn
->kn_obj
)->f_data
;
1353 lock
= pipe
->pipe_lock
;
1357 switch(kn
->kn_filter
) {
1359 /* Need the peer structure, not our own. */
1360 pipe
= pipe
->pipe_peer
;
1362 /* If reader end already closed, just return. */
1370 /* Nothing to do. */
1374 KASSERT(kn
->kn_hook
== pipe
);
1375 SLIST_REMOVE(&pipe
->pipe_sel
.sel_klist
, kn
, knote
, kn_selnext
);
/*
 * kqueue read filter.
 *
 * pipe_lock is taken here only when the hint does not carry
 * NOTE_SUBMIT.  kn_data is set to the number of buffered bytes, or to
 * the pending direct-write count when the buffer is empty and
 * PIPE_DIRECTW is set.  If this end or the peer is at EOF, or the peer
 * is gone, EV_EOF is set on the knote.  Otherwise the filter fires when
 * kn_data is positive.
 * NOTE(review): the return statement on the EOF path is not visible in
 * this excerpt.
 */
1380 filt_piperead(struct knote
*kn
, long hint
)
1382 struct pipe
*rpipe
= ((file_t
*)kn
->kn_obj
)->f_data
;
1385 if ((hint
& NOTE_SUBMIT
) == 0) {
1386 mutex_enter(rpipe
->pipe_lock
);
1388 wpipe
= rpipe
->pipe_peer
;
1389 kn
->kn_data
= rpipe
->pipe_buffer
.cnt
;
1391 if ((kn
->kn_data
== 0) && (rpipe
->pipe_state
& PIPE_DIRECTW
))
1392 kn
->kn_data
= rpipe
->pipe_map
.cnt
;
1394 if ((rpipe
->pipe_state
& PIPE_EOF
) ||
1395 (wpipe
== NULL
) || (wpipe
->pipe_state
& PIPE_EOF
)) {
1396 kn
->kn_flags
|= EV_EOF
;
1397 if ((hint
& NOTE_SUBMIT
) == 0) {
1398 mutex_exit(rpipe
->pipe_lock
);
1403 if ((hint
& NOTE_SUBMIT
) == 0) {
1404 mutex_exit(rpipe
->pipe_lock
);
1406 return (kn
->kn_data
> 0);
/*
 * kqueue write filter.
 *
 * pipe_lock is taken here only when the hint does not carry
 * NOTE_SUBMIT.  If the peer is gone or at EOF, EV_EOF is set on the
 * knote.  Otherwise kn_data is the free space in the peer's buffer and
 * the filter fires when at least PIPE_BUF bytes are free.
 * NOTE(review): the statement controlled by the PIPE_DIRECTW test and
 * the return statement on the EOF path are not visible in this excerpt.
 */
1410 filt_pipewrite(struct knote
*kn
, long hint
)
1412 struct pipe
*rpipe
= ((file_t
*)kn
->kn_obj
)->f_data
;
1415 if ((hint
& NOTE_SUBMIT
) == 0) {
1416 mutex_enter(rpipe
->pipe_lock
);
1418 wpipe
= rpipe
->pipe_peer
;
1420 if ((wpipe
== NULL
) || (wpipe
->pipe_state
& PIPE_EOF
)) {
1422 kn
->kn_flags
|= EV_EOF
;
1423 if ((hint
& NOTE_SUBMIT
) == 0) {
1424 mutex_exit(rpipe
->pipe_lock
);
1428 kn
->kn_data
= wpipe
->pipe_buffer
.size
- wpipe
->pipe_buffer
.cnt
;
1429 if (wpipe
->pipe_state
& PIPE_DIRECTW
)
1432 if ((hint
& NOTE_SUBMIT
) == 0) {
1433 mutex_exit(rpipe
->pipe_lock
);
1435 return (kn
->kn_data
>= PIPE_BUF
);
/*
 * Filter-operation tables installed by pipe_kqfilter().  Both share
 * filt_pipedetach and have no attach hook (NULL); they differ only in
 * the event routine (filt_piperead versus filt_pipewrite).
 */
1438 static const struct filterops pipe_rfiltops
=
1439 { 1, NULL
, filt_pipedetach
, filt_piperead
};
1440 static const struct filterops pipe_wfiltops
=
1441 { 1, NULL
, filt_pipedetach
, filt_pipewrite
};
/*
 * fo_kqfilter handler: attach a knote to a pipe.
 *
 * The pipe is looked up through kn->kn_obj.  One filter type installs
 * pipe_rfiltops and keeps this end; the other installs pipe_wfiltops and
 * switches to the peer.  The knote is then inserted at the head of the
 * selected pipe's sel_klist.
 * NOTE(review): the case labels, the handling of a closed peer, the
 * kn_hook assignment, the mutex calls and the return statements are not
 * visible in this excerpt.
 */
1444 pipe_kqfilter(file_t
*fp
, struct knote
*kn
)
1449 pipe
= ((file_t
*)kn
->kn_obj
)->f_data
;
1450 lock
= pipe
->pipe_lock
;
1454 switch (kn
->kn_filter
) {
1456 kn
->kn_fop
= &pipe_rfiltops
;
1459 kn
->kn_fop
= &pipe_wfiltops
;
1460 pipe
= pipe
->pipe_peer
;
1462 /* Other end of pipe has been closed. */
1473 SLIST_INSERT_HEAD(&pipe
->pipe_sel
.sel_klist
, kn
, kn_selnext
);
1480 * Handle pipe sysctls.
/*
 * Register the kern.pipe sysctl subtree.
 *
 * Creates the kern node and the pipe node, then five integer leaves
 * backed directly by the file-scope variables:
 *   maxkvasz     -> maxpipekva    (read-write)
 *   maxloankvasz -> limitpipekva  (read-write)
 *   maxbigpipes  -> maxbigpipes   (read-write)
 *   nbigpipes    -> nbigpipe
 *   kvasize      -> amountpipekva
 * NOTE(review): the flag lines for the two node entries and for the
 * nbigpipes and kvasize leaves, and parts of two description strings,
 * are not visible in this excerpt; presumably the last two leaves are
 * read-only -- confirm.
 */
1482 SYSCTL_SETUP(sysctl_kern_pipe_setup
, "sysctl kern.pipe subtree setup")
1485 sysctl_createv(clog
, 0, NULL
, NULL
,
1487 CTLTYPE_NODE
, "kern", NULL
,
1490 sysctl_createv(clog
, 0, NULL
, NULL
,
1492 CTLTYPE_NODE
, "pipe",
1493 SYSCTL_DESCR("Pipe settings"),
1495 CTL_KERN
, KERN_PIPE
, CTL_EOL
);
1497 sysctl_createv(clog
, 0, NULL
, NULL
,
1498 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1499 CTLTYPE_INT
, "maxkvasz",
1500 SYSCTL_DESCR("Maximum amount of kernel memory to be "
1502 NULL
, 0, &maxpipekva
, 0,
1503 CTL_KERN
, KERN_PIPE
, KERN_PIPE_MAXKVASZ
, CTL_EOL
);
1504 sysctl_createv(clog
, 0, NULL
, NULL
,
1505 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1506 CTLTYPE_INT
, "maxloankvasz",
1507 SYSCTL_DESCR("Limit for direct transfers via page loan"),
1508 NULL
, 0, &limitpipekva
, 0,
1509 CTL_KERN
, KERN_PIPE
, KERN_PIPE_LIMITKVA
, CTL_EOL
);
1510 sysctl_createv(clog
, 0, NULL
, NULL
,
1511 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1512 CTLTYPE_INT
, "maxbigpipes",
1513 SYSCTL_DESCR("Maximum number of \"big\" pipes"),
1514 NULL
, 0, &maxbigpipes
, 0,
1515 CTL_KERN
, KERN_PIPE
, KERN_PIPE_MAXBIGPIPES
, CTL_EOL
);
1516 sysctl_createv(clog
, 0, NULL
, NULL
,
1518 CTLTYPE_INT
, "nbigpipes",
1519 SYSCTL_DESCR("Number of \"big\" pipes"),
1520 NULL
, 0, &nbigpipe
, 0,
1521 CTL_KERN
, KERN_PIPE
, KERN_PIPE_NBIGPIPES
, CTL_EOL
);
1522 sysctl_createv(clog
, 0, NULL
, NULL
,
1524 CTLTYPE_INT
, "kvasize",
1525 SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
1527 NULL
, 0, &amountpipekva
, 0,
1528 CTL_KERN
, KERN_PIPE
, KERN_PIPE_KVASIZE
, CTL_EOL
);