kernel/syscall/sendfile.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <../../../../../../kernel/fs/sockfs/sockcommon.h>
#include <../../../../../../kernel/fs/sockfs/socktpi.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int snf_segmap(file_t *, vnode_t *, uoff_t, uoff_t, ssize_t *,
	boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

#define	SEND_MAX_CHUNK	16
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
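/*
 * sendvec_chunk64() handles one batch of up to SEND_MAX_CHUNK
 * ksendfilevec64 entries.  SFV_FD_SELF entries are written straight from
 * user memory with fop_write(); entries naming another file descriptor are
 * either short-circuited through sosendfile64() (socket targets) or staged
 * through a temporary kernel buffer with fop_read()/fop_write() pairs,
 * accumulating the bytes moved in *count.
 */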
static int
sendvec_chunk64(file_t *fp, uoff_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	uoff_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = fop_write(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Optimize the regular file over
			 * the socket case.
			 */
			if (vp->v_type == VSOCK) {
				error = sosendfile64(fp, ffp, sfv,
				    (ssize32_t *)&cnt);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_NOSLEEP);
			if (ptr == NULL) {
				fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (ENOMEM);
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = fop_read(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = fop_write(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}
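/*
 * sendvec64() is the SENDFILEV64 path for 32-bit callers: it copies the
 * user's ksendfilevec64 array in chunks of SEND_MAX_CHUNK entries, feeds
 * each chunk to sendvec_chunk64(), then updates the file offset for regular
 * files and copies the transferred byte count back out to *xferred.
 */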
static ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	uoff_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif
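/*
 * sendvec_small_chunk() is the low-latency socket path for small transfers:
 * it gathers the entire request into a chain of maxblk-sized mblks (each
 * allocated with wroff bytes of headroom and tail_len bytes of tailroom)
 * and pushes the whole chain to the transport with a single
 * socket_sendmblk() call.
 */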
static int
sendvec_small_chunk(file_t *fp, uoff_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	uoff_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	uoff_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const uoff_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;
	struct msghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	/* If nothing to send, return */
	if (total_size == 0)
		return (0);

	if (vp->v_stream != NULL) {
		wroff = (int)vp->v_stream->sd_wroff;
		tail_len = (int)vp->v_stream->sd_tail;
	} else {
		struct sonode *so;

		so = VTOSO(vp);
		wroff = so->so_proto_props.sopp_wroff;
		tail_len = so->so_proto_props.sopp_tail;
	}

	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
	bzero(&msg, sizeof (msg));

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (uoff_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			if (readvp < vp) {
				fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						fop_rwunlock(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = fop_read(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
	if (error != 0) {
		if (head != NULL)
			freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}
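/*
 * sendvec_chunk() is the general path: it handles regular-file targets as
 * well as large socket transfers.  File data is either staged through a
 * kernel buffer sized to the native block size (fop_read() followed by
 * fop_write() or socket_sendmblk()) or, when the socket and source vnode
 * allow zero copy, handed off to snf_segmap() to map and send the file
 * pages directly.
 */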
static int
sendvec_chunk(file_t *fp, uoff_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	uoff_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	uoff_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const uoff_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;
	struct msghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		if (vp->v_stream != NULL) {
			stp = vp->v_stream;
			wroff = (int)stp->sd_wroff;
			tail_len = (int)stp->sd_tail;
			maxblk = (int)stp->sd_maxblk;
		} else {
			stp = NULL;
			wroff = so->so_proto_props.sopp_wroff;
			tail_len = so->so_proto_props.sopp_tail;
			maxblk = so->so_proto_props.sopp_maxblk;
		}
		extra = wroff + tail_len;
	}

	bzero(&msg, sizeof (msg));
	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (uoff_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			if (vp->v_type == VSOCK) {
				while (sfv_len > 0) {
					size_t iov_len;

					iov_len = sfv_len;
					/*
					 * Socket filters can limit the mblk
					 * size, so limit reads to maxblk if
					 * there are filters present.
					 */
					if (so->so_filter_active > 0 &&
					    maxblk != INFPSZ)
						iov_len = MIN(iov_len, maxblk);

					aiov.iov_len = iov_len;
					aiov.iov_base =
					    (caddr_t)(uintptr_t)sfv_off;

					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_loffset = *fileoff;
					auio.uio_segflg = UIO_USERSPACE;
					auio.uio_fmode = fflag;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_resid = iov_len;

					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL)
						return (ENOMEM);
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					error = uiomove((caddr_t)dmp->b_wptr,
					    iov_len, UIO_WRITE, &auio);
					if (error != 0) {
						freeb(dmp);
						return (error);
					}
					dmp->b_wptr += iov_len;
					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						return (error);
					}
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)iov_len;
					*count += iov_len;
					sfv_len -= iov_len;
					sfv_off += iov_len;
				}
			} else {
				aiov.iov_len = sfv_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_loffset = *fileoff;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_fmode = fflag;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_resid = sfv_len;

				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = fop_write(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (fop_realvp(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) fop_rwlock(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				uint_t copyflag;

				copyflag = stp != NULL ? stp->sd_copyflag :
				    so->so_proto_props.sopp_zcopyflag;

				/*
				 * Socket filters can limit the mblk size,
				 * so limit reads to maxblk if there are
				 * filters present.
				 */
				if (so->so_filter_active > 0 &&
				    maxblk != INFPSZ)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (socket_setsockopt(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on), CRED()) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (uoff_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						fop_rwunlock(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = fop_read(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					fop_rwunlock(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						fop_rwunlock(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = fop_write(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						fop_rwunlock(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			fop_rwunlock(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}
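/*
 * sendfilev() is the syscall entry point.  It validates the target
 * descriptor (a regular file or a sendfile-capable socket), copies the
 * user's sendfilevec array in SEND_MAX_CHUNK batches (expanding 32-bit
 * vectors as needed), and dispatches each batch either to
 * sendvec_small_chunk() for small socket writes or to sendvec_chunk()
 * otherwise, returning the number of bytes transferred.
 */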
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	uoff_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		is_sock = B_TRUE;
		if (SOCK_IS_NONSTR(so)) {
			maxblk = so->so_proto_props.sopp_maxblk;
		} else {
			maxblk = (int)vp->v_stream->sd_maxblk;
		}

		/*
		 * We need to make sure that the socket that we're sending on
		 * supports sendfile behavior. sockfs doesn't know that the APIs
		 * we want to use are coming from sendfile, so we can't rely on
		 * it to check for us.
		 */
		if ((so->so_mode & SM_SENDFILESUPP) == 0) {
			error = EOPNOTSUPP;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

	(void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk, which creates
		 * maxblk-size mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data that is
		 * dominating. So it's better to use sendvec_chunk, because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side note: a write to a file has not been optimized.
		 * Future zero-copy code will plug into sendvec_chunk
		 * only, because doing zero copy for files smaller than
		 * pagesize is useless.
		 */
		if (is_sock) {
			if ((total_size <= (4 * maxblk)) &&
			    error == 0) {
				error = sendvec_small_chunk(fp,
				    &fileoff, sfv, copy_cnt,
				    total_size, maxblk, &count);
			} else {
				error = sendvec_chunk(fp, &fileoff,
				    sfv, copy_cnt, &count);
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32) {
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		} else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

		/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
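/*
 * For reference, userland normally reaches this code through the
 * sendfilev(3EXT) wrapper in libsendfile, which supplies the SENDFILEV
 * opcode.  A rough, illustrative call (hdr_buf, hdr_len, file_fd,
 * file_size, sock_fd and xferred are caller-supplied names, not part of
 * this file) looks like:
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;		(header from user memory;
 *	vec[0].sfv_flag = 0;			 sfv_off holds the pointer)
 *	vec[0].sfv_off = (off_t)(intptr_t)hdr_buf;
 *	vec[0].sfv_len = hdr_len;
 *
 *	vec[1].sfv_fd = file_fd;		(then the file contents)
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = file_size;
 *
 *	ssize_t sent = sendfilev(sock_fd, vec, 2, &xferred);
 */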