dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / fs / nfs / nfs_srv.c
blobd584fedc5793757ebb60722c9a0dc004ea91c45a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2016 by Delphix. All rights reserved.
28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 * All rights reserved.
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/buf.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/uio.h>
40 #include <sys/stat.h>
41 #include <sys/errno.h>
42 #include <sys/sysmacros.h>
43 #include <sys/statvfs.h>
44 #include <sys/kmem.h>
45 #include <sys/kstat.h>
46 #include <sys/dirent.h>
47 #include <sys/cmn_err.h>
48 #include <sys/debug.h>
49 #include <sys/vtrace.h>
50 #include <sys/mode.h>
51 #include <sys/acl.h>
52 #include <sys/nbmlock.h>
53 #include <sys/policy.h>
54 #include <sys/sdt.h>
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 #include <nfs/nfs_cmd.h>
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_kmem.h>
70 #include <sys/strsubr.h>
73 * These are the interface routines for the server side of the
74 * Network File System. See the NFS version 2 protocol specification
75 * for a description of this interface.
78 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
79 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 cred_t *);
83 * Some "over the wire" UNIX file types. These are encoded
84 * into the mode. This needs to be fixed in the next rev.
86 #define IFMT 0170000 /* type of file */
87 #define IFCHR 0020000 /* character special */
88 #define IFBLK 0060000 /* block special */
89 #define IFSOCK 0140000 /* socket */
91 u_longlong_t nfs2_srv_caller_id;
94 * Get file attributes.
95 * Returns the current attributes of the file with the given fhandle.
97 /* ARGSUSED */
98 void
99 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
100 struct svc_req *req, cred_t *cr, bool_t ro)
102 int error;
103 vnode_t *vp;
104 struct vattr va;
106 vp = nfs_fhtovp(fhp, exi);
107 if (vp == NULL) {
108 ns->ns_status = NFSERR_STALE;
109 return;
113 * Do the getattr.
115 va.va_mask = AT_ALL; /* we want all the attributes */
117 error = rfs4_delegated_getattr(vp, &va, 0, cr);
119 /* check for overflows */
120 if (!error) {
121 /* Lie about the object type for a referral */
122 if (vn_is_nfs_reparse(vp, cr))
123 va.va_type = VLNK;
125 acl_perm(vp, exi, &va, cr);
126 error = vattr_to_nattr(&va, &ns->ns_attr);
129 VN_RELE(vp);
131 ns->ns_status = puterrno(error);
133 void *
134 rfs_getattr_getfh(fhandle_t *fhp)
136 return (fhp);
140 * Set file attributes.
141 * Sets the attributes of the file with the given fhandle. Returns
142 * the new attributes.
144 /* ARGSUSED */
145 void
146 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
147 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
149 int error;
150 int flag;
151 int in_crit = 0;
152 vnode_t *vp;
153 struct vattr va;
154 struct vattr bva;
155 struct flock64 bf;
156 caller_context_t ct;
159 vp = nfs_fhtovp(&args->saa_fh, exi);
160 if (vp == NULL) {
161 ns->ns_status = NFSERR_STALE;
162 return;
165 if (rdonly(ro, vp)) {
166 VN_RELE(vp);
167 ns->ns_status = NFSERR_ROFS;
168 return;
171 error = sattr_to_vattr(&args->saa_sa, &va);
172 if (error) {
173 VN_RELE(vp);
174 ns->ns_status = puterrno(error);
175 return;
179 * If the client is requesting a change to the mtime,
180 * but the nanosecond field is set to 1 billion, then
181 * this is a flag to the server that it should set the
182 * atime and mtime fields to the server's current time.
183 * The 1 billion number actually came from the client
184 * as 1 million, but the units in the over the wire
185 * request are microseconds instead of nanoseconds.
187 * This is an overload of the protocol and should be
188 * documented in the NFS Version 2 protocol specification.
190 if (va.va_mask & AT_MTIME) {
191 if (va.va_mtime.tv_nsec == 1000000000) {
192 gethrestime(&va.va_mtime);
193 va.va_atime = va.va_mtime;
194 va.va_mask |= AT_ATIME;
195 flag = 0;
196 } else
197 flag = ATTR_UTIME;
198 } else
199 flag = 0;
202 * If the filesystem is exported with nosuid, then mask off
203 * the setuid and setgid bits.
205 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
206 (exi->exi_export.ex_flags & EX_NOSUID))
207 va.va_mode &= ~(VSUID | VSGID);
209 ct.cc_sysid = 0;
210 ct.cc_pid = 0;
211 ct.cc_caller_id = nfs2_srv_caller_id;
212 ct.cc_flags = CC_DONTBLOCK;
215 * We need to specially handle size changes because it is
216 * possible for the client to create a file with modes
217 * which indicate read-only, but with the file opened for
218 * writing. If the client then tries to set the size of
219 * the file, then the normal access checking done in
220 * fop_setattr would prevent the client from doing so,
221 * although it should be legal for it to do so. To get
222 * around this, we do the access checking for ourselves
223 * and then use fop_space which doesn't do the access
224 * checking which fop_setattr does. fop_space can only
225 * operate on VREG files, let fop_setattr handle the other
226 * extremely rare cases.
227 * Also the client should not be allowed to change the
228 * size of the file if there is a conflicting non-blocking
229 * mandatory lock in the region of change.
231 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
232 if (nbl_need_check(vp)) {
233 nbl_start_crit(vp, RW_READER);
234 in_crit = 1;
237 bva.va_mask = AT_UID | AT_SIZE;
239 error = fop_getattr(vp, &bva, 0, cr, &ct);
241 if (error) {
242 if (in_crit)
243 nbl_end_crit(vp);
244 VN_RELE(vp);
245 ns->ns_status = puterrno(error);
246 return;
249 if (in_crit) {
250 uoff_t offset;
251 ssize_t length;
253 if (va.va_size < bva.va_size) {
254 offset = va.va_size;
255 length = bva.va_size - va.va_size;
256 } else {
257 offset = bva.va_size;
258 length = va.va_size - bva.va_size;
260 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
261 NULL)) {
262 error = EACCES;
266 if (crgetuid(cr) == bva.va_uid && !error &&
267 va.va_size != bva.va_size) {
268 va.va_mask &= ~AT_SIZE;
269 bf.l_type = F_WRLCK;
270 bf.l_whence = 0;
271 bf.l_start = (off64_t)va.va_size;
272 bf.l_len = 0;
273 bf.l_sysid = 0;
274 bf.l_pid = 0;
276 error = fop_space(vp, F_FREESP, &bf, FWRITE,
277 (offset_t)va.va_size, cr, &ct);
279 if (in_crit)
280 nbl_end_crit(vp);
281 } else
282 error = 0;
285 * Do the setattr.
287 if (!error && va.va_mask) {
288 error = fop_setattr(vp, &va, flag, cr, &ct);
292 * check if the monitor on either vop_space or vop_setattr detected
293 * a delegation conflict and if so, mark the thread flag as
294 * wouldblock so that the response is dropped and the client will
295 * try again.
297 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
298 VN_RELE(vp);
299 curthread->t_flag |= T_WOULDBLOCK;
300 return;
303 if (!error) {
304 va.va_mask = AT_ALL; /* get everything */
306 error = rfs4_delegated_getattr(vp, &va, 0, cr);
308 /* check for overflows */
309 if (!error) {
310 acl_perm(vp, exi, &va, cr);
311 error = vattr_to_nattr(&va, &ns->ns_attr);
315 ct.cc_flags = 0;
318 * Force modified metadata out to stable storage.
320 (void) fop_fsync(vp, FNODSYNC, cr, &ct);
322 VN_RELE(vp);
324 ns->ns_status = puterrno(error);
326 void *
327 rfs_setattr_getfh(struct nfssaargs *args)
329 return (&args->saa_fh);
333 * Directory lookup.
334 * Returns an fhandle and file attributes for file name in a directory.
336 /* ARGSUSED */
337 void
338 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
339 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
341 int error;
342 vnode_t *dvp;
343 vnode_t *vp;
344 struct vattr va;
345 fhandle_t *fhp = da->da_fhandle;
346 struct sec_ol sec = {0, 0};
347 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
348 char *name;
349 struct sockaddr *ca;
352 * Disallow NULL paths
354 if (da->da_name == NULL || *da->da_name == '\0') {
355 dr->dr_status = NFSERR_ACCES;
356 return;
360 * Allow lookups from the root - the default
361 * location of the public filehandle.
363 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
364 dvp = rootdir;
365 VN_HOLD(dvp);
366 } else {
367 dvp = nfs_fhtovp(fhp, exi);
368 if (dvp == NULL) {
369 dr->dr_status = NFSERR_STALE;
370 return;
375 * Not allow lookup beyond root.
376 * If the filehandle matches a filehandle of the exi,
377 * then the ".." refers beyond the root of an exported filesystem.
379 if (strcmp(da->da_name, "..") == 0 &&
380 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
381 VN_RELE(dvp);
382 dr->dr_status = NFSERR_NOENT;
383 return;
386 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
387 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
388 MAXPATHLEN);
390 if (name == NULL) {
391 dr->dr_status = NFSERR_ACCES;
392 return;
396 * If the public filehandle is used then allow
397 * a multi-component lookup, i.e. evaluate
398 * a pathname and follow symbolic links if
399 * necessary.
401 * This may result in a vnode in another filesystem
402 * which is OK as long as the filesystem is exported.
404 if (PUBLIC_FH2(fhp)) {
405 publicfh_flag = TRUE;
406 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
407 &sec);
408 } else {
410 * Do a normal single component lookup.
412 error = fop_lookup(dvp, name, &vp, NULL, 0, NULL, cr,
413 NULL, NULL, NULL);
416 if (name != da->da_name)
417 kmem_free(name, MAXPATHLEN);
420 if (!error) {
421 va.va_mask = AT_ALL; /* we want everything */
423 error = rfs4_delegated_getattr(vp, &va, 0, cr);
425 /* check for overflows */
426 if (!error) {
427 acl_perm(vp, exi, &va, cr);
428 error = vattr_to_nattr(&va, &dr->dr_attr);
429 if (!error) {
430 if (sec.sec_flags & SEC_QUERY)
431 error = makefh_ol(&dr->dr_fhandle, exi,
432 sec.sec_index);
433 else {
434 error = makefh(&dr->dr_fhandle, vp,
435 exi);
436 if (!error && publicfh_flag &&
437 !chk_clnt_sec(exi, req))
438 auth_weak = TRUE;
442 VN_RELE(vp);
445 VN_RELE(dvp);
448 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
449 * and have obtained a new exportinfo in exi which needs to be
450 * released. Note the the original exportinfo pointed to by exi
451 * will be released by the caller, comon_dispatch.
453 if (publicfh_flag && exi != NULL)
454 exi_rele(exi);
457 * If it's public fh, no 0x81, and client's flavor is
458 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
459 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
461 if (auth_weak)
462 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
463 else
464 dr->dr_status = puterrno(error);
466 void *
467 rfs_lookup_getfh(struct nfsdiropargs *da)
469 return (da->da_fhandle);
473 * Read symbolic link.
474 * Returns the string in the symbolic link at the given fhandle.
476 /* ARGSUSED */
477 void
478 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
479 struct svc_req *req, cred_t *cr, bool_t ro)
481 int error;
482 struct iovec iov;
483 struct uio uio;
484 vnode_t *vp;
485 struct vattr va;
486 struct sockaddr *ca;
487 char *name = NULL;
488 int is_referral = 0;
490 vp = nfs_fhtovp(fhp, exi);
491 if (vp == NULL) {
492 rl->rl_data = NULL;
493 rl->rl_status = NFSERR_STALE;
494 return;
497 va.va_mask = AT_MODE;
499 error = fop_getattr(vp, &va, 0, cr, NULL);
501 if (error) {
502 VN_RELE(vp);
503 rl->rl_data = NULL;
504 rl->rl_status = puterrno(error);
505 return;
508 if (MANDLOCK(vp, va.va_mode)) {
509 VN_RELE(vp);
510 rl->rl_data = NULL;
511 rl->rl_status = NFSERR_ACCES;
512 return;
515 /* We lied about the object type for a referral */
516 if (vn_is_nfs_reparse(vp, cr))
517 is_referral = 1;
520 * XNFS and RFC1094 require us to return ENXIO if argument
521 * is not a link. BUGID 1138002.
523 if (vp->v_type != VLNK && !is_referral) {
524 VN_RELE(vp);
525 rl->rl_data = NULL;
526 rl->rl_status = NFSERR_NXIO;
527 return;
531 * Allocate data for pathname. This will be freed by rfs_rlfree.
533 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
535 if (is_referral) {
536 char *s;
537 size_t strsz;
539 /* Get an artificial symlink based on a referral */
540 s = build_symlink(vp, cr, &strsz);
541 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
542 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
543 vnode_t *, vp, char *, s);
544 if (s == NULL)
545 error = EINVAL;
546 else {
547 error = 0;
548 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
549 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
550 kmem_free(s, strsz);
553 } else {
556 * Set up io vector to read sym link data
558 iov.iov_base = rl->rl_data;
559 iov.iov_len = NFS_MAXPATHLEN;
560 uio.uio_iov = &iov;
561 uio.uio_iovcnt = 1;
562 uio.uio_segflg = UIO_SYSSPACE;
563 uio.uio_extflg = UIO_COPY_CACHED;
564 uio.uio_loffset = (offset_t)0;
565 uio.uio_resid = NFS_MAXPATHLEN;
568 * Do the readlink.
570 error = fop_readlink(vp, &uio, cr, NULL);
572 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
574 if (!error)
575 rl->rl_data[rl->rl_count] = '\0';
580 VN_RELE(vp);
582 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
583 name = nfscmd_convname(ca, exi, rl->rl_data,
584 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
586 if (name != NULL && name != rl->rl_data) {
587 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
588 rl->rl_data = name;
592 * XNFS and RFC1094 require us to return ENXIO if argument
593 * is not a link. UFS returns EINVAL if this is the case,
594 * so we do the mapping here. BUGID 1138002.
596 if (error == EINVAL)
597 rl->rl_status = NFSERR_NXIO;
598 else
599 rl->rl_status = puterrno(error);
602 void *
603 rfs_readlink_getfh(fhandle_t *fhp)
605 return (fhp);
608 * Free data allocated by rfs_readlink
610 void
611 rfs_rlfree(struct nfsrdlnres *rl)
613 if (rl->rl_data != NULL)
614 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
617 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
620 * Read data.
621 * Returns some data read from the file at the given fhandle.
623 /* ARGSUSED */
624 void
625 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
626 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
628 vnode_t *vp;
629 int error;
630 struct vattr va;
631 struct iovec iov;
632 struct uio uio;
633 mblk_t *mp;
634 int alloc_err = 0;
635 int in_crit = 0;
636 caller_context_t ct;
638 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
639 if (vp == NULL) {
640 rr->rr_data = NULL;
641 rr->rr_status = NFSERR_STALE;
642 return;
645 if (vp->v_type != VREG) {
646 VN_RELE(vp);
647 rr->rr_data = NULL;
648 rr->rr_status = NFSERR_ISDIR;
649 return;
652 ct.cc_sysid = 0;
653 ct.cc_pid = 0;
654 ct.cc_caller_id = nfs2_srv_caller_id;
655 ct.cc_flags = CC_DONTBLOCK;
658 * Enter the critical region before calling fop_rwlock
659 * to avoid a deadlock with write requests.
661 if (nbl_need_check(vp)) {
662 nbl_start_crit(vp, RW_READER);
663 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
664 0, NULL)) {
665 nbl_end_crit(vp);
666 VN_RELE(vp);
667 rr->rr_data = NULL;
668 rr->rr_status = NFSERR_ACCES;
669 return;
671 in_crit = 1;
674 error = fop_rwlock(vp, V_WRITELOCK_FALSE, &ct);
676 /* check if a monitor detected a delegation conflict */
677 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
678 VN_RELE(vp);
679 /* mark as wouldblock so response is dropped */
680 curthread->t_flag |= T_WOULDBLOCK;
682 rr->rr_data = NULL;
683 return;
686 va.va_mask = AT_ALL;
688 error = fop_getattr(vp, &va, 0, cr, &ct);
690 if (error) {
691 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
692 if (in_crit)
693 nbl_end_crit(vp);
695 VN_RELE(vp);
696 rr->rr_data = NULL;
697 rr->rr_status = puterrno(error);
699 return;
703 * This is a kludge to allow reading of files created
704 * with no read permission. The owner of the file
705 * is always allowed to read it.
707 if (crgetuid(cr) != va.va_uid) {
708 error = fop_access(vp, VREAD, 0, cr, &ct);
710 if (error) {
712 * Exec is the same as read over the net because
713 * of demand loading.
715 error = fop_access(vp, VEXEC, 0, cr, &ct);
717 if (error) {
718 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
719 if (in_crit)
720 nbl_end_crit(vp);
721 VN_RELE(vp);
722 rr->rr_data = NULL;
723 rr->rr_status = puterrno(error);
725 return;
729 if (MANDLOCK(vp, va.va_mode)) {
730 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
731 if (in_crit)
732 nbl_end_crit(vp);
734 VN_RELE(vp);
735 rr->rr_data = NULL;
736 rr->rr_status = NFSERR_ACCES;
738 return;
741 rr->rr_ok.rrok_wlist_len = 0;
742 rr->rr_ok.rrok_wlist = NULL;
744 if ((uoff_t)ra->ra_offset >= va.va_size) {
745 rr->rr_count = 0;
746 rr->rr_data = NULL;
748 * In this case, status is NFS_OK, but there is no data
749 * to encode. So set rr_mp to NULL.
751 rr->rr_mp = NULL;
752 rr->rr_ok.rrok_wlist = ra->ra_wlist;
753 if (rr->rr_ok.rrok_wlist)
754 clist_zero_len(rr->rr_ok.rrok_wlist);
755 goto done;
758 if (ra->ra_wlist) {
759 mp = NULL;
760 rr->rr_mp = NULL;
761 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
762 if (ra->ra_count > iov.iov_len) {
763 rr->rr_data = NULL;
764 rr->rr_status = NFSERR_INVAL;
765 goto done;
767 } else {
769 * mp will contain the data to be sent out in the read reply.
770 * This will be freed after the reply has been sent out (by the
771 * driver).
772 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
773 * that the call to xdrmblk_putmblk() never fails.
775 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
776 &alloc_err);
777 ASSERT(mp != NULL);
778 ASSERT(alloc_err == 0);
780 rr->rr_mp = mp;
783 * Set up io vector
785 iov.iov_base = (caddr_t)mp->b_datap->db_base;
786 iov.iov_len = ra->ra_count;
789 uio.uio_iov = &iov;
790 uio.uio_iovcnt = 1;
791 uio.uio_segflg = UIO_SYSSPACE;
792 uio.uio_extflg = UIO_COPY_CACHED;
793 uio.uio_loffset = (offset_t)ra->ra_offset;
794 uio.uio_resid = ra->ra_count;
796 error = fop_read(vp, &uio, 0, cr, &ct);
798 if (error) {
799 if (mp)
800 freeb(mp);
803 * check if a monitor detected a delegation conflict and
804 * mark as wouldblock so response is dropped
806 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
807 curthread->t_flag |= T_WOULDBLOCK;
808 else
809 rr->rr_status = puterrno(error);
811 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
812 if (in_crit)
813 nbl_end_crit(vp);
815 VN_RELE(vp);
816 rr->rr_data = NULL;
818 return;
822 * Get attributes again so we can send the latest access
823 * time to the client side for its cache.
825 va.va_mask = AT_ALL;
827 error = fop_getattr(vp, &va, 0, cr, &ct);
829 if (error) {
830 if (mp)
831 freeb(mp);
833 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
834 if (in_crit)
835 nbl_end_crit(vp);
837 VN_RELE(vp);
838 rr->rr_data = NULL;
839 rr->rr_status = puterrno(error);
841 return;
844 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
846 if (mp) {
847 rr->rr_data = (char *)mp->b_datap->db_base;
848 } else {
849 if (ra->ra_wlist) {
850 rr->rr_data = (caddr_t)iov.iov_base;
851 if (!rdma_setup_read_data2(ra, rr)) {
852 rr->rr_data = NULL;
853 rr->rr_status = puterrno(NFSERR_INVAL);
857 done:
858 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
859 if (in_crit)
860 nbl_end_crit(vp);
862 acl_perm(vp, exi, &va, cr);
864 /* check for overflows */
865 error = vattr_to_nattr(&va, &rr->rr_attr);
867 VN_RELE(vp);
869 rr->rr_status = puterrno(error);
873 * Free data allocated by rfs_read
875 void
876 rfs_rdfree(struct nfsrdresult *rr)
878 mblk_t *mp;
880 if (rr->rr_status == NFS_OK) {
881 mp = rr->rr_mp;
882 if (mp != NULL)
883 freeb(mp);
887 void *
888 rfs_read_getfh(struct nfsreadargs *ra)
890 return (&ra->ra_fhandle);
/* Size of the on-stack iovec array used by rfs_write_sync. */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Requests whose mblk chain fit in MAX_IOVECS entries / did not fit. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
901 * Write data to file.
902 * Returns attributes of a file after writing some data to it.
904 * Any changes made here, especially in error handling might have
905 * to also be done in rfs_write (which clusters write requests).
907 /* ARGSUSED */
908 void
909 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
910 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
912 int error;
913 vnode_t *vp;
914 rlim64_t rlimit;
915 struct vattr va;
916 struct uio uio;
917 struct iovec iov[MAX_IOVECS];
918 mblk_t *m;
919 struct iovec *iovp;
920 int iovcnt;
921 cred_t *savecred;
922 int in_crit = 0;
923 caller_context_t ct;
925 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
926 if (vp == NULL) {
927 ns->ns_status = NFSERR_STALE;
928 return;
931 if (rdonly(ro, vp)) {
932 VN_RELE(vp);
933 ns->ns_status = NFSERR_ROFS;
934 return;
937 if (vp->v_type != VREG) {
938 VN_RELE(vp);
939 ns->ns_status = NFSERR_ISDIR;
940 return;
943 ct.cc_sysid = 0;
944 ct.cc_pid = 0;
945 ct.cc_caller_id = nfs2_srv_caller_id;
946 ct.cc_flags = CC_DONTBLOCK;
948 va.va_mask = AT_UID|AT_MODE;
950 error = fop_getattr(vp, &va, 0, cr, &ct);
952 if (error) {
953 VN_RELE(vp);
954 ns->ns_status = puterrno(error);
956 return;
959 if (crgetuid(cr) != va.va_uid) {
961 * This is a kludge to allow writes of files created
962 * with read only permission. The owner of the file
963 * is always allowed to write it.
965 error = fop_access(vp, VWRITE, 0, cr, &ct);
967 if (error) {
968 VN_RELE(vp);
969 ns->ns_status = puterrno(error);
970 return;
975 * Can't access a mandatory lock file. This might cause
976 * the NFS service thread to block forever waiting for a
977 * lock to be released that will never be released.
979 if (MANDLOCK(vp, va.va_mode)) {
980 VN_RELE(vp);
981 ns->ns_status = NFSERR_ACCES;
982 return;
986 * We have to enter the critical region before calling fop_rwlock
987 * to avoid a deadlock with ufs.
989 if (nbl_need_check(vp)) {
990 nbl_start_crit(vp, RW_READER);
991 in_crit = 1;
992 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
993 wa->wa_count, 0, NULL)) {
994 error = EACCES;
995 goto out;
999 error = fop_rwlock(vp, V_WRITELOCK_TRUE, &ct);
1001 /* check if a monitor detected a delegation conflict */
1002 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1003 VN_RELE(vp);
1004 /* mark as wouldblock so response is dropped */
1005 curthread->t_flag |= T_WOULDBLOCK;
1006 return;
1009 if (wa->wa_data || wa->wa_rlist) {
1010 /* Do the RDMA thing if necessary */
1011 if (wa->wa_rlist) {
1012 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1013 iov[0].iov_len = wa->wa_count;
1014 } else {
1015 iov[0].iov_base = wa->wa_data;
1016 iov[0].iov_len = wa->wa_count;
1018 uio.uio_iov = iov;
1019 uio.uio_iovcnt = 1;
1020 uio.uio_segflg = UIO_SYSSPACE;
1021 uio.uio_extflg = UIO_COPY_DEFAULT;
1022 uio.uio_loffset = (offset_t)wa->wa_offset;
1023 uio.uio_resid = wa->wa_count;
1025 * The limit is checked on the client. We
1026 * should allow any size writes here.
1028 uio.uio_llimit = curproc->p_fsz_ctl;
1029 rlimit = uio.uio_llimit - wa->wa_offset;
1030 if (rlimit < (rlim64_t)uio.uio_resid)
1031 uio.uio_resid = (uint_t)rlimit;
1034 * for now we assume no append mode
1037 * We're changing creds because VM may fault and we need
1038 * the cred of the current thread to be used if quota
1039 * checking is enabled.
1041 savecred = curthread->t_cred;
1042 curthread->t_cred = cr;
1043 error = fop_write(vp, &uio, FSYNC, cr, &ct);
1044 curthread->t_cred = savecred;
1045 } else {
1046 iovcnt = 0;
1047 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1048 iovcnt++;
1049 if (iovcnt <= MAX_IOVECS) {
1050 #ifdef DEBUG
1051 rfs_write_sync_hits++;
1052 #endif
1053 iovp = iov;
1054 } else {
1055 #ifdef DEBUG
1056 rfs_write_sync_misses++;
1057 #endif
1058 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1060 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1061 uio.uio_iov = iovp;
1062 uio.uio_iovcnt = iovcnt;
1063 uio.uio_segflg = UIO_SYSSPACE;
1064 uio.uio_extflg = UIO_COPY_DEFAULT;
1065 uio.uio_loffset = (offset_t)wa->wa_offset;
1066 uio.uio_resid = wa->wa_count;
1068 * The limit is checked on the client. We
1069 * should allow any size writes here.
1071 uio.uio_llimit = curproc->p_fsz_ctl;
1072 rlimit = uio.uio_llimit - wa->wa_offset;
1073 if (rlimit < (rlim64_t)uio.uio_resid)
1074 uio.uio_resid = (uint_t)rlimit;
1077 * For now we assume no append mode.
1080 * We're changing creds because VM may fault and we need
1081 * the cred of the current thread to be used if quota
1082 * checking is enabled.
1084 savecred = curthread->t_cred;
1085 curthread->t_cred = cr;
1086 error = fop_write(vp, &uio, FSYNC, cr, &ct);
1087 curthread->t_cred = savecred;
1089 if (iovp != iov)
1090 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1093 fop_rwunlock(vp, V_WRITELOCK_TRUE, &ct);
1095 if (!error) {
1097 * Get attributes again so we send the latest mod
1098 * time to the client side for its cache.
1100 va.va_mask = AT_ALL; /* now we want everything */
1102 error = fop_getattr(vp, &va, 0, cr, &ct);
1104 /* check for overflows */
1105 if (!error) {
1106 acl_perm(vp, exi, &va, cr);
1107 error = vattr_to_nattr(&va, &ns->ns_attr);
1111 out:
1112 if (in_crit)
1113 nbl_end_crit(vp);
1114 VN_RELE(vp);
1116 /* check if a monitor detected a delegation conflict */
1117 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1118 /* mark as wouldblock so response is dropped */
1119 curthread->t_flag |= T_WOULDBLOCK;
1120 else
1121 ns->ns_status = puterrno(error);
1125 struct rfs_async_write {
1126 struct nfswriteargs *wa;
1127 struct nfsattrstat *ns;
1128 struct svc_req *req;
1129 cred_t *cr;
1130 bool_t ro;
1131 kthread_t *thread;
1132 struct rfs_async_write *list;
1135 struct rfs_async_write_list {
1136 fhandle_t *fhp;
1137 kcondvar_t cv;
1138 struct rfs_async_write *list;
1139 struct rfs_async_write_list *next;
1142 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1143 static kmutex_t rfs_async_write_lock;
1144 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1146 #define MAXCLIOVECS 42
1147 #define RFSWRITE_INITVAL (enum nfsstat) -1
1149 #ifdef DEBUG
1150 static int rfs_write_hits = 0;
1151 static int rfs_write_misses = 0;
1152 #endif
1155 * Write data to file.
1156 * Returns attributes of a file after writing some data to it.
1158 void
1159 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1160 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1162 int error;
1163 vnode_t *vp;
1164 rlim64_t rlimit;
1165 struct vattr va;
1166 struct uio uio;
1167 struct rfs_async_write_list *lp;
1168 struct rfs_async_write_list *nlp;
1169 struct rfs_async_write *rp;
1170 struct rfs_async_write *nrp;
1171 struct rfs_async_write *trp;
1172 struct rfs_async_write *lrp;
1173 int data_written;
1174 int iovcnt;
1175 mblk_t *m;
1176 struct iovec *iovp;
1177 struct iovec *niovp;
1178 struct iovec iov[MAXCLIOVECS];
1179 int count;
1180 int rcount;
1181 uint_t off;
1182 uint_t len;
1183 struct rfs_async_write nrpsp;
1184 struct rfs_async_write_list nlpsp;
1185 ushort_t t_flag;
1186 cred_t *savecred;
1187 int in_crit = 0;
1188 caller_context_t ct;
1190 if (!rfs_write_async) {
1191 rfs_write_sync(wa, ns, exi, req, cr, ro);
1192 return;
1196 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1197 * is considered an OK.
1199 ns->ns_status = RFSWRITE_INITVAL;
1201 nrp = &nrpsp;
1202 nrp->wa = wa;
1203 nrp->ns = ns;
1204 nrp->req = req;
1205 nrp->cr = cr;
1206 nrp->ro = ro;
1207 nrp->thread = curthread;
1210 * Look to see if there is already a cluster started
1211 * for this file.
1213 mutex_enter(&rfs_async_write_lock);
1214 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1215 if (bcmp(&wa->wa_fhandle, lp->fhp,
1216 sizeof (fhandle_t)) == 0)
1217 break;
1221 * If lp is non-NULL, then there is already a cluster
1222 * started. We need to place ourselves in the cluster
1223 * list in the right place as determined by starting
1224 * offset. Conflicts with non-blocking mandatory locked
1225 * regions will be checked when the cluster is processed.
1227 if (lp != NULL) {
1228 rp = lp->list;
1229 trp = NULL;
1230 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1231 trp = rp;
1232 rp = rp->list;
1234 nrp->list = rp;
1235 if (trp == NULL)
1236 lp->list = nrp;
1237 else
1238 trp->list = nrp;
1239 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1240 cv_wait(&lp->cv, &rfs_async_write_lock);
1241 mutex_exit(&rfs_async_write_lock);
1243 return;
1247 * No cluster started yet, start one and add ourselves
1248 * to the list of clusters.
1250 nrp->list = NULL;
1252 nlp = &nlpsp;
1253 nlp->fhp = &wa->wa_fhandle;
1254 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1255 nlp->list = nrp;
1256 nlp->next = NULL;
1258 if (rfs_async_write_head == NULL) {
1259 rfs_async_write_head = nlp;
1260 } else {
1261 lp = rfs_async_write_head;
1262 while (lp->next != NULL)
1263 lp = lp->next;
1264 lp->next = nlp;
1266 mutex_exit(&rfs_async_write_lock);
1269 * Convert the file handle common to all of the requests
1270 * in this cluster to a vnode.
1272 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1273 if (vp == NULL) {
1274 mutex_enter(&rfs_async_write_lock);
1275 if (rfs_async_write_head == nlp)
1276 rfs_async_write_head = nlp->next;
1277 else {
1278 lp = rfs_async_write_head;
1279 while (lp->next != nlp)
1280 lp = lp->next;
1281 lp->next = nlp->next;
1283 t_flag = curthread->t_flag & T_WOULDBLOCK;
1284 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1285 rp->ns->ns_status = NFSERR_STALE;
1286 rp->thread->t_flag |= t_flag;
1288 cv_broadcast(&nlp->cv);
1289 mutex_exit(&rfs_async_write_lock);
1291 return;
1295 * Can only write regular files. Attempts to write any
1296 * other file types fail with EISDIR.
1298 if (vp->v_type != VREG) {
1299 VN_RELE(vp);
1300 mutex_enter(&rfs_async_write_lock);
1301 if (rfs_async_write_head == nlp)
1302 rfs_async_write_head = nlp->next;
1303 else {
1304 lp = rfs_async_write_head;
1305 while (lp->next != nlp)
1306 lp = lp->next;
1307 lp->next = nlp->next;
1309 t_flag = curthread->t_flag & T_WOULDBLOCK;
1310 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1311 rp->ns->ns_status = NFSERR_ISDIR;
1312 rp->thread->t_flag |= t_flag;
1314 cv_broadcast(&nlp->cv);
1315 mutex_exit(&rfs_async_write_lock);
1317 return;
1321 * Enter the critical region before calling fop_rwlock, to avoid a
1322 * deadlock with ufs.
1324 if (nbl_need_check(vp)) {
1325 nbl_start_crit(vp, RW_READER);
1326 in_crit = 1;
1329 ct.cc_sysid = 0;
1330 ct.cc_pid = 0;
1331 ct.cc_caller_id = nfs2_srv_caller_id;
1332 ct.cc_flags = CC_DONTBLOCK;
1335 * Lock the file for writing. This operation provides
1336 * the delay which allows clusters to grow.
1338 error = fop_rwlock(vp, V_WRITELOCK_TRUE, &ct);
1340 /* check if a monitor detected a delegation conflict */
1341 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1342 if (in_crit)
1343 nbl_end_crit(vp);
1344 VN_RELE(vp);
1345 /* mark as wouldblock so response is dropped */
1346 curthread->t_flag |= T_WOULDBLOCK;
1347 mutex_enter(&rfs_async_write_lock);
1348 if (rfs_async_write_head == nlp)
1349 rfs_async_write_head = nlp->next;
1350 else {
1351 lp = rfs_async_write_head;
1352 while (lp->next != nlp)
1353 lp = lp->next;
1354 lp->next = nlp->next;
1356 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1357 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1358 rp->ns->ns_status = puterrno(error);
1359 rp->thread->t_flag |= T_WOULDBLOCK;
1362 cv_broadcast(&nlp->cv);
1363 mutex_exit(&rfs_async_write_lock);
1365 return;
1369 * Disconnect this cluster from the list of clusters.
1370 * The cluster that is being dealt with must be fixed
1371 * in size after this point, so there is no reason
1372 * to leave it on the list so that new requests can
1373 * find it.
1375 * The algorithm is that the first write request will
1376 * create a cluster, convert the file handle to a
1377 * vnode pointer, and then lock the file for writing.
1378 * This request is not likely to be clustered with
1379 * any others. However, the next request will create
1380 * a new cluster and be blocked in fop_rwlock while
1381 * the first request is being processed. This delay
1382 * will allow more requests to be clustered in this
1383 * second cluster.
1385 mutex_enter(&rfs_async_write_lock);
1386 if (rfs_async_write_head == nlp)
1387 rfs_async_write_head = nlp->next;
1388 else {
1389 lp = rfs_async_write_head;
1390 while (lp->next != nlp)
1391 lp = lp->next;
1392 lp->next = nlp->next;
1394 mutex_exit(&rfs_async_write_lock);
1397 * Step through the list of requests in this cluster.
1398 * We need to check permissions to make sure that all
1399 * of the requests have sufficient permission to write
1400 * the file. A cluster can be composed of requests
1401 * from different clients and different users on each
1402 * client.
1404 * As a side effect, we also calculate the size of the
1405 * byte range that this cluster encompasses.
1407 rp = nlp->list;
1408 off = rp->wa->wa_offset;
1409 len = (uint_t)0;
1410 do {
1411 if (rdonly(rp->ro, vp)) {
1412 rp->ns->ns_status = NFSERR_ROFS;
1413 t_flag = curthread->t_flag & T_WOULDBLOCK;
1414 rp->thread->t_flag |= t_flag;
1415 continue;
1418 va.va_mask = AT_UID|AT_MODE;
1420 error = fop_getattr(vp, &va, 0, rp->cr, &ct);
1422 if (!error) {
1423 if (crgetuid(rp->cr) != va.va_uid) {
1425 * This is a kludge to allow writes of files
1426 * created with read only permission. The
1427 * owner of the file is always allowed to
1428 * write it.
1430 error = fop_access(vp, VWRITE, 0, rp->cr, &ct);
1432 if (!error && MANDLOCK(vp, va.va_mode))
1433 error = EACCES;
1437 * Check for a conflict with a nbmand-locked region.
1439 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1440 rp->wa->wa_count, 0, NULL)) {
1441 error = EACCES;
1444 if (error) {
1445 rp->ns->ns_status = puterrno(error);
1446 t_flag = curthread->t_flag & T_WOULDBLOCK;
1447 rp->thread->t_flag |= t_flag;
1448 continue;
1450 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1451 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1452 } while ((rp = rp->list) != NULL);
1455 * Step through the cluster attempting to gather as many
1456 * requests which are contiguous as possible. These
1457 * contiguous requests are handled via one call to fop_write
1458 * instead of different calls to fop_write. We also keep
1459 * track of the fact that any data was written.
1461 rp = nlp->list;
1462 data_written = 0;
1463 do {
1465 * Skip any requests which are already marked as having an
1466 * error.
1468 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1469 rp = rp->list;
1470 continue;
1474 * Count the number of iovec's which are required
1475 * to handle this set of requests. One iovec is
1476 * needed for each data buffer, whether addressed
1477 * by wa_data or by the b_rptr pointers in the
1478 * mblk chains.
1480 iovcnt = 0;
1481 lrp = rp;
1482 for (;;) {
1483 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1484 iovcnt++;
1485 else {
1486 m = lrp->wa->wa_mblk;
1487 while (m != NULL) {
1488 iovcnt++;
1489 m = m->b_cont;
1492 if (lrp->list == NULL ||
1493 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1494 lrp->wa->wa_offset + lrp->wa->wa_count !=
1495 lrp->list->wa->wa_offset) {
1496 lrp = lrp->list;
1497 break;
1499 lrp = lrp->list;
1502 if (iovcnt <= MAXCLIOVECS) {
1503 #ifdef DEBUG
1504 rfs_write_hits++;
1505 #endif
1506 niovp = iov;
1507 } else {
1508 #ifdef DEBUG
1509 rfs_write_misses++;
1510 #endif
1511 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1514 * Put together the scatter/gather iovecs.
1516 iovp = niovp;
1517 trp = rp;
1518 count = 0;
1519 do {
1520 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1521 if (trp->wa->wa_rlist) {
1522 iovp->iov_base =
1523 (char *)((trp->wa->wa_rlist)->
1524 u.c_daddr3);
1525 iovp->iov_len = trp->wa->wa_count;
1526 } else {
1527 iovp->iov_base = trp->wa->wa_data;
1528 iovp->iov_len = trp->wa->wa_count;
1530 iovp++;
1531 } else {
1532 m = trp->wa->wa_mblk;
1533 rcount = trp->wa->wa_count;
1534 while (m != NULL) {
1535 iovp->iov_base = (caddr_t)m->b_rptr;
1536 iovp->iov_len = (m->b_wptr - m->b_rptr);
1537 rcount -= iovp->iov_len;
1538 if (rcount < 0)
1539 iovp->iov_len += rcount;
1540 iovp++;
1541 if (rcount <= 0)
1542 break;
1543 m = m->b_cont;
1546 count += trp->wa->wa_count;
1547 trp = trp->list;
1548 } while (trp != lrp);
1550 uio.uio_iov = niovp;
1551 uio.uio_iovcnt = iovcnt;
1552 uio.uio_segflg = UIO_SYSSPACE;
1553 uio.uio_extflg = UIO_COPY_DEFAULT;
1554 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1555 uio.uio_resid = count;
1557 * The limit is checked on the client. We
1558 * should allow any size writes here.
1560 uio.uio_llimit = curproc->p_fsz_ctl;
1561 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1562 if (rlimit < (rlim64_t)uio.uio_resid)
1563 uio.uio_resid = (uint_t)rlimit;
1566 * For now we assume no append mode.
1570 * We're changing creds because VM may fault
1571 * and we need the cred of the current
1572 * thread to be used if quota checking is
1573 * enabled.
1575 savecred = curthread->t_cred;
1576 curthread->t_cred = cr;
1577 error = fop_write(vp, &uio, 0, rp->cr, &ct);
1578 curthread->t_cred = savecred;
1580 /* check if a monitor detected a delegation conflict */
1581 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1582 /* mark as wouldblock so response is dropped */
1583 curthread->t_flag |= T_WOULDBLOCK;
1585 if (niovp != iov)
1586 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1588 if (!error) {
1589 data_written = 1;
1591 * Get attributes again so we send the latest mod
1592 * time to the client side for its cache.
1594 va.va_mask = AT_ALL; /* now we want everything */
1596 error = fop_getattr(vp, &va, 0, rp->cr, &ct);
1598 if (!error)
1599 acl_perm(vp, exi, &va, rp->cr);
1603 * Fill in the status responses for each request
1604 * which was just handled. Also, copy the latest
1605 * attributes in to the attribute responses if
1606 * appropriate.
1608 t_flag = curthread->t_flag & T_WOULDBLOCK;
1609 do {
1610 rp->thread->t_flag |= t_flag;
1611 /* check for overflows */
1612 if (!error) {
1613 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1615 rp->ns->ns_status = puterrno(error);
1616 rp = rp->list;
1617 } while (rp != lrp);
1618 } while (rp != NULL);
1621 * If any data was written at all, then we need to flush
1622 * the data and metadata to stable storage.
1624 if (data_written) {
1625 error = fop_putpage(vp, (uoff_t)off, len, 0, cr, &ct);
1627 if (!error) {
1628 error = fop_fsync(vp, FNODSYNC, cr, &ct);
1632 fop_rwunlock(vp, V_WRITELOCK_TRUE, &ct);
1634 if (in_crit)
1635 nbl_end_crit(vp);
1636 VN_RELE(vp);
1638 t_flag = curthread->t_flag & T_WOULDBLOCK;
1639 mutex_enter(&rfs_async_write_lock);
1640 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1641 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1642 rp->ns->ns_status = puterrno(error);
1643 rp->thread->t_flag |= t_flag;
1646 cv_broadcast(&nlp->cv);
1647 mutex_exit(&rfs_async_write_lock);
1651 void *
1652 rfs_write_getfh(struct nfswriteargs *wa)
1654 return (&wa->wa_fhandle);
1658 * Create a file.
1659 * Creates a file with given attributes and returns those attributes
1660 * and an fhandle for the new file.
1662 void
1663 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1664 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1666 int error;
1667 int lookuperr;
1668 int in_crit = 0;
1669 struct vattr va;
1670 vnode_t *vp;
1671 vnode_t *realvp;
1672 vnode_t *dvp;
1673 char *name = args->ca_da.da_name;
1674 vnode_t *tvp = NULL;
1675 int mode;
1676 int lookup_ok;
1677 bool_t trunc;
1678 struct sockaddr *ca;
1681 * Disallow NULL paths
1683 if (name == NULL || *name == '\0') {
1684 dr->dr_status = NFSERR_ACCES;
1685 return;
1688 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1689 if (dvp == NULL) {
1690 dr->dr_status = NFSERR_STALE;
1691 return;
1694 error = sattr_to_vattr(args->ca_sa, &va);
1695 if (error) {
1696 dr->dr_status = puterrno(error);
1697 return;
1701 * Must specify the mode.
1703 if (!(va.va_mask & AT_MODE)) {
1704 VN_RELE(dvp);
1705 dr->dr_status = NFSERR_INVAL;
1706 return;
1710 * This is a completely gross hack to make mknod
1711 * work over the wire until we can wack the protocol
1713 if ((va.va_mode & IFMT) == IFCHR) {
1714 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1715 va.va_type = VFIFO; /* xtra kludge for named pipe */
1716 else {
1717 va.va_type = VCHR;
1719 * uncompress the received dev_t
1720 * if the top half is zero indicating a request
1721 * from an `older style' OS.
1723 if ((va.va_size & 0xffff0000) == 0)
1724 va.va_rdev = nfsv2_expdev(va.va_size);
1725 else
1726 va.va_rdev = (dev_t)va.va_size;
1728 va.va_mask &= ~AT_SIZE;
1729 } else if ((va.va_mode & IFMT) == IFBLK) {
1730 va.va_type = VBLK;
1732 * uncompress the received dev_t
1733 * if the top half is zero indicating a request
1734 * from an `older style' OS.
1736 if ((va.va_size & 0xffff0000) == 0)
1737 va.va_rdev = nfsv2_expdev(va.va_size);
1738 else
1739 va.va_rdev = (dev_t)va.va_size;
1740 va.va_mask &= ~AT_SIZE;
1741 } else if ((va.va_mode & IFMT) == IFSOCK) {
1742 va.va_type = VSOCK;
1743 } else {
1744 va.va_type = VREG;
1746 va.va_mode &= ~IFMT;
1747 va.va_mask |= AT_TYPE;
1749 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1750 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1751 MAXPATHLEN);
1752 if (name == NULL) {
1753 dr->dr_status = puterrno(EINVAL);
1754 return;
1758 * Why was the choice made to use VWRITE as the mode to the
1759 * call to fop_create ? This results in a bug. When a client
1760 * opens a file that already exists and is RDONLY, the second
1761 * open fails with an EACESS because of the mode.
1762 * bug ID 1054648.
1764 lookup_ok = 0;
1765 mode = VWRITE;
1766 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1767 error = fop_lookup(dvp, name, &tvp, NULL, 0, NULL, cr,
1768 NULL, NULL, NULL);
1769 if (!error) {
1770 struct vattr at;
1772 lookup_ok = 1;
1773 at.va_mask = AT_MODE;
1774 error = fop_getattr(tvp, &at, 0, cr, NULL);
1775 if (!error)
1776 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1777 VN_RELE(tvp);
1778 tvp = NULL;
1782 if (!lookup_ok) {
1783 if (rdonly(ro, dvp)) {
1784 error = EROFS;
1785 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1786 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1787 error = EPERM;
1788 } else {
1789 error = 0;
1794 * If file size is being modified on an already existing file
1795 * make sure that there are no conflicting non-blocking mandatory
1796 * locks in the region being manipulated. Return EACCES if there
1797 * are conflicting locks.
1799 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1800 lookuperr = fop_lookup(dvp, name, &tvp, NULL, 0, NULL, cr,
1801 NULL, NULL, NULL);
1803 if (!lookuperr &&
1804 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1805 VN_RELE(tvp);
1806 curthread->t_flag |= T_WOULDBLOCK;
1807 goto out;
1810 if (!lookuperr && nbl_need_check(tvp)) {
1812 * The file exists. Now check if it has any
1813 * conflicting non-blocking mandatory locks
1814 * in the region being changed.
1816 struct vattr bva;
1817 uoff_t offset;
1818 ssize_t length;
1820 nbl_start_crit(tvp, RW_READER);
1821 in_crit = 1;
1823 bva.va_mask = AT_SIZE;
1824 error = fop_getattr(tvp, &bva, 0, cr, NULL);
1825 if (!error) {
1826 if (va.va_size < bva.va_size) {
1827 offset = va.va_size;
1828 length = bva.va_size - va.va_size;
1829 } else {
1830 offset = bva.va_size;
1831 length = va.va_size - bva.va_size;
1833 if (length) {
1834 if (nbl_conflict(tvp, NBL_WRITE,
1835 offset, length, 0, NULL)) {
1836 error = EACCES;
1840 if (error) {
1841 nbl_end_crit(tvp);
1842 VN_RELE(tvp);
1843 in_crit = 0;
1845 } else if (tvp != NULL) {
1846 VN_RELE(tvp);
1850 if (!error) {
1852 * If filesystem is shared with nosuid the remove any
1853 * setuid/setgid bits on create.
1855 if (va.va_type == VREG &&
1856 exi->exi_export.ex_flags & EX_NOSUID)
1857 va.va_mode &= ~(VSUID | VSGID);
1859 error = fop_create(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1860 NULL, NULL);
1862 if (!error) {
1864 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1865 trunc = TRUE;
1866 else
1867 trunc = FALSE;
1869 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1870 VN_RELE(vp);
1871 curthread->t_flag |= T_WOULDBLOCK;
1872 goto out;
1874 va.va_mask = AT_ALL;
1876 error = fop_getattr(vp, &va, 0, cr, NULL);
1878 /* check for overflows */
1879 if (!error) {
1880 acl_perm(vp, exi, &va, cr);
1881 error = vattr_to_nattr(&va, &dr->dr_attr);
1882 if (!error) {
1883 error = makefh(&dr->dr_fhandle, vp,
1884 exi);
1888 * Force modified metadata out to stable storage.
1890 * if a underlying vp exists, pass it to fop_fsync
1892 if (fop_realvp(vp, &realvp, NULL) == 0)
1893 (void) fop_fsync(realvp, FNODSYNC, cr, NULL);
1894 else
1895 (void) fop_fsync(vp, FNODSYNC, cr, NULL);
1896 VN_RELE(vp);
1899 if (in_crit) {
1900 nbl_end_crit(tvp);
1901 VN_RELE(tvp);
1906 * Force modified data and metadata out to stable storage.
1908 (void) fop_fsync(dvp, 0, cr, NULL);
1910 out:
1912 VN_RELE(dvp);
1914 dr->dr_status = puterrno(error);
1916 if (name != args->ca_da.da_name)
1917 kmem_free(name, MAXPATHLEN);
1919 void *
1920 rfs_create_getfh(struct nfscreatargs *args)
1922 return (args->ca_da.da_fhandle);
1926 * Remove a file.
1927 * Remove named file from parent directory.
1929 /* ARGSUSED */
1930 void
1931 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1932 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1934 int error = 0;
1935 vnode_t *vp;
1936 vnode_t *targvp;
1937 int in_crit = 0;
1940 * Disallow NULL paths
1942 if (da->da_name == NULL || *da->da_name == '\0') {
1943 *status = NFSERR_ACCES;
1944 return;
1947 vp = nfs_fhtovp(da->da_fhandle, exi);
1948 if (vp == NULL) {
1949 *status = NFSERR_STALE;
1950 return;
1953 if (rdonly(ro, vp)) {
1954 VN_RELE(vp);
1955 *status = NFSERR_ROFS;
1956 return;
1960 * Check for a conflict with a non-blocking mandatory share reservation.
1962 error = fop_lookup(vp, da->da_name, &targvp, NULL, 0,
1963 NULL, cr, NULL, NULL, NULL);
1964 if (error != 0) {
1965 VN_RELE(vp);
1966 *status = puterrno(error);
1967 return;
1971 * If the file is delegated to an v4 client, then initiate
1972 * recall and drop this request (by setting T_WOULDBLOCK).
1973 * The client will eventually re-transmit the request and
1974 * (hopefully), by then, the v4 client will have returned
1975 * the delegation.
1978 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1979 VN_RELE(vp);
1980 VN_RELE(targvp);
1981 curthread->t_flag |= T_WOULDBLOCK;
1982 return;
1985 if (nbl_need_check(targvp)) {
1986 nbl_start_crit(targvp, RW_READER);
1987 in_crit = 1;
1988 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1989 error = EACCES;
1990 goto out;
1994 error = fop_remove(vp, da->da_name, cr, NULL, 0);
1997 * Force modified data and metadata out to stable storage.
1999 (void) fop_fsync(vp, 0, cr, NULL);
2001 out:
2002 if (in_crit)
2003 nbl_end_crit(targvp);
2004 VN_RELE(targvp);
2005 VN_RELE(vp);
2007 *status = puterrno(error);
2011 void *
2012 rfs_remove_getfh(struct nfsdiropargs *da)
2014 return (da->da_fhandle);
2018 * rename a file
2019 * Give a file (from) a new name (to).
2021 /* ARGSUSED */
2022 void
2023 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2024 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2026 int error = 0;
2027 vnode_t *fromvp;
2028 vnode_t *tovp;
2029 struct exportinfo *to_exi;
2030 fhandle_t *fh;
2031 vnode_t *srcvp;
2032 vnode_t *targvp;
2033 int in_crit = 0;
2035 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2036 if (fromvp == NULL) {
2037 *status = NFSERR_STALE;
2038 return;
2041 fh = args->rna_to.da_fhandle;
2042 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2043 if (to_exi == NULL) {
2044 VN_RELE(fromvp);
2045 *status = NFSERR_ACCES;
2046 return;
2048 exi_rele(to_exi);
2050 if (to_exi != exi) {
2051 VN_RELE(fromvp);
2052 *status = NFSERR_XDEV;
2053 return;
2056 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2057 if (tovp == NULL) {
2058 VN_RELE(fromvp);
2059 *status = NFSERR_STALE;
2060 return;
2063 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2064 VN_RELE(tovp);
2065 VN_RELE(fromvp);
2066 *status = NFSERR_NOTDIR;
2067 return;
2071 * Disallow NULL paths
2073 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2074 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2075 VN_RELE(tovp);
2076 VN_RELE(fromvp);
2077 *status = NFSERR_ACCES;
2078 return;
2081 if (rdonly(ro, tovp)) {
2082 VN_RELE(tovp);
2083 VN_RELE(fromvp);
2084 *status = NFSERR_ROFS;
2085 return;
2089 * Check for a conflict with a non-blocking mandatory share reservation.
2091 error = fop_lookup(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2092 NULL, cr, NULL, NULL, NULL);
2093 if (error != 0) {
2094 VN_RELE(tovp);
2095 VN_RELE(fromvp);
2096 *status = puterrno(error);
2097 return;
2100 /* Check for delegations on the source file */
2102 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2103 VN_RELE(tovp);
2104 VN_RELE(fromvp);
2105 VN_RELE(srcvp);
2106 curthread->t_flag |= T_WOULDBLOCK;
2107 return;
2110 /* Check for delegation on the file being renamed over, if it exists */
2112 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2113 fop_lookup(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2114 NULL, NULL, NULL) == 0) {
2116 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117 VN_RELE(tovp);
2118 VN_RELE(fromvp);
2119 VN_RELE(srcvp);
2120 VN_RELE(targvp);
2121 curthread->t_flag |= T_WOULDBLOCK;
2122 return;
2124 VN_RELE(targvp);
2128 if (nbl_need_check(srcvp)) {
2129 nbl_start_crit(srcvp, RW_READER);
2130 in_crit = 1;
2131 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2132 error = EACCES;
2133 goto out;
2137 error = fop_rename(fromvp, args->rna_from.da_name,
2138 tovp, args->rna_to.da_name, cr, NULL, 0);
2140 if (error == 0)
2141 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2142 strlen(args->rna_to.da_name));
2145 * Force modified data and metadata out to stable storage.
2147 (void) fop_fsync(tovp, 0, cr, NULL);
2148 (void) fop_fsync(fromvp, 0, cr, NULL);
2150 out:
2151 if (in_crit)
2152 nbl_end_crit(srcvp);
2153 VN_RELE(srcvp);
2154 VN_RELE(tovp);
2155 VN_RELE(fromvp);
2157 *status = puterrno(error);
2160 void *
2161 rfs_rename_getfh(struct nfsrnmargs *args)
2163 return (args->rna_from.da_fhandle);
2167 * Link to a file.
2168 * Create a file (to) which is a hard link to the given file (from).
2170 /* ARGSUSED */
2171 void
2172 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2173 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2175 int error;
2176 vnode_t *fromvp;
2177 vnode_t *tovp;
2178 struct exportinfo *to_exi;
2179 fhandle_t *fh;
2181 fromvp = nfs_fhtovp(args->la_from, exi);
2182 if (fromvp == NULL) {
2183 *status = NFSERR_STALE;
2184 return;
2187 fh = args->la_to.da_fhandle;
2188 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2189 if (to_exi == NULL) {
2190 VN_RELE(fromvp);
2191 *status = NFSERR_ACCES;
2192 return;
2194 exi_rele(to_exi);
2196 if (to_exi != exi) {
2197 VN_RELE(fromvp);
2198 *status = NFSERR_XDEV;
2199 return;
2202 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2203 if (tovp == NULL) {
2204 VN_RELE(fromvp);
2205 *status = NFSERR_STALE;
2206 return;
2209 if (tovp->v_type != VDIR) {
2210 VN_RELE(tovp);
2211 VN_RELE(fromvp);
2212 *status = NFSERR_NOTDIR;
2213 return;
2216 * Disallow NULL paths
2218 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2219 VN_RELE(tovp);
2220 VN_RELE(fromvp);
2221 *status = NFSERR_ACCES;
2222 return;
2225 if (rdonly(ro, tovp)) {
2226 VN_RELE(tovp);
2227 VN_RELE(fromvp);
2228 *status = NFSERR_ROFS;
2229 return;
2232 error = fop_link(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2235 * Force modified data and metadata out to stable storage.
2237 (void) fop_fsync(tovp, 0, cr, NULL);
2238 (void) fop_fsync(fromvp, FNODSYNC, cr, NULL);
2240 VN_RELE(tovp);
2241 VN_RELE(fromvp);
2243 *status = puterrno(error);
2246 void *
2247 rfs_link_getfh(struct nfslinkargs *args)
2249 return (args->la_from);
2253 * Symbolicly link to a file.
2254 * Create a file (to) with the given attributes which is a symbolic link
2255 * to the given path name (to).
2257 void
2258 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2259 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2261 int error;
2262 struct vattr va;
2263 vnode_t *vp;
2264 vnode_t *svp;
2265 int lerror;
2266 struct sockaddr *ca;
2267 char *name = NULL;
2270 * Disallow NULL paths
2272 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2273 *status = NFSERR_ACCES;
2274 return;
2277 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2278 if (vp == NULL) {
2279 *status = NFSERR_STALE;
2280 return;
2283 if (rdonly(ro, vp)) {
2284 VN_RELE(vp);
2285 *status = NFSERR_ROFS;
2286 return;
2289 error = sattr_to_vattr(args->sla_sa, &va);
2290 if (error) {
2291 VN_RELE(vp);
2292 *status = puterrno(error);
2293 return;
2296 if (!(va.va_mask & AT_MODE)) {
2297 VN_RELE(vp);
2298 *status = NFSERR_INVAL;
2299 return;
2302 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2303 name = nfscmd_convname(ca, exi, args->sla_tnm,
2304 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2306 if (name == NULL) {
2307 *status = NFSERR_ACCES;
2308 return;
2311 va.va_type = VLNK;
2312 va.va_mask |= AT_TYPE;
2314 error = fop_symlink(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2317 * Force new data and metadata out to stable storage.
2319 lerror = fop_lookup(vp, args->sla_from.da_name, &svp, NULL, 0,
2320 NULL, cr, NULL, NULL, NULL);
2322 if (!lerror) {
2323 (void) fop_fsync(svp, 0, cr, NULL);
2324 VN_RELE(svp);
2328 * Force modified data and metadata out to stable storage.
2330 (void) fop_fsync(vp, 0, cr, NULL);
2332 VN_RELE(vp);
2334 *status = puterrno(error);
2335 if (name != args->sla_tnm)
2336 kmem_free(name, MAXPATHLEN);
2339 void *
2340 rfs_symlink_getfh(struct nfsslargs *args)
2342 return (args->sla_from.da_fhandle);
2346 * Make a directory.
2347 * Create a directory with the given name, parent directory, and attributes.
2348 * Returns a file handle and attributes for the new directory.
2350 /* ARGSUSED */
2351 void
2352 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2353 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2355 int error;
2356 struct vattr va;
2357 vnode_t *dvp = NULL;
2358 vnode_t *vp;
2359 char *name = args->ca_da.da_name;
2362 * Disallow NULL paths
2364 if (name == NULL || *name == '\0') {
2365 dr->dr_status = NFSERR_ACCES;
2366 return;
2369 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2370 if (vp == NULL) {
2371 dr->dr_status = NFSERR_STALE;
2372 return;
2375 if (rdonly(ro, vp)) {
2376 VN_RELE(vp);
2377 dr->dr_status = NFSERR_ROFS;
2378 return;
2381 error = sattr_to_vattr(args->ca_sa, &va);
2382 if (error) {
2383 VN_RELE(vp);
2384 dr->dr_status = puterrno(error);
2385 return;
2388 if (!(va.va_mask & AT_MODE)) {
2389 VN_RELE(vp);
2390 dr->dr_status = NFSERR_INVAL;
2391 return;
2394 va.va_type = VDIR;
2395 va.va_mask |= AT_TYPE;
2397 error = fop_mkdir(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2399 if (!error) {
2401 * Attribtutes of the newly created directory should
2402 * be returned to the client.
2404 va.va_mask = AT_ALL; /* We want everything */
2405 error = fop_getattr(dvp, &va, 0, cr, NULL);
2407 /* check for overflows */
2408 if (!error) {
2409 acl_perm(vp, exi, &va, cr);
2410 error = vattr_to_nattr(&va, &dr->dr_attr);
2411 if (!error) {
2412 error = makefh(&dr->dr_fhandle, dvp, exi);
2416 * Force new data and metadata out to stable storage.
2418 (void) fop_fsync(dvp, 0, cr, NULL);
2419 VN_RELE(dvp);
2423 * Force modified data and metadata out to stable storage.
2425 (void) fop_fsync(vp, 0, cr, NULL);
2427 VN_RELE(vp);
2429 dr->dr_status = puterrno(error);
2432 void *
2433 rfs_mkdir_getfh(struct nfscreatargs *args)
2435 return (args->ca_da.da_fhandle);
2439 * Remove a directory.
2440 * Remove the given directory name from the given parent directory.
2442 /* ARGSUSED */
2443 void
2444 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2445 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2447 int error;
2448 vnode_t *vp;
2451 * Disallow NULL paths
2453 if (da->da_name == NULL || *da->da_name == '\0') {
2454 *status = NFSERR_ACCES;
2455 return;
2458 vp = nfs_fhtovp(da->da_fhandle, exi);
2459 if (vp == NULL) {
2460 *status = NFSERR_STALE;
2461 return;
2464 if (rdonly(ro, vp)) {
2465 VN_RELE(vp);
2466 *status = NFSERR_ROFS;
2467 return;
2471 * fop_rmdir takes a third argument (the current
2472 * directory of the process). That's because someone
2473 * wants to return EINVAL if one tries to remove ".".
2474 * Of course, NFS servers have no idea what their
2475 * clients' current directories are. We fake it by
2476 * supplying a vnode known to exist and illegal to
2477 * remove.
2479 error = fop_rmdir(vp, da->da_name, rootdir, cr, NULL, 0);
2482 * Force modified data and metadata out to stable storage.
2484 (void) fop_fsync(vp, 0, cr, NULL);
2486 VN_RELE(vp);
2489 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2490 * if the directory is not empty. A System V NFS server
2491 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2492 * over the wire.
2494 if (error == EEXIST)
2495 *status = NFSERR_NOTEMPTY;
2496 else
2497 *status = puterrno(error);
2500 void *
2501 rfs_rmdir_getfh(struct nfsdiropargs *da)
2503 return (da->da_fhandle);
2506 /* ARGSUSED */
2507 void
2508 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2509 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2511 int error;
2512 int iseof;
2513 struct iovec iov;
2514 struct uio uio;
2515 vnode_t *vp;
2516 char *ndata = NULL;
2517 struct sockaddr *ca;
2518 size_t nents;
2519 int ret;
2521 vp = nfs_fhtovp(&rda->rda_fh, exi);
2522 if (vp == NULL) {
2523 rd->rd_entries = NULL;
2524 rd->rd_status = NFSERR_STALE;
2525 return;
2528 if (vp->v_type != VDIR) {
2529 VN_RELE(vp);
2530 rd->rd_entries = NULL;
2531 rd->rd_status = NFSERR_NOTDIR;
2532 return;
2535 (void) fop_rwlock(vp, V_WRITELOCK_FALSE, NULL);
2537 error = fop_access(vp, VREAD, 0, cr, NULL);
2539 if (error) {
2540 rd->rd_entries = NULL;
2541 goto bad;
2544 if (rda->rda_count == 0) {
2545 rd->rd_entries = NULL;
2546 rd->rd_size = 0;
2547 rd->rd_eof = FALSE;
2548 goto bad;
2551 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2554 * Allocate data for entries. This will be freed by rfs_rddirfree.
2556 rd->rd_bufsize = (uint_t)rda->rda_count;
2557 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2560 * Set up io vector to read directory data
2562 iov.iov_base = (caddr_t)rd->rd_entries;
2563 iov.iov_len = rda->rda_count;
2564 uio.uio_iov = &iov;
2565 uio.uio_iovcnt = 1;
2566 uio.uio_segflg = UIO_SYSSPACE;
2567 uio.uio_extflg = UIO_COPY_CACHED;
2568 uio.uio_loffset = (offset_t)rda->rda_offset;
2569 uio.uio_resid = rda->rda_count;
2572 * read directory
2574 error = fop_readdir(vp, &uio, cr, &iseof, NULL, 0);
2577 * Clean up
2579 if (!error) {
2581 * set size and eof
2583 if (uio.uio_resid == rda->rda_count) {
2584 rd->rd_size = 0;
2585 rd->rd_eof = TRUE;
2586 } else {
2587 rd->rd_size = (uint32_t)(rda->rda_count -
2588 uio.uio_resid);
2589 rd->rd_eof = iseof ? TRUE : FALSE;
2593 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2594 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2595 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2596 rda->rda_count, &ndata);
2598 if (ret != 0) {
2599 size_t dropbytes;
2601 * We had to drop one or more entries in order to fit
2602 * during the character conversion. We need to patch
2603 * up the size and eof info.
2605 if (rd->rd_eof)
2606 rd->rd_eof = FALSE;
2607 dropbytes = nfscmd_dropped_entrysize(
2608 (struct dirent64 *)rd->rd_entries, nents, ret);
2609 rd->rd_size -= dropbytes;
2611 if (ndata == NULL) {
2612 ndata = (char *)rd->rd_entries;
2613 } else if (ndata != (char *)rd->rd_entries) {
2614 kmem_free(rd->rd_entries, rd->rd_bufsize);
2615 rd->rd_entries = (void *)ndata;
2616 rd->rd_bufsize = rda->rda_count;
2619 bad:
2620 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
2622 #if 0 /* notyet */
2624 * Don't do this. It causes local disk writes when just
2625 * reading the file and the overhead is deemed larger
2626 * than the benefit.
2629 * Force modified metadata out to stable storage.
2631 (void) fop_fsync(vp, FNODSYNC, cr, NULL);
2632 #endif
2634 VN_RELE(vp);
2636 rd->rd_status = puterrno(error);
2639 void *
2640 rfs_readdir_getfh(struct nfsrddirargs *rda)
2642 return (&rda->rda_fh);
2644 void
2645 rfs_rddirfree(struct nfsrddirres *rd)
2647 if (rd->rd_entries != NULL)
2648 kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 /* ARGSUSED */
2652 void
2653 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2654 struct svc_req *req, cred_t *cr, bool_t ro)
2656 int error;
2657 struct statvfs64 sb;
2658 vnode_t *vp;
2660 vp = nfs_fhtovp(fh, exi);
2661 if (vp == NULL) {
2662 fs->fs_status = NFSERR_STALE;
2663 return;
2666 error = VFS_STATVFS(vp->v_vfsp, &sb);
2668 if (!error) {
2669 fs->fs_tsize = nfstsize();
2670 fs->fs_bsize = sb.f_frsize;
2671 fs->fs_blocks = sb.f_blocks;
2672 fs->fs_bfree = sb.f_bfree;
2673 fs->fs_bavail = sb.f_bavail;
2676 VN_RELE(vp);
2678 fs->fs_status = puterrno(error);
2681 void *
2682 rfs_statfs_getfh(fhandle_t *fh)
2684 return (fh);
2687 static int
2688 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2690 vap->va_mask = 0;
2693 * There was a sign extension bug in some VFS based systems
2694 * which stored the mode as a short. When it would get
2695 * assigned to a u_long, no sign extension would occur.
2696 * It needed to, but this wasn't noticed because sa_mode
2697 * would then get assigned back to the short, thus ignoring
2698 * the upper 16 bits of sa_mode.
2700 * To make this implementation work for both broken
2701 * clients and good clients, we check for both versions
2702 * of the mode.
2704 if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2705 sa->sa_mode != (uint32_t)-1) {
2706 vap->va_mask |= AT_MODE;
2707 vap->va_mode = sa->sa_mode;
2709 if (sa->sa_uid != (uint32_t)-1) {
2710 vap->va_mask |= AT_UID;
2711 vap->va_uid = sa->sa_uid;
2713 if (sa->sa_gid != (uint32_t)-1) {
2714 vap->va_mask |= AT_GID;
2715 vap->va_gid = sa->sa_gid;
2717 if (sa->sa_size != (uint32_t)-1) {
2718 vap->va_mask |= AT_SIZE;
2719 vap->va_size = sa->sa_size;
2721 if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2722 sa->sa_atime.tv_usec != (int32_t)-1) {
2723 #ifndef _LP64
2724 /* return error if time overflow */
2725 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2726 return (EOVERFLOW);
2727 #endif
2728 vap->va_mask |= AT_ATIME;
2730 * nfs protocol defines times as unsigned so don't extend sign,
2731 * unless sysadmin set nfs_allow_preepoch_time.
2733 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2734 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2736 if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2737 sa->sa_mtime.tv_usec != (int32_t)-1) {
2738 #ifndef _LP64
2739 /* return error if time overflow */
2740 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2741 return (EOVERFLOW);
2742 #endif
2743 vap->va_mask |= AT_MTIME;
2745 * nfs protocol defines times as unsigned so don't extend sign,
2746 * unless sysadmin set nfs_allow_preepoch_time.
2748 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2749 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2751 return (0);
2754 static enum nfsftype vt_to_nf[] = {
2755 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2759 * check the following fields for overflow: nodeid, size, and time.
2760 * There could be a problem when converting 64-bit LP64 fields
2761 * into 32-bit ones. Return an error if there is an overflow.
2764 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2766 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2767 na->na_type = vt_to_nf[vap->va_type];
2769 if (vap->va_mode == (unsigned short) -1)
2770 na->na_mode = (uint32_t)-1;
2771 else
2772 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2774 if (vap->va_uid == (unsigned short)(-1))
2775 na->na_uid = (uint32_t)(-1);
2776 else if (vap->va_uid == UID_NOBODY)
2777 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2778 else
2779 na->na_uid = vap->va_uid;
2781 if (vap->va_gid == (unsigned short)(-1))
2782 na->na_gid = (uint32_t)-1;
2783 else if (vap->va_gid == GID_NOBODY)
2784 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2785 else
2786 na->na_gid = vap->va_gid;
2789 * Do we need to check fsid for overflow? It is 64-bit in the
2790 * vattr, but are bigger than 32 bit values supported?
2792 na->na_fsid = vap->va_fsid;
2794 na->na_nodeid = vap->va_nodeid;
2797 * Check to make sure that the nodeid is representable over the
2798 * wire without losing bits.
2800 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2801 return (EFBIG);
2802 na->na_nlink = vap->va_nlink;
2805 * Check for big files here, instead of at the caller. See
2806 * comments in cstat for large special file explanation.
2808 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2809 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2810 return (EFBIG);
2811 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2812 /* UNKNOWN_SIZE | OVERFLOW */
2813 na->na_size = MAXOFF32_T;
2814 } else
2815 na->na_size = vap->va_size;
2816 } else
2817 na->na_size = vap->va_size;
2820 * If the vnode times overflow the 32-bit times that NFS2
2821 * uses on the wire then return an error.
2823 if (!NFS_VAP_TIME_OK(vap)) {
2824 return (EOVERFLOW);
2826 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2827 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2829 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2830 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2832 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2833 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2836 * If the dev_t will fit into 16 bits then compress
2837 * it, otherwise leave it alone. See comments in
2838 * nfs_client.c.
2840 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2841 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2842 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2843 else
2844 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2846 na->na_blocks = vap->va_nblocks;
2847 na->na_blocksize = vap->va_blksize;
2850 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2851 * over-the-wire protocols for named-pipe vnodes. It remaps the
2852 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2854 * BUYER BEWARE:
2855 * If you are porting the NFS to a non-Sun server, you probably
2856 * don't want to include the following block of code. The
2857 * over-the-wire special file types will be changing with the
2858 * NFS Protocol Revision.
2860 if (vap->va_type == VFIFO)
2861 NA_SETFIFO(na);
2862 return (0);
2866 * acl v2 support: returns approximate permission.
2867 * default: returns minimal permission (more restrictive)
2868 * aclok: returns maximal permission (less restrictive)
2869 * This routine changes the permissions that are alaredy in *va.
2870 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2871 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2873 static void
2874 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2876 vsecattr_t vsa;
2877 int aclcnt;
2878 aclent_t *aclentp;
2879 mode_t mask_perm;
2880 mode_t grp_perm;
2881 mode_t other_perm;
2882 mode_t other_orig;
2883 int error;
2885 /* dont care default acl */
2886 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2887 error = fop_getsecattr(vp, &vsa, 0, cr, NULL);
2889 if (!error) {
2890 aclcnt = vsa.vsa_aclcnt;
2891 if (aclcnt > MIN_ACL_ENTRIES) {
2892 /* non-trivial ACL */
2893 aclentp = vsa.vsa_aclentp;
2894 if (exi->exi_export.ex_flags & EX_ACLOK) {
2895 /* maximal permissions */
2896 grp_perm = 0;
2897 other_perm = 0;
2898 for (; aclcnt > 0; aclcnt--, aclentp++) {
2899 switch (aclentp->a_type) {
2900 case USER_OBJ:
2901 break;
2902 case USER:
2903 grp_perm |=
2904 aclentp->a_perm << 3;
2905 other_perm |= aclentp->a_perm;
2906 break;
2907 case GROUP_OBJ:
2908 grp_perm |=
2909 aclentp->a_perm << 3;
2910 break;
2911 case GROUP:
2912 other_perm |= aclentp->a_perm;
2913 break;
2914 case OTHER_OBJ:
2915 other_orig = aclentp->a_perm;
2916 break;
2917 case CLASS_OBJ:
2918 mask_perm = aclentp->a_perm;
2919 break;
2920 default:
2921 break;
2924 grp_perm &= mask_perm << 3;
2925 other_perm &= mask_perm;
2926 other_perm |= other_orig;
2928 } else {
2929 /* minimal permissions */
2930 grp_perm = 070;
2931 other_perm = 07;
2932 for (; aclcnt > 0; aclcnt--, aclentp++) {
2933 switch (aclentp->a_type) {
2934 case USER_OBJ:
2935 break;
2936 case USER:
2937 case CLASS_OBJ:
2938 grp_perm &=
2939 aclentp->a_perm << 3;
2940 other_perm &=
2941 aclentp->a_perm;
2942 break;
2943 case GROUP_OBJ:
2944 grp_perm &=
2945 aclentp->a_perm << 3;
2946 break;
2947 case GROUP:
2948 other_perm &=
2949 aclentp->a_perm;
2950 break;
2951 case OTHER_OBJ:
2952 other_perm &=
2953 aclentp->a_perm;
2954 break;
2955 default:
2956 break;
2960 /* copy to va */
2961 va->va_mode &= ~077;
2962 va->va_mode |= grp_perm | other_perm;
2964 if (vsa.vsa_aclcnt)
2965 kmem_free(vsa.vsa_aclentp,
2966 vsa.vsa_aclcnt * sizeof (aclent_t));
2970 void
2971 rfs_srvrinit(void)
2973 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2974 nfs2_srv_caller_id = fs_new_caller_id();
2977 void
2978 rfs_srvrfini(void)
2980 mutex_destroy(&rfs_async_write_lock);
2983 static int
2984 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2986 struct clist *wcl;
2987 int wlist_len;
2988 uint32_t count = rr->rr_count;
2990 wcl = ra->ra_wlist;
2992 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2993 return (FALSE);
2996 wcl = ra->ra_wlist;
2997 rr->rr_ok.rrok_wlist_len = wlist_len;
2998 rr->rr_ok.rrok_wlist = wcl;
3000 return (TRUE);