1 /* $NetBSD: nfs_socket.c,v 1.183 2009/12/06 18:00:15 dyoung Exp $ */
4 * Copyright (c) 1989, 1991, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Rick Macklem at The University of Guelph.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
38 * Socket operations for use by nfs
41 #include <sys/cdefs.h>
42 __KERNEL_RCSID(0, "$NetBSD: nfs_socket.c,v 1.183 2009/12/06 18:00:15 dyoung Exp $");
47 #include "opt_mbuftrace.h"
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/evcnt.h>
53 #include <sys/callout.h>
55 #include <sys/mount.h>
56 #include <sys/kernel.h>
59 #include <sys/vnode.h>
60 #include <sys/domain.h>
61 #include <sys/protosw.h>
62 #include <sys/socket.h>
63 #include <sys/socketvar.h>
64 #include <sys/syslog.h>
65 #include <sys/tprintf.h>
66 #include <sys/namei.h>
67 #include <sys/signal.h>
68 #include <sys/signalvar.h>
69 #include <sys/kauth.h>
71 #include <netinet/in.h>
72 #include <netinet/tcp.h>
74 #include <nfs/rpcv2.h>
75 #include <nfs/nfsproto.h>
77 #include <nfs/xdr_subs.h>
78 #include <nfs/nfsm_subs.h>
79 #include <nfs/nfsmount.h>
80 #include <nfs/nfsnode.h>
81 #include <nfs/nfsrtt.h>
82 #include <nfs/nfs_var.h>
/* Mbuf owner tag so NFS traffic can be accounted under MBUFTRACE. */
85 struct mowner nfs_mowner
= MOWNER_INIT("nfs","");
89 * Estimate rto for an nfs rpc sent via. an unreliable datagram.
90 * Use the mean and mean deviation of rtt for the appropriate type of rpc
91 * for the frequent rpcs and a default for the others.
92 * The justification for doing "other" this way is that these rpcs
93 * happen so infrequently that timer est. would probably be stale.
94 * Also, since many of these rpcs are
95 * non-idempotent, a conservative timeout is desired.
96 * getattr, lookup - A+2D
/*
 * Retransmit-timeout estimate for a mount (n) and timer class (t):
 * class 0 falls back to the mount's static nm_timeo; otherwise the
 * estimate is built from the smoothed rtt (nm_srtt) plus the mean
 * deviation (nm_sdrtt) for that class, Van Jacobson style.
 * NOTE(review): the branch line selecting between the two weighted
 * forms appears to be missing from this extraction — confirm against
 * the full source before relying on the expansion below.
 */
100 #define NFS_RTO(n, t) \
101 ((t) == 0 ? (n)->nm_timeo : \
103 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
104 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
/* Smoothed rtt / rtt-deviation slots for request r, indexed by the
 * timer class assigned to r's procedure number in proct[]. */
105 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
106 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
108 * External data, mostly RPC constants in XDR form
/*
 * RPC protocol constants, pre-converted to XDR (network) byte order
 * elsewhere so they can be compared directly against wire words.
 */
110 extern u_int32_t rpc_reply
, rpc_msgdenied
, rpc_mismatch
, rpc_vers
,
111 rpc_auth_unix
, rpc_msgaccepted
, rpc_call
, rpc_autherr
,
/* NFS program number, in XDR form. */
113 extern u_int32_t nfs_prog
;
/* Maps v2 procedure numbers to their v3 equivalents. */
114 extern const int nfsv3_procid
[NFS_NPROCS
];
/* Timer granularity (callout ticks between nfs_timer runs). */
115 extern int nfs_ticks
;
119 * Avoid spamming the console with debugging messages. We only print
120 * the nfs timer and reply error debugs every 10 seconds.
/* Rate-limit state for the debug printfs: at most one message per
 * 10-second window, checked with ratecheck(9). */
122 static const struct timeval nfs_err_interval
= { 10, 0 };
/* Timestamp of the last rate-limited reply-error message. */
123 static struct timeval nfs_reply_last_err_time
__attribute__((__used__
));
/* Timestamp of the last rate-limited timer-error message. */
124 static struct timeval nfs_timer_last_err_time
__attribute__((__used__
));
128 * Defines which timer to use for the procnum.
/*
 * Per-procedure timer class: 0 = use the default timeout, 1..3 index
 * the srtt/sdrtt slots kept per mount (getattr-like, lookup-like,
 * read/readdir-like).  Designated initializers; unlisted procedures
 * default to 0.
 * NOTE(review): several entries (e.g. READ/WRITE) and the closing
 * brace are absent from this extraction — confirm against the full
 * source.
 */
135 static const int proct
[NFS_NPROCS
] = {
137 [NFSPROC_GETATTR
] = 1,
138 [NFSPROC_SETATTR
] = 0,
139 [NFSPROC_LOOKUP
] = 2,
140 [NFSPROC_ACCESS
] = 1,
141 [NFSPROC_READLINK
] = 3,
144 [NFSPROC_CREATE
] = 0,
146 [NFSPROC_SYMLINK
] = 0,
148 [NFSPROC_REMOVE
] = 0,
150 [NFSPROC_RENAME
] = 0,
152 [NFSPROC_READDIR
] = 3,
153 [NFSPROC_READDIRPLUS
] = 3,
154 [NFSPROC_FSSTAT
] = 0,
155 [NFSPROC_FSINFO
] = 0,
156 [NFSPROC_PATHCONF
] = 0,
157 [NFSPROC_COMMIT
] = 0,
162 * There is a congestion window for outstanding rpcs maintained per mount
163 * point. The cwnd size is adjusted in roughly the way that:
164 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
165 * SIGCOMM '88". ACM, August 1988.
166 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
167 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
168 * of rpcs is in progress.
169 * (The sent count and cwnd are scaled for integer arith.)
170 * Variants of "slow start" were tried and were found to be too much of a
171 * performance hit (ave. rtt 3 times larger),
172 * I suspect due to the large rtt that nfs rpcs have.
/* Congestion window is kept scaled by NFS_CWNDSCALE for integer
 * arithmetic; NFS_MAXCWND caps it at 32 outstanding rpcs. */
174 #define NFS_CWNDSCALE 256
175 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
/* Exponential backoff multipliers indexed by retransmit count. */
176 static const int nfs_backoff
[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
/* Round-trip-time log exported for nfsstat-style tools. */
178 struct nfsrtt nfsrtt
;
/* Global list of outstanding client requests, oldest first. */
179 struct nfsreqhead nfs_reqq
;
/* Callout driving the periodic nfs_timer scan. */
180 static callout_t nfs_timer_ch
;
/* Event counters for timer runs, starts, and stops. */
181 static struct evcnt nfs_timer_ev
;
182 static struct evcnt nfs_timer_start_ev
;
183 static struct evcnt nfs_timer_stop_ev
;
/* Protects nfs_timer_srvvec (the server-side timer hook). */
184 static kmutex_t nfs_timer_lock
;
/* Optional hook into the NFS server's timer work; NULL when the
 * server module is not loaded (see nfs_timer_srvinit/srvfini). */
185 static bool (*nfs_timer_srvvec
)(void);
/* Per-mount send/receive serialization; the *lock() variants can fail
 * with an interrupt/termination error, the unlock() ones cannot. */
188 static int nfs_sndlock(struct nfsmount
*, struct nfsreq
*);
189 static void nfs_sndunlock(struct nfsmount
*);
191 static int nfs_rcvlock(struct nfsmount
*, struct nfsreq
*);
192 static void nfs_rcvunlock(struct nfsmount
*);
195 * Initialize sockets and congestion for a new NFS connection.
196 * We do not free the sockaddr if error.
199 nfs_connect(struct nfsmount
*nmp
, struct nfsreq
*rep
, struct lwp
*l
)
202 int error
, rcvreserve
, sndreserve
;
203 struct sockaddr
*saddr
;
204 struct sockaddr_in
*sin
;
205 struct sockaddr_in6
*sin6
;
210 saddr
= mtod(nmp
->nm_nam
, struct sockaddr
*);
211 error
= socreate(saddr
->sa_family
, &nmp
->nm_so
,
212 nmp
->nm_sotype
, nmp
->nm_soproto
, l
, NULL
);
217 so
->so_mowner
= &nfs_mowner
;
218 so
->so_rcv
.sb_mowner
= &nfs_mowner
;
219 so
->so_snd
.sb_mowner
= &nfs_mowner
;
221 nmp
->nm_soflags
= so
->so_proto
->pr_flags
;
224 * Some servers require that the client port be a reserved port number.
226 if (saddr
->sa_family
== AF_INET
&& (nmp
->nm_flag
& NFSMNT_RESVPORT
)) {
227 val
= IP_PORTRANGE_LOW
;
229 if ((error
= so_setsockopt(NULL
, so
, IPPROTO_IP
, IP_PORTRANGE
,
232 m
= m_get(M_WAIT
, MT_SONAME
);
233 MCLAIM(m
, so
->so_mowner
);
234 sin
= mtod(m
, struct sockaddr_in
*);
235 sin
->sin_len
= m
->m_len
= sizeof (struct sockaddr_in
);
236 sin
->sin_family
= AF_INET
;
237 sin
->sin_addr
.s_addr
= INADDR_ANY
;
239 error
= sobind(so
, m
, &lwp0
);
244 if (saddr
->sa_family
== AF_INET6
&& (nmp
->nm_flag
& NFSMNT_RESVPORT
)) {
245 val
= IPV6_PORTRANGE_LOW
;
247 if ((error
= so_setsockopt(NULL
, so
, IPPROTO_IPV6
,
248 IPV6_PORTRANGE
, &val
, sizeof(val
))))
250 m
= m_get(M_WAIT
, MT_SONAME
);
251 MCLAIM(m
, so
->so_mowner
);
252 sin6
= mtod(m
, struct sockaddr_in6
*);
253 memset(sin6
, 0, sizeof(*sin6
));
254 sin6
->sin6_len
= m
->m_len
= sizeof (struct sockaddr_in6
);
255 sin6
->sin6_family
= AF_INET6
;
256 error
= sobind(so
, m
, &lwp0
);
263 * Protocols that do not require connections may be optionally left
264 * unconnected for servers that reply from a port other than NFS_PORT.
267 if (nmp
->nm_flag
& NFSMNT_NOCONN
) {
268 if (nmp
->nm_soflags
& PR_CONNREQUIRED
) {
274 error
= soconnect(so
, nmp
->nm_nam
, l
);
281 * Wait for the connection to complete. Cribbed from the
282 * connect system call but with the wait timing out so
283 * that interruptible mounts don't hang here for a long time.
285 while ((so
->so_state
& SS_ISCONNECTING
) && so
->so_error
== 0) {
286 (void)sowait(so
, false, 2 * hz
);
287 if ((so
->so_state
& SS_ISCONNECTING
) &&
288 so
->so_error
== 0 && rep
&&
289 (error
= nfs_sigintr(nmp
, rep
, rep
->r_lwp
)) != 0){
290 so
->so_state
&= ~SS_ISCONNECTING
;
296 error
= so
->so_error
;
302 if (nmp
->nm_flag
& (NFSMNT_SOFT
| NFSMNT_INT
)) {
303 so
->so_rcv
.sb_timeo
= (5 * hz
);
304 so
->so_snd
.sb_timeo
= (5 * hz
);
307 * enable receive timeout to detect server crash and reconnect.
308 * otherwise, we can be stuck in soreceive forever.
310 so
->so_rcv
.sb_timeo
= (5 * hz
);
311 so
->so_snd
.sb_timeo
= 0;
313 if (nmp
->nm_sotype
== SOCK_DGRAM
) {
314 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
) * 2;
315 rcvreserve
= (max(nmp
->nm_rsize
, nmp
->nm_readdirsize
) +
317 } else if (nmp
->nm_sotype
== SOCK_SEQPACKET
) {
318 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
) * 2;
319 rcvreserve
= (max(nmp
->nm_rsize
, nmp
->nm_readdirsize
) +
323 if (nmp
->nm_sotype
!= SOCK_STREAM
)
324 panic("nfscon sotype");
325 if (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) {
327 so_setsockopt(NULL
, so
, SOL_SOCKET
, SO_KEEPALIVE
, &val
,
330 if (so
->so_proto
->pr_protocol
== IPPROTO_TCP
) {
332 so_setsockopt(NULL
, so
, IPPROTO_TCP
, TCP_NODELAY
, &val
,
335 sndreserve
= (nmp
->nm_wsize
+ NFS_MAXPKTHDR
+
336 sizeof (u_int32_t
)) * 2;
337 rcvreserve
= (nmp
->nm_rsize
+ NFS_MAXPKTHDR
+
338 sizeof (u_int32_t
)) * 2;
341 error
= soreserve(so
, sndreserve
, rcvreserve
);
346 so
->so_rcv
.sb_flags
|= SB_NOINTR
;
347 so
->so_snd
.sb_flags
|= SB_NOINTR
;
350 /* Initialize other non-zero congestion variables */
351 nmp
->nm_srtt
[0] = nmp
->nm_srtt
[1] = nmp
->nm_srtt
[2] = nmp
->nm_srtt
[3] =
353 nmp
->nm_sdrtt
[0] = nmp
->nm_sdrtt
[1] = nmp
->nm_sdrtt
[2] =
354 nmp
->nm_sdrtt
[3] = 0;
355 nmp
->nm_cwnd
= NFS_MAXCWND
/ 2; /* Initial send window */
357 nmp
->nm_timeouts
= 0;
367 * Called when a connection is broken on a reliable protocol.
368 * - clean up the old socket
369 * - nfs_connect() again
370 * - set R_MUSTRESEND for all outstanding requests on mount point
371 * If this fails the mount point is DEAD!
372 * nb: Must be called with the nfs_sndlock() set on the mount point.
/*
 * Re-establish a broken connection for rep's mount: retry
 * nfs_connect() until it succeeds or the wait is interrupted, then
 * mark every outstanding request on this mount R_MUSTRESEND so the
 * timer retransmits it on the new socket.  Caller holds the sndlock.
 */
375 nfs_reconnect(struct nfsreq
*rep
)
378 struct nfsmount
*nmp
= rep
->r_nmp
;
/* Keep trying to connect; give up only on signal/restart. */
382 while ((error
= nfs_connect(nmp
, rep
, &lwp0
)) != 0) {
383 if (error
== EINTR
|| error
== ERESTART
)
/* Pause a second between attempts so we don't spin. */
385 kpause("nfscn2", false, hz
, NULL
);
389 * Loop through outstanding request list and fix up all requests
/* Flag each of this mount's requests for resend; R_REXMITTED
 * records that at least one retransmission happened. */
392 TAILQ_FOREACH(rp
, &nfs_reqq
, r_chain
) {
393 if (rp
->r_nmp
== nmp
) {
394 if ((rp
->r_flags
& R_MUSTRESEND
) == 0)
395 rp
->r_flags
|= R_MUSTRESEND
| R_REXMITTED
;
403 * NFS disconnect. Clean up and unlink.
/*
 * Tear down the mount's socket.  Shut it down for both directions,
 * then — when the mount is being dismounted (NFSMNT_DISMNT) — drain
 * all threads still waiting on the send/receive locks so *nmp cannot
 * be freed out from under them.
 */
406 nfs_disconnect(struct nfsmount
*nmp
)
415 soshutdown(so
, SHUT_RDWR
);
/* Only a dismount requires the full waiter drain below. */
417 drain
= (nmp
->nm_iflag
& NFSMNT_DISMNT
) != 0;
420 * soshutdown() above should wake up the current
422 * Now wake up those waiting for the receive lock, and
423 * wait for them to go away unhappy, to prevent *nmp
424 * from evaporating while they're sleeping.
426 mutex_enter(&nmp
->nm_lock
);
427 while (nmp
->nm_waiters
> 0) {
/* Kick both lock queues, then sleep until a waiter leaves. */
428 cv_broadcast(&nmp
->nm_rcvcv
);
429 cv_broadcast(&nmp
->nm_sndcv
);
430 cv_wait(&nmp
->nm_disconcv
, &nmp
->nm_lock
);
432 mutex_exit(&nmp
->nm_lock
);
/* Sanity: after a drain no waiter may remain. */
437 if (drain
&& (nmp
->nm_waiters
> 0))
438 panic("nfs_disconnect: waiters left after drain?");
/*
 * Disconnect while synchronizing with concurrent receivers: take the
 * receive lock with a zeroed dummy request (so no real request state
 * is consulted) before tearing the socket down.
 */
443 nfs_safedisconnect(struct nfsmount
*nmp
)
445 struct nfsreq dummyreq
;
447 memset(&dummyreq
, 0, sizeof(dummyreq
));
448 dummyreq
.r_nmp
= nmp
;
449 nfs_rcvlock(nmp
, &dummyreq
); /* XXX ignored error return */
455 * This is the nfs send routine. For connection based socket types, it
456 * must be called with an nfs_sndlock() on the socket.
457 * "rep == NULL" indicates that it has been called from a server.
458 * For the client side:
459 * - return EINTR if the RPC is terminated, 0 otherwise
460 * - set R_MUSTRESEND if the send fails for any reason
461 * - do any cleanup required by recoverable socket errors (? ? ?)
462 * For the server side:
463 * - return EINTR or ERESTART if interrupted by a signal
464 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
465 * - do any cleanup required by recoverable socket errors (? ? ?)
468 nfs_send(struct socket
*so
, struct mbuf
*nam
, struct mbuf
*top
, struct nfsreq
*rep
, struct lwp
*l
)
470 struct mbuf
*sendnam
;
471 int error
, soflags
, flags
;
473 /* XXX nfs_doio()/nfs_request() calls with rep->r_lwp == NULL */
474 if (l
== NULL
&& rep
->r_lwp
== NULL
)
478 if (rep
->r_flags
& R_SOFTTERM
) {
482 if ((so
= rep
->r_nmp
->nm_so
) == NULL
) {
483 rep
->r_flags
|= R_MUSTRESEND
;
487 rep
->r_flags
&= ~R_MUSTRESEND
;
488 soflags
= rep
->r_nmp
->nm_soflags
;
490 soflags
= so
->so_proto
->pr_flags
;
491 if ((soflags
& PR_CONNREQUIRED
) || (so
->so_state
& SS_ISCONNECTED
))
495 if (so
->so_type
== SOCK_SEQPACKET
)
500 error
= (*so
->so_send
)(so
, sendnam
, NULL
, top
, NULL
, flags
, l
);
503 if (error
== ENOBUFS
&& so
->so_type
== SOCK_DGRAM
) {
505 * We're too fast for the network/driver,
506 * and UDP isn't flowcontrolled.
507 * We need to resend. This is not fatal,
510 * Could be smarter here by doing some sort
511 * of a backoff, but this is rare.
513 rep
->r_flags
|= R_MUSTRESEND
;
517 "nfs send error %d for %s\n",
519 rep
->r_nmp
->nm_mountp
->
520 mnt_stat
.f_mntfromname
);
522 * Deal with errors for the client side.
524 if (rep
->r_flags
& R_SOFTTERM
)
526 else if (error
!= EMSGSIZE
)
527 rep
->r_flags
|= R_MUSTRESEND
;
531 * See above. This error can happen under normal
532 * circumstances and the log is too noisy.
533 * The error will still show up in nfsstat.
535 if (error
!= ENOBUFS
|| so
->so_type
!= SOCK_DGRAM
)
536 log(LOG_INFO
, "nfsd send error %d\n", error
);
540 * Handle any recoverable (soft) socket errors here. (? ? ?)
542 if (error
!= EINTR
&& error
!= ERESTART
&&
543 error
!= EWOULDBLOCK
&& error
!= EPIPE
&&
552 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
553 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
554 * Mark and consolidate the data into a new mbuf list.
555 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
557 * For SOCK_STREAM we must be very careful to read an entire record once
558 * we have read any of it, even if the system call has been interrupted.
561 nfs_receive(struct nfsreq
*rep
, struct mbuf
**aname
, struct mbuf
**mp
,
568 struct mbuf
*control
;
570 struct mbuf
**getnam
;
571 int error
, sotype
, rcvflg
;
574 * Set up arguments for soreceive()
578 sotype
= rep
->r_nmp
->nm_sotype
;
581 * For reliable protocols, lock against other senders/receivers
582 * in case a reconnect is necessary.
583 * For SOCK_STREAM, first get the Record Mark to find out how much
584 * more there is to get.
585 * We must lock the socket against other receivers
586 * until we have an entire rpc request/reply.
588 if (sotype
!= SOCK_DGRAM
) {
589 error
= nfs_sndlock(rep
->r_nmp
, rep
);
594 * Check for fatal errors and resending request.
597 * Ugh: If a reconnect attempt just happened, nm_so
598 * would have changed. NULL indicates a failed
599 * attempt that has essentially shut down this
602 if (rep
->r_mrep
|| (rep
->r_flags
& R_SOFTTERM
)) {
603 nfs_sndunlock(rep
->r_nmp
);
606 so
= rep
->r_nmp
->nm_so
;
608 error
= nfs_reconnect(rep
);
610 nfs_sndunlock(rep
->r_nmp
);
615 while (rep
->r_flags
& R_MUSTRESEND
) {
616 m
= m_copym(rep
->r_mreq
, 0, M_COPYALL
, M_WAIT
);
617 nfsstats
.rpcretries
++;
619 rep
->r_flags
&= ~R_TIMING
;
620 error
= nfs_send(so
, rep
->r_nmp
->nm_nam
, m
, rep
, l
);
622 if (error
== EINTR
|| error
== ERESTART
||
623 (error
= nfs_reconnect(rep
)) != 0) {
624 nfs_sndunlock(rep
->r_nmp
);
630 nfs_sndunlock(rep
->r_nmp
);
631 if (sotype
== SOCK_STREAM
) {
632 aio
.iov_base
= (void *) &len
;
633 aio
.iov_len
= sizeof(u_int32_t
);
636 auio
.uio_rw
= UIO_READ
;
638 auio
.uio_resid
= sizeof(u_int32_t
);
639 UIO_SETUP_SYSSPACE(&auio
);
641 rcvflg
= MSG_WAITALL
;
642 error
= (*so
->so_receive
)(so
, NULL
, &auio
,
643 NULL
, NULL
, &rcvflg
);
644 if (error
== EWOULDBLOCK
&& rep
) {
645 if (rep
->r_flags
& R_SOFTTERM
)
648 * if it seems that the server died after it
649 * received our request, set EPIPE so that
650 * we'll reconnect and retransmit requests.
652 if (rep
->r_rexmit
>= rep
->r_nmp
->nm_retry
) {
653 nfsstats
.rpctimeouts
++;
657 } while (error
== EWOULDBLOCK
);
658 if (!error
&& auio
.uio_resid
> 0) {
660 * Don't log a 0 byte receive; it means
661 * that the socket has been closed, and
662 * can happen during normal operation
663 * (forcible unmount or Solaris server).
665 if (auio
.uio_resid
!= sizeof (u_int32_t
))
667 "short receive (%lu/%lu) from nfs server %s\n",
668 (u_long
)sizeof(u_int32_t
) - auio
.uio_resid
,
669 (u_long
)sizeof(u_int32_t
),
670 rep
->r_nmp
->nm_mountp
->mnt_stat
.f_mntfromname
);
675 len
= ntohl(len
) & ~0x80000000;
677 * This is SERIOUS! We are out of sync with the sender
678 * and forcing a disconnect/reconnect is all I can do.
680 if (len
> NFS_MAXPACKET
) {
681 log(LOG_ERR
, "%s (%d) from nfs server %s\n",
682 "impossible packet length",
684 rep
->r_nmp
->nm_mountp
->mnt_stat
.f_mntfromname
);
688 auio
.uio_resid
= len
;
690 rcvflg
= MSG_WAITALL
;
691 error
= (*so
->so_receive
)(so
, NULL
,
692 &auio
, mp
, NULL
, &rcvflg
);
693 } while (error
== EWOULDBLOCK
|| error
== EINTR
||
695 if (!error
&& auio
.uio_resid
> 0) {
696 if (len
!= auio
.uio_resid
)
698 "short receive (%lu/%d) from nfs server %s\n",
699 (u_long
)len
- auio
.uio_resid
, len
,
700 rep
->r_nmp
->nm_mountp
->mnt_stat
.f_mntfromname
);
705 * NB: Since uio_resid is big, MSG_WAITALL is ignored
706 * and soreceive() will return when it has either a
707 * control msg or a data msg.
708 * We have no use for control msg., but must grab them
709 * and then throw them away so we know what is going
712 auio
.uio_resid
= len
= 100000000; /* Anything Big */
713 /* not need to setup uio_vmspace */
716 error
= (*so
->so_receive
)(so
, NULL
,
717 &auio
, mp
, &control
, &rcvflg
);
720 if (error
== EWOULDBLOCK
&& rep
) {
721 if (rep
->r_flags
& R_SOFTTERM
)
724 } while (error
== EWOULDBLOCK
||
725 (!error
&& *mp
== NULL
&& control
));
726 if ((rcvflg
& MSG_EOR
) == 0)
728 if (!error
&& *mp
== NULL
)
730 len
-= auio
.uio_resid
;
733 if (error
&& error
!= EINTR
&& error
!= ERESTART
) {
738 "receive error %d from nfs server %s\n",
740 rep
->r_nmp
->nm_mountp
->mnt_stat
.f_mntfromname
);
741 error
= nfs_sndlock(rep
->r_nmp
, rep
);
743 error
= nfs_reconnect(rep
);
747 nfs_sndunlock(rep
->r_nmp
);
750 if ((so
= rep
->r_nmp
->nm_so
) == NULL
)
752 if (so
->so_state
& SS_ISCONNECTED
)
756 auio
.uio_resid
= len
= 1000000;
757 /* not need to setup uio_vmspace */
760 error
= (*so
->so_receive
)(so
, getnam
, &auio
, mp
,
762 if (error
== EWOULDBLOCK
&&
763 (rep
->r_flags
& R_SOFTTERM
))
765 } while (error
== EWOULDBLOCK
);
766 len
-= auio
.uio_resid
;
767 if (!error
&& *mp
== NULL
)
778 * Implement receipt of reply on a socket.
779 * We must search through the list of received datagrams matching them
780 * with outstanding requests using the xid, until ours is found.
784 nfs_reply(struct nfsreq
*myrep
, struct lwp
*lwp
)
787 struct nfsmount
*nmp
= myrep
->r_nmp
;
789 struct mbuf
*mrep
, *nam
, *md
;
795 * Loop around until we get our own reply
799 * Lock against other receivers so that I don't get stuck in
800 * sbwait() after someone else has received my reply for me.
801 * Also necessary for connection based protocols to avoid
802 * race conditions during a reconnect.
804 error
= nfs_rcvlock(nmp
, myrep
);
805 if (error
== EALREADY
)
810 * Get the next Rpc reply off the socket
813 mutex_enter(&nmp
->nm_lock
);
815 mutex_exit(&nmp
->nm_lock
);
817 error
= nfs_receive(myrep
, &nam
, &mrep
, lwp
);
819 mutex_enter(&nmp
->nm_lock
);
821 cv_signal(&nmp
->nm_disconcv
);
822 mutex_exit(&nmp
->nm_lock
);
827 if (nmp
->nm_iflag
& NFSMNT_DISMNT
) {
829 * Oops, we're going away now..
834 * Ignore routing errors on connectionless protocols? ?
836 if (NFSIGNORE_SOERROR(nmp
->nm_soflags
, error
)) {
837 nmp
->nm_so
->so_error
= 0;
839 if (ratecheck(&nfs_reply_last_err_time
,
841 printf("%s: ignoring error %d\n",
852 * Get the xid and check that it is an rpc reply
855 dpos
= mtod(md
, void *);
856 nfsm_dissect(tl
, u_int32_t
*, 2*NFSX_UNSIGNED
);
858 if (*tl
!= rpc_reply
) {
859 nfsstats
.rpcinvalid
++;
867 * Loop through the request list to match up the reply
868 * Iff no match, just drop the datagram
870 TAILQ_FOREACH(rep
, &nfs_reqq
, r_chain
) {
871 if (rep
->r_mrep
== NULL
&& rxid
== rep
->r_xid
) {
879 rt
= &nfsrtt
.rttl
[nfsrtt
.pos
];
880 rt
->proc
= rep
->r_procnum
;
881 rt
->rto
= NFS_RTO(nmp
, proct
[rep
->r_procnum
]);
882 rt
->sent
= nmp
->nm_sent
;
883 rt
->cwnd
= nmp
->nm_cwnd
;
884 rt
->srtt
= nmp
->nm_srtt
[proct
[rep
->r_procnum
] - 1];
885 rt
->sdrtt
= nmp
->nm_sdrtt
[proct
[rep
->r_procnum
] - 1];
886 rt
->fsid
= nmp
->nm_mountp
->mnt_stat
.f_fsidx
;
887 getmicrotime(&rt
->tstamp
);
888 if (rep
->r_flags
& R_TIMING
)
889 rt
->rtt
= rep
->r_rtt
;
892 nfsrtt
.pos
= (nfsrtt
.pos
+ 1) % NFSRTTLOGSIZ
;
895 * Update congestion window.
896 * Do the additive increase of
899 if (nmp
->nm_cwnd
<= nmp
->nm_sent
) {
901 (NFS_CWNDSCALE
* NFS_CWNDSCALE
+
902 (nmp
->nm_cwnd
>> 1)) / nmp
->nm_cwnd
;
903 if (nmp
->nm_cwnd
> NFS_MAXCWND
)
904 nmp
->nm_cwnd
= NFS_MAXCWND
;
906 rep
->r_flags
&= ~R_SENT
;
907 nmp
->nm_sent
-= NFS_CWNDSCALE
;
909 * Update rtt using a gain of 0.125 on the mean
910 * and a gain of 0.25 on the deviation.
912 if (rep
->r_flags
& R_TIMING
) {
914 * Since the timer resolution of
915 * NFS_HZ is so course, it can often
916 * result in r_rtt == 0. Since
917 * r_rtt == N means that the actual
918 * rtt is between N+dt and N+2-dt ticks,
922 t1
-= (NFS_SRTT(rep
) >> 3);
926 t1
-= (NFS_SDRTT(rep
) >> 2);
927 NFS_SDRTT(rep
) += t1
;
929 nmp
->nm_timeouts
= 0;
935 * If not matched to a request, drop it.
936 * If it's mine, get out.
939 nfsstats
.rpcunexpected
++;
941 } else if (rep
== myrep
) {
942 if (rep
->r_mrep
== NULL
)
943 panic("nfsreply nil");
950 * nfs_request - goes something like this
951 * - fill in request struct
952 * - links it into list
953 * - calls nfs_send() for first transmit
954 * - calls nfs_receive() to get reply
955 * - break down rpc header and return with nfs reply pointed to
957 * nb: always frees up mreq mbuf list
960 nfs_request(struct nfsnode
*np
, struct mbuf
*mrest
, int procnum
, struct lwp
*lwp
, kauth_cred_t cred
, struct mbuf
**mrp
, struct mbuf
**mdp
, char **dposp
, int *rexmitp
)
962 struct mbuf
*m
, *mrep
;
966 struct nfsmount
*nmp
= VFSTONFS(np
->n_vnode
->v_mount
);
967 struct mbuf
*md
, *mheadend
;
968 char nickv
[RPCX_NICKVERF
];
971 int t1
, s
, error
= 0, mrest_len
, auth_len
, auth_type
;
972 int trylater_delay
= NFS_TRYLATERDEL
, failed_auth
= 0;
973 int verf_len
, verf_type
;
975 char *auth_str
, *verf_str
;
976 NFSKERBKEY_T key
; /* save session key */
978 struct mbuf
*mrest_backup
= NULL
;
979 kauth_cred_t origcred
= NULL
; /* XXX: gcc */
980 bool retry_cred
= true;
981 bool use_opencred
= (np
->n_flag
& NUSEOPENCRED
) != 0;
986 acred
= kauth_cred_alloc();
989 KASSERT(cred
!= NULL
);
990 rep
= kmem_alloc(sizeof(*rep
), KM_SLEEP
);
992 KASSERT(lwp
== NULL
|| lwp
== curlwp
);
994 rep
->r_procnum
= procnum
;
1004 * Get the RPC header with authorization.
1007 verf_str
= auth_str
= NULL
;
1008 if (nmp
->nm_flag
& NFSMNT_KERB
) {
1010 verf_len
= sizeof (nickv
);
1011 auth_type
= RPCAUTH_KERB4
;
1012 memset((void *)key
, 0, sizeof (key
));
1013 if (failed_auth
|| nfs_getnickauth(nmp
, cred
, &auth_str
,
1014 &auth_len
, verf_str
, verf_len
)) {
1015 error
= nfs_getauth(nmp
, rep
, cred
, &auth_str
,
1016 &auth_len
, verf_str
, &verf_len
, key
);
1018 kmem_free(rep
, sizeof(*rep
));
1020 KASSERT(kauth_cred_getrefcnt(acred
) == 1);
1021 kauth_cred_free(acred
);
1032 * on the most unix filesystems, permission checks are
1033 * done when the file is open(2)'ed.
1034 * ie. once a file is successfully open'ed,
1035 * following i/o operations never fail with EACCES.
1036 * we try to follow the semantics as far as possible.
1038 * note that we expect that the nfs server always grant
1039 * accesses by the file's owner.
1045 case NFSPROC_COMMIT
:
1046 uid
= np
->n_vattr
->va_uid
;
1047 gid
= np
->n_vattr
->va_gid
;
1048 if (kauth_cred_geteuid(cred
) == uid
&&
1049 kauth_cred_getegid(cred
) == gid
) {
1055 kauth_cred_setuid(acred
, uid
);
1056 kauth_cred_seteuid(acred
, uid
);
1057 kauth_cred_setsvuid(acred
, uid
);
1058 kauth_cred_setgid(acred
, gid
);
1059 kauth_cred_setegid(acred
, gid
);
1060 kauth_cred_setsvgid(acred
, gid
);
1068 * backup mbuf chain if we can need it later to retry.
1070 * XXX maybe we can keep a direct reference to
1071 * mrest without doing m_copym, but it's ...ugly.
1074 mrest_backup
= m_copym(mrest
, 0, M_COPYALL
, M_WAIT
);
1075 auth_type
= RPCAUTH_UNIX
;
1076 /* XXX elad - ngroups */
1077 auth_len
= (((kauth_cred_ngroups(cred
) > nmp
->nm_numgrps
) ?
1078 nmp
->nm_numgrps
: kauth_cred_ngroups(cred
)) << 2) +
1081 m
= nfsm_rpchead(cred
, nmp
->nm_flag
, procnum
, auth_type
, auth_len
,
1082 auth_str
, verf_len
, verf_str
, mrest
, mrest_len
, &mheadend
, &xid
);
1084 free(auth_str
, M_TEMP
);
1087 * For stream protocols, insert a Sun RPC Record Mark.
1089 if (nmp
->nm_sotype
== SOCK_STREAM
) {
1090 M_PREPEND(m
, NFSX_UNSIGNED
, M_WAIT
);
1091 *mtod(m
, u_int32_t
*) = htonl(0x80000000 |
1092 (m
->m_pkthdr
.len
- NFSX_UNSIGNED
));
1097 if (nmp
->nm_flag
& NFSMNT_SOFT
)
1098 rep
->r_retry
= nmp
->nm_retry
;
1100 rep
->r_retry
= NFS_MAXREXMIT
+ 1; /* past clip limit */
1101 rep
->r_rtt
= rep
->r_rexmit
= 0;
1102 if (proct
[procnum
] > 0)
1103 rep
->r_flags
= R_TIMING
;
1109 * Do the client side RPC.
1111 nfsstats
.rpcrequests
++;
1113 * Chain request into list of outstanding requests. Be sure
1114 * to put it LAST so timer finds oldest requests first.
1117 TAILQ_INSERT_TAIL(&nfs_reqq
, rep
, r_chain
);
1121 * If backing off another request or avoiding congestion, don't
1122 * send this one now but let timer do it. If not timing a request,
1125 if (nmp
->nm_so
&& (nmp
->nm_sotype
!= SOCK_DGRAM
||
1126 (nmp
->nm_flag
& NFSMNT_DUMBTIMR
) || nmp
->nm_sent
< nmp
->nm_cwnd
)) {
1128 if (nmp
->nm_soflags
& PR_CONNREQUIRED
)
1129 error
= nfs_sndlock(nmp
, rep
);
1131 m
= m_copym(rep
->r_mreq
, 0, M_COPYALL
, M_WAIT
);
1132 error
= nfs_send(nmp
->nm_so
, nmp
->nm_nam
, m
, rep
, lwp
);
1133 if (nmp
->nm_soflags
& PR_CONNREQUIRED
)
1136 if (!error
&& (rep
->r_flags
& R_MUSTRESEND
) == 0) {
1137 nmp
->nm_sent
+= NFS_CWNDSCALE
;
1138 rep
->r_flags
|= R_SENT
;
1146 * Wait for the reply from our send or the timer's.
1148 if (!error
|| error
== EPIPE
|| error
== EWOULDBLOCK
)
1149 error
= nfs_reply(rep
, lwp
);
1152 * RPC done, unlink the request.
1155 TAILQ_REMOVE(&nfs_reqq
, rep
, r_chain
);
1159 * Decrement the outstanding request count.
1161 if (rep
->r_flags
& R_SENT
) {
1162 rep
->r_flags
&= ~R_SENT
; /* paranoia */
1163 nmp
->nm_sent
-= NFS_CWNDSCALE
;
1166 if (rexmitp
!= NULL
) {
1169 if (nmp
->nm_sotype
!= SOCK_DGRAM
)
1170 rexmit
= (rep
->r_flags
& R_REXMITTED
) != 0;
1172 rexmit
= rep
->r_rexmit
;
1177 * If there was a successful reply and a tprintf msg.
1178 * tprintf a response.
1180 if (!error
&& (rep
->r_flags
& R_TPRINTFMSG
))
1181 nfs_msg(rep
->r_lwp
, nmp
->nm_mountp
->mnt_stat
.f_mntfromname
,
1190 * break down the rpc header and check if ok
1192 nfsm_dissect(tl
, u_int32_t
*, 3 * NFSX_UNSIGNED
);
1193 if (*tl
++ == rpc_msgdenied
) {
1194 if (*tl
== rpc_mismatch
)
1196 else if ((nmp
->nm_flag
& NFSMNT_KERB
) && *tl
++ == rpc_autherr
) {
1199 mheadend
->m_next
= NULL
;
1201 m_freem(rep
->r_mreq
);
1212 * Grab any Kerberos verifier, otherwise just throw it away.
1214 verf_type
= fxdr_unsigned(int, *tl
++);
1215 i
= fxdr_unsigned(int32_t, *tl
);
1216 if ((nmp
->nm_flag
& NFSMNT_KERB
) && verf_type
== RPCAUTH_KERB4
) {
1217 error
= nfs_savenickauth(nmp
, cred
, i
, key
, &md
, &dpos
, mrep
);
1221 nfsm_adv(nfsm_rndup(i
));
1222 nfsm_dissect(tl
, u_int32_t
*, NFSX_UNSIGNED
);
1225 nfsm_dissect(tl
, u_int32_t
*, NFSX_UNSIGNED
);
1227 error
= fxdr_unsigned(int, *tl
);
1250 m_freem(rep
->r_mreq
);
1251 kmem_free(rep
, sizeof(*rep
));
1252 use_opencred
= !use_opencred
;
1253 if (mrest_backup
== NULL
) {
1254 /* m_copym failure */
1256 kauth_cred_getrefcnt(acred
) == 1);
1257 kauth_cred_free(acred
);
1260 mrest
= mrest_backup
;
1261 mrest_backup
= NULL
;
1307 case NFSERR_TIMEDOUT
:
1311 case NFSERR_NAMETOL
:
1312 error
= ENAMETOOLONG
;
1315 case NFSERR_NOTEMPTY
:
1325 * If the File Handle was stale, invalidate the
1326 * lookup cache, just in case.
1329 cache_purge(NFSTOV(np
));
1337 case NFSERR_BADHANDLE
:
1338 case NFSERR_NOT_SYNC
:
1339 case NFSERR_BAD_COOKIE
:
1343 case NFSERR_NOTSUPP
:
1347 case NFSERR_TOOSMALL
:
1348 case NFSERR_SERVERFAULT
:
1349 case NFSERR_BADTYPE
:
1353 case NFSERR_TRYLATER
:
1354 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) == 0)
1358 waituntil
= time_second
+ trylater_delay
;
1359 while (time_second
< waituntil
) {
1360 kpause("nfstrylater", false, hz
, NULL
);
1362 trylater_delay
*= NFS_TRYLATERDELMUL
;
1363 if (trylater_delay
> NFS_TRYLATERDELMAX
)
1364 trylater_delay
= NFS_TRYLATERDELMAX
;
1367 * The client should wait and then try
1368 * the request with a new RPC transaction ID.
1375 printf("Invalid rpc error code %d\n", error
);
1381 if (nmp
->nm_flag
& NFSMNT_NFSV3
) {
1385 error
|= NFSERR_RETERR
;
1392 * note which credential worked to minimize number of retries.
1395 np
->n_flag
|= NUSEOPENCRED
;
1397 np
->n_flag
&= ~NUSEOPENCRED
;
1403 KASSERT(error
== 0);
1407 error
= EPROTONOSUPPORT
;
1409 KASSERT(kauth_cred_getrefcnt(acred
) == 1);
1410 kauth_cred_free(acred
);
1411 m_freem(rep
->r_mreq
);
1412 kmem_free(rep
, sizeof(*rep
));
1413 m_freem(mrest_backup
);
1419 * Generate the rpc reply header
1420 * siz arg. is used to decide if adding a cluster is worthwhile
1423 nfs_rephead(int siz
, struct nfsrv_descript
*nd
, struct nfssvc_sock
*slp
, int err
, int cache
, u_quad_t
*frev
, struct mbuf
**mrq
, struct mbuf
**mbp
, char **bposp
)
1430 mreq
= m_gethdr(M_WAIT
, MT_DATA
);
1431 MCLAIM(mreq
, &nfs_mowner
);
1434 * If this is a big reply, use a cluster else
1435 * try and leave leading space for the lower level headers.
1437 siz
+= RPC_REPLYSIZ
;
1438 if (siz
>= max_datalen
) {
1439 m_clget(mreq
, M_WAIT
);
1441 mreq
->m_data
+= max_hdr
;
1442 tl
= mtod(mreq
, u_int32_t
*);
1443 mreq
->m_len
= 6 * NFSX_UNSIGNED
;
1444 bpos
= ((char *)tl
) + mreq
->m_len
;
1445 *tl
++ = txdr_unsigned(nd
->nd_retxid
);
1447 if (err
== ERPCMISMATCH
|| (err
& NFSERR_AUTHERR
)) {
1448 *tl
++ = rpc_msgdenied
;
1449 if (err
& NFSERR_AUTHERR
) {
1450 *tl
++ = rpc_autherr
;
1451 *tl
= txdr_unsigned(err
& ~NFSERR_AUTHERR
);
1452 mreq
->m_len
-= NFSX_UNSIGNED
;
1453 bpos
-= NFSX_UNSIGNED
;
1455 *tl
++ = rpc_mismatch
;
1456 *tl
++ = txdr_unsigned(RPC_VER2
);
1457 *tl
= txdr_unsigned(RPC_VER2
);
1460 *tl
++ = rpc_msgaccepted
;
1463 * For Kerberos authentication, we must send the nickname
1464 * verifier back, otherwise just RPCAUTH_NULL.
1466 if (nd
->nd_flag
& ND_KERBFULL
) {
1467 struct nfsuid
*nuidp
;
1468 struct timeval ktvin
, ktvout
;
1470 memset(&ktvout
, 0, sizeof ktvout
); /* XXX gcc */
1473 NUIDHASH(slp
, kauth_cred_geteuid(nd
->nd_cr
)),
1475 if (kauth_cred_geteuid(nuidp
->nu_cr
) ==
1476 kauth_cred_geteuid(nd
->nd_cr
) &&
1477 (!nd
->nd_nam2
|| netaddr_match(
1478 NU_NETFAM(nuidp
), &nuidp
->nu_haddr
,
1484 txdr_unsigned(nuidp
->nu_timestamp
.tv_sec
1487 txdr_unsigned(nuidp
->nu_timestamp
.tv_usec
);
1490 * Encrypt the timestamp in ecb mode using the
1497 *tl
++ = rpc_auth_kerb
;
1498 *tl
++ = txdr_unsigned(3 * NFSX_UNSIGNED
);
1499 *tl
= ktvout
.tv_sec
;
1500 nfsm_build(tl
, u_int32_t
*, 3 * NFSX_UNSIGNED
);
1501 *tl
++ = ktvout
.tv_usec
;
1502 *tl
++ = txdr_unsigned(
1503 kauth_cred_geteuid(nuidp
->nu_cr
));
1514 *tl
= txdr_unsigned(RPC_PROGUNAVAIL
);
1517 *tl
= txdr_unsigned(RPC_PROGMISMATCH
);
1518 nfsm_build(tl
, u_int32_t
*, 2 * NFSX_UNSIGNED
);
1519 *tl
++ = txdr_unsigned(2);
1520 *tl
= txdr_unsigned(3);
1523 *tl
= txdr_unsigned(RPC_PROCUNAVAIL
);
1526 *tl
= txdr_unsigned(RPC_GARBAGE
);
1530 if (err
!= NFSERR_RETVOID
) {
1531 nfsm_build(tl
, u_int32_t
*, NFSX_UNSIGNED
);
1533 *tl
= txdr_unsigned(nfsrv_errmap(nd
, err
));
1545 if (err
!= 0 && err
!= NFSERR_RETVOID
)
1546 nfsstats
.srvrpc_errs
++;
/*
 * nfs_timer_schedule: (re)arm the NFS retransmit timer callout so that
 * nfs_timer() fires again after nfs_ticks.
 * NOTE(review): this chunk is an elided extraction — the return-type line
 * and braces of this function are missing from view.
 */
1551 nfs_timer_schedule(void)
1554 callout_schedule(&nfs_timer_ch
, nfs_ticks
);
/*
 * nfs_timer_start: start the NFS timer if it is not already pending.
 * Bumps the "timer start" event counter and schedules the callout.
 * NOTE(review): lines 1562-1563 are elided here — presumably the early
 * return taken when the callout is already pending; confirm in the
 * original source before relying on this.
 */
1558 nfs_timer_start(void)
1561 if (callout_pending(&nfs_timer_ch
))
1564 nfs_timer_start_ev
.ev_count
++;
1565 nfs_timer_schedule();
/*
 * nfs_timer_init: one-time initialization of the NFS timer machinery:
 * the timer mutex, the callout (handler nfs_timer, no callout flags),
 * and three dynamically attached event counters (timer fired / started /
 * stopped).  Mirrors nfs_timer_fini() below, which tears all of this
 * down in reverse.
 */
1569 nfs_timer_init(void)
1572 mutex_init(&nfs_timer_lock
, MUTEX_DEFAULT
, IPL_NONE
);
1573 callout_init(&nfs_timer_ch
, 0);
1574 callout_setfunc(&nfs_timer_ch
, nfs_timer
, NULL
);
/* NOTE(review): the name-string arguments of this first evcnt_attach_dynamic
 * call (original line 1576) are elided in this extraction. */
1575 evcnt_attach_dynamic(&nfs_timer_ev
, EVCNT_TYPE_MISC
, NULL
,
1577 evcnt_attach_dynamic(&nfs_timer_start_ev
, EVCNT_TYPE_MISC
, NULL
,
1578 "nfs", "timer start");
1579 evcnt_attach_dynamic(&nfs_timer_stop_ev
, EVCNT_TYPE_MISC
, NULL
,
1580 "nfs", "timer stop");
/*
 * nfs_timer_fini: tear down everything nfs_timer_init() set up —
 * drain and destroy the callout (callout_halt waits for a running
 * handler to finish), destroy the mutex, and detach the three event
 * counters.
 */
1584 nfs_timer_fini(void)
1587 callout_halt(&nfs_timer_ch
, NULL
);
1588 callout_destroy(&nfs_timer_ch
);
1589 mutex_destroy(&nfs_timer_lock
);
1590 evcnt_detach(&nfs_timer_ev
);
1591 evcnt_detach(&nfs_timer_start_ev
);
1592 evcnt_detach(&nfs_timer_stop_ev
);
/*
 * nfs_timer_srvinit: register the server-side timer hook; nfs_timer()
 * calls it (under nfs_timer_lock) on each tick.
 * NOTE(review): the visible fragment stores the pointer without taking
 * nfs_timer_lock, unlike nfs_timer_srvfini() below — the locking lines
 * may simply be elided from this extraction; confirm against the
 * original source.
 */
1596 nfs_timer_srvinit(bool (*func
)(void))
1599 nfs_timer_srvvec
= func
;
/*
 * nfs_timer_srvfini: unregister the server-side timer hook.  Clearing
 * the vector under nfs_timer_lock guarantees nfs_timer() cannot be
 * concurrently dereferencing it (nfs_timer checks it under the same
 * lock).
 */
1603 nfs_timer_srvfini(void)
1606 mutex_enter(&nfs_timer_lock
);
1607 nfs_timer_srvvec
= NULL
;
1608 mutex_exit(&nfs_timer_lock
);
1614 * Scan the nfsreq list and retranmit any requests that have timed out
1615 * To avoid retransmission attempts on STREAM sockets (in the future) make
1616 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1619 nfs_timer(void *arg
)
1624 struct nfsmount
*nmp
;
1629 nfs_timer_ev
.ev_count
++;
1631 mutex_enter(softnet_lock
); /* XXX PR 40491 */
1632 TAILQ_FOREACH(rep
, &nfs_reqq
, r_chain
) {
1635 if (rep
->r_mrep
|| (rep
->r_flags
& R_SOFTTERM
))
1637 if (nfs_sigintr(nmp
, rep
, rep
->r_lwp
)) {
1638 rep
->r_flags
|= R_SOFTTERM
;
1641 if (rep
->r_rtt
>= 0) {
1643 if (nmp
->nm_flag
& NFSMNT_DUMBTIMR
)
1644 timeo
= nmp
->nm_timeo
;
1646 timeo
= NFS_RTO(nmp
, proct
[rep
->r_procnum
]);
1647 if (nmp
->nm_timeouts
> 0)
1648 timeo
*= nfs_backoff
[nmp
->nm_timeouts
- 1];
1649 if (timeo
> NFS_MAXTIMEO
)
1650 timeo
= NFS_MAXTIMEO
;
1651 if (rep
->r_rtt
<= timeo
)
1653 if (nmp
->nm_timeouts
<
1654 (sizeof(nfs_backoff
) / sizeof(nfs_backoff
[0])))
1658 * Check for server not responding
1660 if ((rep
->r_flags
& R_TPRINTFMSG
) == 0 &&
1661 rep
->r_rexmit
> nmp
->nm_deadthresh
) {
1663 nmp
->nm_mountp
->mnt_stat
.f_mntfromname
,
1665 rep
->r_flags
|= R_TPRINTFMSG
;
1667 if (rep
->r_rexmit
>= rep
->r_retry
) { /* too many */
1668 nfsstats
.rpctimeouts
++;
1669 rep
->r_flags
|= R_SOFTTERM
;
1672 if (nmp
->nm_sotype
!= SOCK_DGRAM
) {
1673 if (++rep
->r_rexmit
> NFS_MAXREXMIT
)
1674 rep
->r_rexmit
= NFS_MAXREXMIT
;
1677 if ((so
= nmp
->nm_so
) == NULL
)
1681 * If there is enough space and the window allows..
1683 * Set r_rtt to -1 in case we fail to send it now.
1685 /* solock(so); XXX PR 40491 */
1687 if (sbspace(&so
->so_snd
) >= rep
->r_mreq
->m_pkthdr
.len
&&
1688 ((nmp
->nm_flag
& NFSMNT_DUMBTIMR
) ||
1689 (rep
->r_flags
& R_SENT
) ||
1690 nmp
->nm_sent
< nmp
->nm_cwnd
) &&
1691 (m
= m_copym(rep
->r_mreq
, 0, M_COPYALL
, M_DONTWAIT
))){
1692 if (so
->so_state
& SS_ISCONNECTED
)
1693 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_SEND
, m
,
1696 error
= (*so
->so_proto
->pr_usrreq
)(so
, PRU_SEND
, m
,
1697 nmp
->nm_nam
, NULL
, NULL
);
1699 if (NFSIGNORE_SOERROR(nmp
->nm_soflags
, error
)) {
1701 if (ratecheck(&nfs_timer_last_err_time
,
1703 printf("%s: ignoring error "
1704 "%d\n", __func__
, error
);
1710 * Iff first send, start timing
1711 * else turn timing off, backoff timer
1712 * and divide congestion window by 2.
1714 if (rep
->r_flags
& R_SENT
) {
1715 rep
->r_flags
&= ~R_TIMING
;
1716 if (++rep
->r_rexmit
> NFS_MAXREXMIT
)
1717 rep
->r_rexmit
= NFS_MAXREXMIT
;
1719 if (nmp
->nm_cwnd
< NFS_CWNDSCALE
)
1720 nmp
->nm_cwnd
= NFS_CWNDSCALE
;
1721 nfsstats
.rpcretries
++;
1723 rep
->r_flags
|= R_SENT
;
1724 nmp
->nm_sent
+= NFS_CWNDSCALE
;
1729 /* sounlock(so); XXX PR 40491 */
1731 mutex_exit(softnet_lock
); /* XXX PR 40491 */
1733 mutex_enter(&nfs_timer_lock
);
1734 if (nfs_timer_srvvec
!= NULL
) {
1735 more
|= (*nfs_timer_srvvec
)();
1737 mutex_exit(&nfs_timer_lock
);
1740 nfs_timer_schedule();
1742 nfs_timer_stop_ev
.ev_count
++;
1747 * Test for a termination condition pending on the process.
1748 * This is used for NFSMNT_INT mounts.
/*
 * nfs_sigintr(nmp, rep, l): returns nonzero (EINTR — return lines are
 * elided in this extraction) when the request should be aborted:
 * either the request was already soft-terminated (R_SOFTTERM), or the
 * mount is interruptible (NFSMNT_INT) and one of SIGINT/SIGTERM/
 * SIGKILL/SIGHUP/SIGQUIT is pending on lwp 'l' and not ignored.
 * Non-interruptible mounts always report "no interrupt".
 */
1751 nfs_sigintr(struct nfsmount
*nmp
, struct nfsreq
*rep
, struct lwp
*l
)
/* rep may be NULL; R_SOFTTERM short-circuits before any signal check. */
1755 if (rep
&& (rep
->r_flags
& R_SOFTTERM
))
1757 if (!(nmp
->nm_flag
& NFSMNT_INT
))
/* Collect pending signals, then drop the ones the process ignores. */
1760 sigpending1(l
, &ss
);
1762 sigminusset(&l
->l_proc
->p_sigctx
.ps_sigignore
, &ss
);
1764 if (sigismember(&ss
, SIGINT
) || sigismember(&ss
, SIGTERM
) ||
1765 sigismember(&ss
, SIGKILL
) || sigismember(&ss
, SIGHUP
) ||
1766 sigismember(&ss
, SIGQUIT
))
1774 * Lock a socket against others.
1775 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1776 * and also to avoid race conditions between the processes with nfs requests
1777 * in progress when a reconnect is necessary.
/*
 * nfs_sndlock: acquire the per-mount send lock (NFSMNT_SNDLOCK in
 * nm_iflag) under nm_lock, sleeping on nm_sndcv while another holder
 * has it.  Interruptible mounts (NFSMNT_INT) use the signal-aware
 * cv_timedwait_sig and re-check nfs_sigintr() each wakeup; otherwise a
 * plain cv_timedwait is used.
 * NOTE(review): the lines initializing 'l' and 'timeo', and the error
 * returns inside the loop, are elided in this extraction.
 */
1780 nfs_sndlock(struct nfsmount
*nmp
, struct nfsreq
*rep
)
1789 if (rep
->r_nmp
->nm_flag
& NFSMNT_INT
)
1793 mutex_enter(&nmp
->nm_lock
);
1794 while ((nmp
->nm_iflag
& NFSMNT_SNDLOCK
) != 0) {
/* Bail out of the wait loop if the caller's request was interrupted. */
1795 if (rep
&& nfs_sigintr(rep
->r_nmp
, rep
, l
)) {
1800 cv_timedwait_sig(&nmp
->nm_sndcv
, &nmp
->nm_lock
, timeo
);
1802 cv_timedwait(&nmp
->nm_sndcv
, &nmp
->nm_lock
, timeo
);
/* Lock is free: claim it while still holding nm_lock. */
1809 nmp
->nm_iflag
|= NFSMNT_SNDLOCK
;
1811 mutex_exit(&nmp
->nm_lock
);
1816 * Unlock the stream socket for others.
/*
 * nfs_sndunlock: release the per-mount send lock taken by
 * nfs_sndlock() and wake one waiter.  Panics if the lock is not held —
 * a release-without-acquire is a programming error.
 */
1819 nfs_sndunlock(struct nfsmount
*nmp
)
1822 mutex_enter(&nmp
->nm_lock
);
1823 if ((nmp
->nm_iflag
& NFSMNT_SNDLOCK
) == 0)
1824 panic("nfs sndunlock");
1825 nmp
->nm_iflag
&= ~NFSMNT_SNDLOCK
;
/* cv_signal (not broadcast): only one waiter can take the lock anyway. */
1826 cv_signal(&nmp
->nm_sndcv
);
1827 mutex_exit(&nmp
->nm_lock
);
/*
 * nfs_rcvlock: acquire the per-mount receive lock (NFSMNT_RCVLOCK) for
 * request 'rep'.  Loops under nm_lock until one of:
 *  - the mount is being dismounted (NFSMNT_DISMNT): wake the
 *    disconnecting thread via nm_disconcv and give up;
 *  - our reply already arrived (r_mrep != NULL): return without the
 *    lock so a single iod cannot "capture" the receive side;
 *  - the request was interrupted (nfs_sigintr);
 *  - the lock is free: set NFSMNT_RCVLOCK and return.
 * Interruptible mounts wait with cv_timedwait_sig, others with
 * cv_timedwait.  NOTE(review): return-value lines and the wait-timeout
 * argument lines are elided in this extraction.
 */
1832 nfs_rcvlock(struct nfsmount
*nmp
, struct nfsreq
*rep
)
1834 int *flagp
= &nmp
->nm_iflag
;
1839 KASSERT(nmp
== rep
->r_nmp
);
1841 catch = (nmp
->nm_flag
& NFSMNT_INT
) != 0;
1842 mutex_enter(&nmp
->nm_lock
);
1843 while (/* CONSTCOND */ true) {
1844 if (*flagp
& NFSMNT_DISMNT
) {
1845 cv_signal(&nmp
->nm_disconcv
);
1849 /* If our reply was received while we were sleeping,
1850 * then just return without taking the lock to avoid a
1851 * situation where a single iod could 'capture' the
1854 if (rep
->r_mrep
!= NULL
) {
1858 if (nfs_sigintr(rep
->r_nmp
, rep
, rep
->r_lwp
)) {
1862 if ((*flagp
& NFSMNT_RCVLOCK
) == 0) {
1863 *flagp
|= NFSMNT_RCVLOCK
;
1867 cv_timedwait_sig(&nmp
->nm_rcvcv
, &nmp
->nm_lock
,
1870 cv_timedwait(&nmp
->nm_rcvcv
, &nmp
->nm_lock
,
1878 mutex_exit(&nmp
->nm_lock
);
1883 * Unlock the stream socket for others.
/*
 * nfs_rcvunlock: release the per-mount receive lock taken by
 * nfs_rcvlock() and wake all waiters.  Panics if the lock is not held.
 * Uses cv_broadcast (unlike nfs_sndunlock's cv_signal) because several
 * sleepers may need to re-check their own exit conditions (reply
 * arrived, dismount, interrupt) rather than take the lock.
 */
1886 nfs_rcvunlock(struct nfsmount
*nmp
)
1889 mutex_enter(&nmp
->nm_lock
);
1890 if ((nmp
->nm_iflag
& NFSMNT_RCVLOCK
) == 0)
1891 panic("nfs rcvunlock");
1892 nmp
->nm_iflag
&= ~NFSMNT_RCVLOCK
;
1893 cv_broadcast(&nmp
->nm_rcvcv
);
1894 mutex_exit(&nmp
->nm_lock
);
1898 * Parse an RPC request
1900 * - allocate and fill in the cred.
1903 nfs_getreq(struct nfsrv_descript
*nd
, struct nfsd
*nfsd
, int has_header
)
1910 char *dpos
, *cp2
, *cp
;
1911 u_int32_t nfsvers
, auth_type
;
1913 int error
= 0, ticklen
;
1914 struct mbuf
*mrep
, *md
;
1915 struct nfsuid
*nuidp
;
1916 struct timeval tvin
, tvout
;
1918 memset(&tvout
, 0, sizeof tvout
); /* XXX gcc */
1920 KASSERT(nd
->nd_cr
== NULL
);
1925 nfsm_dissect(tl
, u_int32_t
*, 10 * NFSX_UNSIGNED
);
1926 nd
->nd_retxid
= fxdr_unsigned(u_int32_t
, *tl
++);
1927 if (*tl
++ != rpc_call
) {
1932 nfsm_dissect(tl
, u_int32_t
*, 8 * NFSX_UNSIGNED
);
1935 if (*tl
++ != rpc_vers
) {
1936 nd
->nd_repstat
= ERPCMISMATCH
;
1937 nd
->nd_procnum
= NFSPROC_NOOP
;
1940 if (*tl
!= nfs_prog
) {
1941 nd
->nd_repstat
= EPROGUNAVAIL
;
1942 nd
->nd_procnum
= NFSPROC_NOOP
;
1946 nfsvers
= fxdr_unsigned(u_int32_t
, *tl
++);
1947 if (nfsvers
< NFS_VER2
|| nfsvers
> NFS_VER3
) {
1948 nd
->nd_repstat
= EPROGMISMATCH
;
1949 nd
->nd_procnum
= NFSPROC_NOOP
;
1952 if (nfsvers
== NFS_VER3
)
1953 nd
->nd_flag
= ND_NFSV3
;
1954 nd
->nd_procnum
= fxdr_unsigned(u_int32_t
, *tl
++);
1955 if (nd
->nd_procnum
== NFSPROC_NULL
)
1957 if (nd
->nd_procnum
> NFSPROC_COMMIT
||
1958 (!nd
->nd_flag
&& nd
->nd_procnum
> NFSV2PROC_STATFS
)) {
1959 nd
->nd_repstat
= EPROCUNAVAIL
;
1960 nd
->nd_procnum
= NFSPROC_NOOP
;
1963 if ((nd
->nd_flag
& ND_NFSV3
) == 0)
1964 nd
->nd_procnum
= nfsv3_procid
[nd
->nd_procnum
];
1966 len
= fxdr_unsigned(int, *tl
++);
1967 if (len
< 0 || len
> RPCAUTH_MAXSIZ
) {
1972 nd
->nd_flag
&= ~ND_KERBAUTH
;
1974 * Handle auth_unix or auth_kerb.
1976 if (auth_type
== rpc_auth_unix
) {
1980 nd
->nd_cr
= kauth_cred_alloc();
1981 len
= fxdr_unsigned(int, *++tl
);
1982 if (len
< 0 || len
> NFS_MAXNAMLEN
) {
1987 nfsm_adv(nfsm_rndup(len
));
1988 nfsm_dissect(tl
, u_int32_t
*, 3 * NFSX_UNSIGNED
);
1990 uid
= fxdr_unsigned(uid_t
, *tl
++);
1991 gid
= fxdr_unsigned(gid_t
, *tl
++);
1992 kauth_cred_setuid(nd
->nd_cr
, uid
);
1993 kauth_cred_seteuid(nd
->nd_cr
, uid
);
1994 kauth_cred_setsvuid(nd
->nd_cr
, uid
);
1995 kauth_cred_setgid(nd
->nd_cr
, gid
);
1996 kauth_cred_setegid(nd
->nd_cr
, gid
);
1997 kauth_cred_setsvgid(nd
->nd_cr
, gid
);
1999 len
= fxdr_unsigned(int, *tl
);
2000 if (len
< 0 || len
> RPCAUTH_UNIXGIDS
) {
2005 nfsm_dissect(tl
, u_int32_t
*, (len
+ 2) * NFSX_UNSIGNED
);
2008 size_t grbuf_size
= min(len
, NGROUPS
) * sizeof(gid_t
);
2009 gid_t
*grbuf
= kmem_alloc(grbuf_size
, KM_SLEEP
);
2011 for (i
= 0; i
< len
; i
++) {
2012 if (i
< NGROUPS
) /* XXX elad */
2013 grbuf
[i
] = fxdr_unsigned(gid_t
, *tl
++);
2017 kauth_cred_setgroups(nd
->nd_cr
, grbuf
,
2018 min(len
, NGROUPS
), -1, UIO_SYSSPACE
);
2019 kmem_free(grbuf
, grbuf_size
);
2022 len
= fxdr_unsigned(int, *++tl
);
2023 if (len
< 0 || len
> RPCAUTH_MAXSIZ
) {
2029 nfsm_adv(nfsm_rndup(len
));
2030 } else if (auth_type
== rpc_auth_kerb
) {
2031 switch (fxdr_unsigned(int, *tl
++)) {
2032 case RPCAKN_FULLNAME
:
2033 ticklen
= fxdr_unsigned(int, *tl
);
2034 *((u_int32_t
*)nfsd
->nfsd_authstr
) = *tl
;
2035 uio
.uio_resid
= nfsm_rndup(ticklen
) + NFSX_UNSIGNED
;
2036 nfsd
->nfsd_authlen
= uio
.uio_resid
+ NFSX_UNSIGNED
;
2037 if (uio
.uio_resid
> (len
- 2 * NFSX_UNSIGNED
)) {
2045 UIO_SETUP_SYSSPACE(&uio
);
2046 iov
.iov_base
= (void *)&nfsd
->nfsd_authstr
[4];
2047 iov
.iov_len
= RPCAUTH_MAXSIZ
- 4;
2048 nfsm_mtouio(&uio
, uio
.uio_resid
);
2049 nfsm_dissect(tl
, u_int32_t
*, 2 * NFSX_UNSIGNED
);
2050 if (*tl
++ != rpc_auth_kerb
||
2051 fxdr_unsigned(int, *tl
) != 4 * NFSX_UNSIGNED
) {
2052 printf("Bad kerb verifier\n");
2053 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2054 nd
->nd_procnum
= NFSPROC_NOOP
;
2057 nfsm_dissect(cp
, void *, 4 * NFSX_UNSIGNED
);
2058 tl
= (u_int32_t
*)cp
;
2059 if (fxdr_unsigned(int, *tl
) != RPCAKN_FULLNAME
) {
2060 printf("Not fullname kerb verifier\n");
2061 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2062 nd
->nd_procnum
= NFSPROC_NOOP
;
2065 cp
+= NFSX_UNSIGNED
;
2066 memcpy(nfsd
->nfsd_verfstr
, cp
, 3 * NFSX_UNSIGNED
);
2067 nfsd
->nfsd_verflen
= 3 * NFSX_UNSIGNED
;
2068 nd
->nd_flag
|= ND_KERBFULL
;
2069 nfsd
->nfsd_flag
|= NFSD_NEEDAUTH
;
2071 case RPCAKN_NICKNAME
:
2072 if (len
!= 2 * NFSX_UNSIGNED
) {
2073 printf("Kerb nickname short\n");
2074 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADCRED
);
2075 nd
->nd_procnum
= NFSPROC_NOOP
;
2078 nickuid
= fxdr_unsigned(uid_t
, *tl
);
2079 nfsm_dissect(tl
, u_int32_t
*, 2 * NFSX_UNSIGNED
);
2080 if (*tl
++ != rpc_auth_kerb
||
2081 fxdr_unsigned(int, *tl
) != 3 * NFSX_UNSIGNED
) {
2082 printf("Kerb nick verifier bad\n");
2083 nd
->nd_repstat
= (NFSERR_AUTHERR
|AUTH_BADVERF
);
2084 nd
->nd_procnum
= NFSPROC_NOOP
;
2087 nfsm_dissect(tl
, u_int32_t
*, 3 * NFSX_UNSIGNED
);
2088 tvin
.tv_sec
= *tl
++;
2091 LIST_FOREACH(nuidp
, NUIDHASH(nfsd
->nfsd_slp
, nickuid
),
2093 if (kauth_cred_geteuid(nuidp
->nu_cr
) == nickuid
&&
2095 netaddr_match(NU_NETFAM(nuidp
),
2096 &nuidp
->nu_haddr
, nd
->nd_nam2
)))
2101 (NFSERR_AUTHERR
|AUTH_REJECTCRED
);
2102 nd
->nd_procnum
= NFSPROC_NOOP
;
2107 * Now, decrypt the timestamp using the session key
2114 tvout
.tv_sec
= fxdr_unsigned(long, tvout
.tv_sec
);
2115 tvout
.tv_usec
= fxdr_unsigned(long, tvout
.tv_usec
);
2116 if (nuidp
->nu_expire
< time_second
||
2117 nuidp
->nu_timestamp
.tv_sec
> tvout
.tv_sec
||
2118 (nuidp
->nu_timestamp
.tv_sec
== tvout
.tv_sec
&&
2119 nuidp
->nu_timestamp
.tv_usec
> tvout
.tv_usec
)) {
2120 nuidp
->nu_expire
= 0;
2122 (NFSERR_AUTHERR
|AUTH_REJECTVERF
);
2123 nd
->nd_procnum
= NFSPROC_NOOP
;
2126 kauth_cred_hold(nuidp
->nu_cr
);
2127 nd
->nd_cr
= nuidp
->nu_cr
;
2128 nd
->nd_flag
|= ND_KERBNICK
;
2131 nd
->nd_repstat
= (NFSERR_AUTHERR
| AUTH_REJECTCRED
);
2132 nd
->nd_procnum
= NFSPROC_NOOP
;
2138 KASSERT((nd
->nd_cr
== NULL
&& (nfsd
->nfsd_flag
& NFSD_NEEDAUTH
) != 0)
2139 || (nd
->nd_cr
!= NULL
&& (nfsd
->nfsd_flag
& NFSD_NEEDAUTH
) == 0));
2143 KASSERT(error
!= 0);
2144 if (nd
->nd_cr
!= NULL
) {
2145 kauth_cred_free(nd
->nd_cr
);
/*
 * nfs_msg: report a per-server condition ("nfs server %s: %s") to the
 * controlling terminal of lwp 'l''s process via tprintf.
 * NOTE(review): the matching tprintf_close() call (original lines
 * 2161-2162) is elided from this extraction; confirm the handle is
 * closed in the original source.
 */
2152 nfs_msg(struct lwp
*l
, const char *server
, const char *msg
)
2157 tpr
= tprintf_open(l
->l_proc
);
2160 tprintf(tpr
, "nfs server %s: %s\n", server
, msg
);
/*
 * Pool backing server-side RPC descriptors (struct nfsrv_descript),
 * allocated/freed by the nfsdreq_* functions below.
 */
2165 static struct pool nfs_srvdesc_pool
;
/* NOTE(review): the enclosing function signatures (nfsdreq_init /
 * nfsdreq_fini) are elided from this extraction; these two fragments
 * are the pool's setup ("nfsrvdescpl", nointr allocator, IPL_NONE)
 * and teardown. */
2171 pool_init(&nfs_srvdesc_pool
, sizeof(struct nfsrv_descript
),
2172 0, 0, 0, "nfsrvdescpl", &pool_allocator_nointr
, IPL_NONE
);
2179 pool_destroy(&nfs_srvdesc_pool
);
2182 struct nfsrv_descript
*
2185 struct nfsrv_descript
*nd
;
2187 nd
= pool_get(&nfs_srvdesc_pool
, PR_WAITOK
);
2193 nfsdreq_free(struct nfsrv_descript
*nd
)
2199 kauth_cred_free(cr
);
2201 pool_put(&nfs_srvdesc_pool
, nd
);