/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/systm.h>
#include <sys/sdt.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/auth_unix.h>
#include <rpc/auth_des.h>
#include <rpc/svc.h>
#include <rpc/xdr.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_dispatch.h>
#include <nfs/nfs4_drc.h>
#define	NFS4_MAX_MINOR_VERSION	0
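/*
 * Only minor version 0 (NFSv4.0) is supported here; COMPOUNDs that
 * request a higher minor version are bounced by
 * rfs4_minorvers_mismatch() below.
 */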
/*
 * This is the duplicate request cache for NFSv4
 */
rfs4_drc_t *nfs4_drc = NULL;
/*
 * The default size of the duplicate request cache
 */
uint32_t nfs4_drc_max = 8 * 1024;
/*
 * The number of buckets we'd like to hash the
 * replies into... do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;
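/*
 * (541 is prime; presumably chosen so that XIDs spread evenly across
 * the buckets instead of clustering when clients use patterned XIDs.)
 */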
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
/*
 * Initialize a duplicate request cache.
 */
rfs4_drc_t *
rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
{
	rfs4_drc_t *drc;
	uint32_t bki;

	ASSERT(drc_size);
	ASSERT(drc_hash_size);

	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);

	drc->max_size = drc_size;
	drc->in_use = 0;

	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);

	drc->dr_hash = drc_hash_size;

	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);

	for (bki = 0; bki < drc_hash_size; bki++) {
		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
		    offsetof(rfs4_dupreq_t, dr_bkt_next));
	}

	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
	    offsetof(rfs4_dupreq_t, dr_next));

	return (drc);
}
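/*
 * A minimal usage sketch (the call site shown is an assumption, not
 * part of this file): at server startup the globals above would be
 * wired together as
 *
 *	nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash);
 */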
/*
 * Destroy a duplicate request cache.
 */
void
rfs4_fini_drc(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp, *drp_next;

	ASSERT(drc);

	/* iterate over the dr_cache and free the entries */
	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {

		if (drp->dr_state == NFS4_DUP_REPLAY)
			rfs4_compound_free(&(drp->dr_res));

		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);

		drp_next = list_next(&(drc->dr_cache), drp);

		kmem_free(drp, sizeof (rfs4_dupreq_t));
	}

	mutex_destroy(&drc->lock);
	kmem_free(drc->dr_buckets,
	    sizeof (list_t)*drc->dr_hash);
	kmem_free(drc, sizeof (rfs4_drc_t));
}
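/*
 * Note: rfs4_fini_drc() walks and frees the cache without taking
 * drc->lock; the implicit assumption is that it runs only after the
 * server has quiesced and no other thread can reach the DRC.
 */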
/*
 * Change the state of a rfs4_dupreq. If it's not in transition
 * to the FREE state, return. If we are moving to the FREE state
 * then we need to clean up the compound results and move the entry
 * to the end of the list.
 */
void
rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
{
	rfs4_drc_t *drc;

	ASSERT(drp);
	ASSERT(drp->drc);
	ASSERT(MUTEX_HELD(&drp->drc->lock));

	drp->dr_state = new_state;

	if (new_state != NFS4_DUP_FREE)
		return;

	drc = drp->drc;

	/*
	 * Remove entry from the bucket and
	 * dr_cache list, free compound results.
	 */
	list_remove(drp->dr_bkt, drp);
	list_remove(&(drc->dr_cache), drp);
	rfs4_compound_free(&(drp->dr_res));
}
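/*
 * Dupreq state machine, as driven by the code in this file:
 *
 *	NFS4_DUP_NEW	just inserted; the compound is being processed
 *	NFS4_DUP_REPLAY	reply cached; may be replayed or reclaimed
 *	NFS4_DUP_INUSE	a cached reply is currently being resent
 *	NFS4_DUP_FREE	unlinked, results freed, ready for reuse
 *
 * NEW and INUSE entries move (back) to REPLAY once their reply has
 * been sent; REPLAY entries may be reclaimed via the FREE state.
 */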
/*
 * Malloc a new one if we have not reached our maximum cache
 * limit, otherwise pick an entry off the tail -- use it if it
 * is marked as NFS4_DUP_FREE, or is an entry in the
 * NFS4_DUP_REPLAY state.
 */
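/*
 * Returns NULL when the cache is at max_size and no entry is in the
 * FREE or REPLAY state, i.e. everything is still in flight.
 */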
rfs4_dupreq_t *
rfs4_alloc_dr(rfs4_drc_t *drc)
{
	rfs4_dupreq_t *drp_tail, *drp = NULL;

	ASSERT(drc);
	ASSERT(MUTEX_HELD(&drc->lock));

	/*
	 * Have we hit the cache limit yet ?
	 */
	if (drc->in_use < drc->max_size) {
		/*
		 * nope, so let's malloc a new one
		 */
		drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
		drp->drc = drc;
		drc->in_use++;
		DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
		return (drp);
	}

	/*
	 * Cache is all allocated now traverse the list
	 * backwards to find one we can reuse.
	 */
	for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
	    drp_tail = list_prev(&drc->dr_cache, drp_tail)) {

		switch (drp_tail->dr_state) {

		case NFS4_DUP_FREE:
			list_remove(&(drc->dr_cache), drp_tail);
			DTRACE_PROBE1(nfss__i__drc_freeclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */

		case NFS4_DUP_REPLAY:
			/* grab it. */
			rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
			DTRACE_PROBE1(nfss__i__drc_replayclaim,
			    rfs4_dupreq_t *, drp_tail);
			return (drp_tail);
			/* NOTREACHED */
		}
	}
	DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
	return (NULL);
}
/*
 * Search for an entry in the duplicate request cache by
 * calculating the hash index based on the XID, and examining
 * the entries in the hash bucket. If we find a match, return.
 * Once we have searched the bucket we call rfs4_alloc_dr() to
 * allocate a new entry, or reuse one that is available.
 */
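/*
 * Return values (consumed by rfs4_dispatch() below):
 *
 *	NFS4_DUP_NEW	 a fresh entry was inserted; *dup points at it
 *	NFS4_DUP_REPLAY	 a cached reply was found; *dup points at it
 *	NFS4_DUP_PENDING a matching request is still being processed
 *	NFS4_DUP_ERROR	 the cache is full, or allocation failed
 */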
int
rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
{
	uint32_t the_xid;
	list_t *dr_bkt;
	rfs4_dupreq_t *drp;
	int bktdex;

	/*
	 * Get the XID, calculate the bucket and search to
	 * see if we need to replay from the cache.
	 */
	the_xid = req->rq_xprt->xp_xid;
	bktdex = the_xid % drc->dr_hash;

	dr_bkt = (list_t *)
	    &(drc->dr_buckets[(the_xid % drc->dr_hash)]);

	DTRACE_PROBE3(nfss__i__drc_bktdex,
	    int, bktdex,
	    uint32_t, the_xid,
	    list_t *, dr_bkt);

	*dup = NULL;

	mutex_enter(&drc->lock);
	/*
	 * Search the bucket for a matching xid and address.
	 */
	for (drp = list_head(dr_bkt); drp != NULL;
	    drp = list_next(dr_bkt, drp)) {

		if (drp->dr_xid == the_xid &&
		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
		    bcmp((caddr_t)drp->dr_addr.buf,
		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
		    drp->dr_addr.len) == 0) {

			/*
			 * Found a match so REPLAY the Reply
			 */
			if (drp->dr_state == NFS4_DUP_REPLAY) {
				rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
				mutex_exit(&drc->lock);
				*dup = drp;
				DTRACE_PROBE1(nfss__i__drc_replay,
				    rfs4_dupreq_t *, drp);
				return (NFS4_DUP_REPLAY);
			}

			/*
			 * This entry must be in transition, so return
			 * the 'pending' status.
			 */
			mutex_exit(&drc->lock);
			return (NFS4_DUP_PENDING);
		}
	}

	drp = rfs4_alloc_dr(drc);
	mutex_exit(&drc->lock);

	/*
	 * The DRC is full and all entries are in use. Upper function
	 * should error out this request and force the client to
	 * retransmit -- effectively this is a resource issue. NFSD
	 * threads tied up with native File System, or the cache size
	 * is too small for the server load.
	 */
	if (drp == NULL)
		return (NFS4_DUP_ERROR);

	/*
	 * Init the state to NEW.
	 */
	drp->dr_state = NFS4_DUP_NEW;

	/*
	 * If needed, resize the address buffer
	 */
	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
		if (drp->dr_addr.buf != NULL)
			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
		if (drp->dr_addr.buf == NULL) {
			/*
			 * If the malloc fails, mark the entry
			 * as free and put on the tail.
			 */
			drp->dr_addr.maxlen = 0;
			drp->dr_state = NFS4_DUP_FREE;
			mutex_enter(&drc->lock);
			list_insert_tail(&(drc->dr_cache), drp);
			mutex_exit(&drc->lock);
			return (NFS4_DUP_ERROR);
		}
	}

	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;

	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
	    (caddr_t)drp->dr_addr.buf,
	    drp->dr_addr.len);

	drp->dr_xid = the_xid;
	drp->dr_bkt = dr_bkt;

	/*
	 * Insert at the head of the bucket and
	 * the dr_cache list.
	 */
	mutex_enter(&drc->lock);
	list_insert_head(&drc->dr_cache, drp);
	list_insert_head(dr_bkt, drp);
	mutex_exit(&drc->lock);

	*dup = drp;

	return (NFS4_DUP_NEW);
}
/*
 * This function handles the duplicate request cache,
 * NULL_PROC and COMPOUND procedure calls for NFSv4.
 *
 * Passed into this function are:
 *
 *	disp	A pointer to our dispatch table entry
 *	req	The request to process
 *	xprt	The server transport handle
 *	ap	A pointer to the arguments
 *
 * When appropriate this function is responsible for inserting
 * the reply into the duplicate cache or replaying an existing
 * cached reply.
 *
 * dr_stat	reflects the state of the duplicate request that
 *		has been inserted into or retrieved from the cache
 *
 * drp		is the duplicate request entry
 */
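/*
 * A note on the T_DONTPEND/T_WOULDBLOCK handling below (this is a
 * reading of the code, not documented behaviour): T_DONTPEND is set
 * around the call into rfs4_compound(), and if the thread returns with
 * T_WOULDBLOCK set no reply is sent -- for a non-idempotent request the
 * cache entry is recycled as FREE and the client must retransmit.
 */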
int
rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
    SVCXPRT *xprt, char *ap)
{
	COMPOUND4res res_buf;
	COMPOUND4res *rbp;
	COMPOUND4args *cap;
	cred_t *cr = NULL;
	int error = 0;
	int dis_flags = 0;
	int dr_stat = NFS4_NOT_DUP;
	rfs4_dupreq_t *drp = NULL;
	int rv;

	ASSERT(disp);

	/*
	 * Short circuit the RPC_NULL proc.
	 */
	if (disp->dis_proc == rpc_null) {
		DTRACE_NFSV4_1(null__start, struct svc_req *, req);
		if (!svc_sendreply(xprt, xdr_void, NULL)) {
			DTRACE_NFSV4_1(null__done, struct svc_req *, req);
			svcerr_systemerr(xprt);
			return (1);
		}
		DTRACE_NFSV4_1(null__done, struct svc_req *, req);
		return (0);
	}

	/* Only NFSv4 Compounds from this point onward */

	rbp = &res_buf;
	cap = (COMPOUND4args *)ap;

	/*
	 * Figure out the disposition of the whole COMPOUND
	 * and record its IDEMPOTENCY.
	 */
	rfs4_compound_flagproc(cap, &dis_flags);

	/*
	 * If NON-IDEMPOTENT then we need to figure out if this
	 * request can be replied from the duplicate cache.
	 *
	 * If this is a new request then we need to insert the
	 * reply into the duplicate cache.
	 */
	if (!(dis_flags & RPC_IDEMPOTENT)) {
		/* look for a replay from the cache or allocate */
		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);

		switch (dr_stat) {

		case NFS4_DUP_ERROR:
			rfs4_resource_err(req, cap);
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_PENDING:
			/*
			 * reply has previously been inserted into the
			 * duplicate cache, however the reply has
			 * not yet been sent via svc_sendreply()
			 */
			return (1);
			/* NOTREACHED */

		case NFS4_DUP_NEW:
			curthread->t_flag |= T_DONTPEND;
			/* NON-IDEMPOTENT proc call */
			rfs4_compound(cap, rbp, NULL, req, cr, &rv);
			curthread->t_flag &= ~T_DONTPEND;

			if (rv)	/* short ckt sendreply on error */
				return (rv);

			/*
			 * dr_res must be initialized before calling
			 * rfs4_dr_chstate (it frees the reply).
			 */
			drp->dr_res = res_buf;
			if (curthread->t_flag & T_WOULDBLOCK) {
				curthread->t_flag &= ~T_WOULDBLOCK;
				/*
				 * mark this entry as FREE and plop
				 * on the end of the cache list
				 */
				mutex_enter(&drp->drc->lock);
				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
				list_insert_tail(&(drp->drc->dr_cache), drp);
				mutex_exit(&drp->drc->lock);
				return (1);
			}
			break;

		case NFS4_DUP_REPLAY:
			/* replay from the cache */
			rbp = &(drp->dr_res);
			break;
		}
	} else {
		curthread->t_flag |= T_DONTPEND;
		/* IDEMPOTENT proc call */
		rfs4_compound(cap, rbp, NULL, req, cr, &rv);
		curthread->t_flag &= ~T_DONTPEND;

		if (rv)	/* short ckt sendreply on error */
			return (rv);

		if (curthread->t_flag & T_WOULDBLOCK) {
			curthread->t_flag &= ~T_WOULDBLOCK;
			return (1);
		}
	}

	/*
	 * Send out the replayed reply or the 'real' one.
	 */
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
		    struct svc_req *, xprt,
		    char *, rbp);
		svcerr_systemerr(xprt);
		error++;
	}

	/*
	 * If this reply was just inserted into the duplicate cache
	 * or it was replayed from the dup cache; (re)mark it as
	 * available for replay
	 *
	 * At first glance, this 'if' statement seems a little strange;
	 * testing for NFS4_DUP_REPLAY, and then calling...
	 *
	 *	rfs4_dr_chstate(NFS4_DUP_REPLAY)
	 *
	 * ... but notice that we are checking dr_stat, and not the
	 * state of the entry itself; the entry will be NFS4_DUP_INUSE.
	 * We do that so that we know not to prematurely reap it whilst
	 * we resend it to the client.
	 */
	if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
		mutex_enter(&drp->drc->lock);
		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
		mutex_exit(&drp->drc->lock);
	} else if (dr_stat == NFS4_NOT_DUP) {
		rfs4_compound_free(rbp);
	}

	return (error);
}
bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
	COMPOUND4args *argsp;
	COMPOUND4res res_buf, *resp;

	if (req->rq_vers != 4)
		return (FALSE);

	argsp = (COMPOUND4args *)args;

	if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
		return (FALSE);

	resp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	resp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	resp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
	    resp->tag.utf8string_len);
	resp->array_len = 0;
	resp->array = NULL;
	resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
	if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)resp)) {
		DTRACE_PROBE2(nfss__e__minorvers_mismatch,
		    SVCXPRT *, xprt, char *, resp);
		svcerr_systemerr(xprt);
	}
	rfs4_compound_free(resp);
	return (TRUE);
}
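/*
 * Build and send a one-op COMPOUND reply carrying NFS4ERR_RESOURCE (or
 * NFS4ERR_OP_ILLEGAL); used from the NFS4_DUP_ERROR arm of
 * rfs4_dispatch() when the DRC cannot accept a non-idempotent request.
 */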
static void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
	COMPOUND4res res_buf, *rbp;
	nfs_resop4 *resop;
	PUTFH4res *resp;

	rbp = &res_buf;

	/*
	 * Form a reply tag by copying over the request tag.
	 */
	rbp->tag.utf8string_val =
	    kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
	rbp->tag.utf8string_len = argsp->tag.utf8string_len;
	bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
	    rbp->tag.utf8string_len);

	rbp->array_len = 1;
	rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
	    KM_SLEEP);
	resop = &rbp->array[0];
	resop->resop = argsp->array[0].argop;	/* copy first op over */

	/* Any op will do, just need to access status field */
	resp = &resop->nfs_resop4_u.opputfh;

	/*
	 * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
	 * Note that all op numbers in the compound array were already
	 * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
	 */
	resp->status = (resop->resop == OP_ILLEGAL ?
	    NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

	/* compound status is same as first op status */
	rbp->status = resp->status;

	if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
		DTRACE_PROBE2(nfss__rsrc_err__sendfail,
		    struct svc_req *, req->rq_xprt, char *, rbp);
		svcerr_systemerr(req->rq_xprt);
	}

	UTF8STRING_FREE(rbp->tag);
	kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}