/*
 * Copyright (c) 2014 Christoph Hellwig.
 */
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/sched.h>
#include <linux/sunrpc/addr.h>

#include "pnfs.h"
#include "netns.h"
#include "trace.h"

#define NFSDDBG_FACILITY		NFSDDBG_PNFS
struct nfs4_layout {
	struct list_head		lo_perstate;
	struct nfs4_layout_stateid	*lo_state;
	struct nfsd4_layout_seg		lo_seg;
};
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;

static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
};
/* pNFS device ID to export fsid mapping */
#define DEVID_HASH_BITS	8
#define DEVID_HASH_SIZE	(1 << DEVID_HASH_BITS)
#define DEVID_HASH_MASK	(DEVID_HASH_SIZE - 1)
static u64 nfsd_devid_seq = 1;
static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
static DEFINE_SPINLOCK(nfsd_devid_lock);
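/*
 * Each export's file system ID is assigned a small sequence number
 * (nfsd_devid_seq) the first time a layout is handed out for it; the
 * NFSv4.1 device ID presented to clients is built from that index plus
 * the generation passed in by the caller (see nfsd4_set_deviceid()
 * below).
 */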
static inline u32 devid_hashfn(u64 idx)
{
	return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
}
static void
nfsd4_alloc_devid_map(const struct svc_fh *fhp)
{
	const struct knfsd_fh *fh = &fhp->fh_handle;
	size_t fsid_len = key_len(fh->fh_fsid_type);
	struct nfsd4_deviceid_map *map, *old;
	int i;

	map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
	if (!map)
		return;

	map->fsid_type = fh->fh_fsid_type;
	memcpy(&map->fsid, fh->fh_fsid, fsid_len);

	spin_lock(&nfsd_devid_lock);
	if (fhp->fh_export->ex_devid_map)
		goto out_unlock;

	for (i = 0; i < DEVID_HASH_SIZE; i++) {
		list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
			if (old->fsid_type != fh->fh_fsid_type)
				continue;
			if (memcmp(old->fsid, fh->fh_fsid,
					key_len(old->fsid_type)))
				continue;

			fhp->fh_export->ex_devid_map = old;
			goto out_unlock;
		}
	}

	map->idx = nfsd_devid_seq++;
	list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
	fhp->fh_export->ex_devid_map = map;
	map = NULL;
out_unlock:
	spin_unlock(&nfsd_devid_lock);
	kfree(map);
}
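/*
 * Note the allocate-then-lock pattern above: the new map is allocated
 * with GFP_KERNEL before nfsd_devid_lock is taken, and "map" is set to
 * NULL only once it is actually installed, so the trailing kfree()
 * releases the preallocation whenever another thread raced us to it or
 * an existing entry with the same fsid was found.
 */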
struct nfsd4_deviceid_map *
nfsd4_find_devid_map(int idx)
{
	struct nfsd4_deviceid_map *map, *ret = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
		if (map->idx == idx)
			ret = map;
	rcu_read_unlock();

	return ret;
}
int
nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
		u32 device_generation)
{
	if (!fhp->fh_export->ex_devid_map) {
		nfsd4_alloc_devid_map(fhp);
		if (!fhp->fh_export->ex_devid_map)
			return -ENOMEM;
	}

	id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
	id->generation = device_generation;
	id->pad = 0;
	return 0;
}
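/*
 * The device ID handed out here round-trips through the client: a
 * later GETDEVICEINFO carries id->fsid_idx back to the server, and
 * nfsd4_find_devid_map() resolves it to the export's fsid again.
 */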
void nfsd4_setup_layout_type(struct svc_export *exp)
{
	struct super_block *sb = exp->ex_path.mnt->mnt_sb;

	if (exp->ex_flags & NFSEXP_NOPNFS)
		return;

	if (sb->s_export_op->get_uuid &&
	    sb->s_export_op->map_blocks &&
	    sb->s_export_op->commit_blocks)
		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
}
static void
nfsd4_free_layout_stateid(struct nfs4_stid *stid)
{
	struct nfs4_layout_stateid *ls = layoutstateid(stid);
	struct nfs4_client *clp = ls->ls_stid.sc_client;
	struct nfs4_file *fp = ls->ls_stid.sc_file;

	trace_layoutstate_free(&ls->ls_stid.sc_stateid);

	spin_lock(&clp->cl_lock);
	list_del_init(&ls->ls_perclnt);
	spin_unlock(&clp->cl_lock);

	spin_lock(&fp->fi_lock);
	list_del_init(&ls->ls_perfile);
	spin_unlock(&fp->fi_lock);

	vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
	fput(ls->ls_file);

	if (ls->ls_recalled)
		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);

	kmem_cache_free(nfs4_layout_stateid_cache, ls);
}
static int
nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
{
	struct file_lock *fl;
	int status;

	fl = locks_alloc_lock();
	if (!fl)
		return -ENOMEM;
	fl->fl_lmops = &nfsd4_layouts_lm_ops;
	fl->fl_flags = FL_LAYOUT;
	fl->fl_type = F_RDLCK;
	fl->fl_end = OFFSET_MAX;
	fl->fl_owner = ls;
	fl->fl_pid = current->tgid;
	fl->fl_file = ls->ls_file;

	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
	if (status) {
		locks_free_lock(fl);
		return status;
	}
	return 0;
}
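/*
 * The layout is pinned by an FL_LAYOUT read lease on the backing file.
 * Any local operation that conflicts with the lease (e.g. a conflicting
 * open or truncate) triggers nfsd4_layout_lm_break() below, which
 * starts a layout recall instead of letting the lease time out.
 */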
static struct nfs4_layout_stateid *
nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
		struct nfs4_stid *parent, u32 layout_type)
{
	struct nfs4_client *clp = cstate->clp;
	struct nfs4_file *fp = parent->sc_file;
	struct nfs4_layout_stateid *ls;
	struct nfs4_stid *stp;

	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
	if (!stp)
		return NULL;
	stp->sc_free = nfsd4_free_layout_stateid;
	get_nfs4_file(fp);
	stp->sc_file = fp;

	ls = layoutstateid(stp);
	INIT_LIST_HEAD(&ls->ls_perclnt);
	INIT_LIST_HEAD(&ls->ls_perfile);
	spin_lock_init(&ls->ls_lock);
	INIT_LIST_HEAD(&ls->ls_layouts);
	ls->ls_layout_type = layout_type;
	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
			NFSPROC4_CLNT_CB_LAYOUT);

	if (parent->sc_type == NFS4_DELEG_STID)
		ls->ls_file = get_file(fp->fi_deleg_file);
	else
		ls->ls_file = find_any_file(fp);
	BUG_ON(!ls->ls_file);

	if (nfsd4_layout_setlease(ls)) {
		put_nfs4_file(fp);
		kmem_cache_free(nfs4_layout_stateid_cache, ls);
		return NULL;
	}

	spin_lock(&clp->cl_lock);
	stp->sc_type = NFS4_LAYOUT_STID;
	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
	spin_unlock(&clp->cl_lock);

	spin_lock(&fp->fi_lock);
	list_add(&ls->ls_perfile, &fp->fi_lo_states);
	spin_unlock(&fp->fi_lock);

	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
	return ls;
}
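/*
 * The stateid holds its own file reference for the lease: the
 * delegation file when the layout was created from a delegation,
 * otherwise any file the client has open.  The reference is dropped
 * again by the fput() in nfsd4_free_layout_stateid().
 */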
__be32
nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
		struct nfsd4_compound_state *cstate, stateid_t *stateid,
		bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
{
	struct nfs4_layout_stateid *ls;
	struct nfs4_stid *stid;
	unsigned char typemask = NFS4_LAYOUT_STID;
	__be32 status;

	if (create)
		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);

	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
			net_generic(SVC_NET(rqstp), nfsd_net_id));
	if (status)
		goto out;

	if (!fh_match(&cstate->current_fh.fh_handle,
		      &stid->sc_file->fi_fhandle)) {
		status = nfserr_bad_stateid;
		goto out_put_stid;
	}

	if (stid->sc_type != NFS4_LAYOUT_STID) {
		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
		nfs4_put_stid(stid);

		status = nfserr_jukebox;
		if (!ls)
			goto out;
	} else {
		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);

		status = nfserr_bad_stateid;
		if (stateid->si_generation > stid->sc_stateid.si_generation)
			goto out_put_stid;
		if (layout_type != ls->ls_layout_type)
			goto out_put_stid;
	}

	*lsp = ls;
	return 0;

out_put_stid:
	nfs4_put_stid(stid);
out:
	return status;
}
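/*
 * Note that an open, lock or delegation stateid presented by the client
 * is promoted here: a fresh layout stateid is allocated on top of it
 * and returned in *lsp, while plain lookups (create == false) only
 * accept an existing layout stateid.
 */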
static void
nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
{
	spin_lock(&ls->ls_lock);
	if (ls->ls_recalled)
		goto out_unlock;

	ls->ls_recalled = true;
	atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
	if (list_empty(&ls->ls_layouts))
		goto out_unlock;

	trace_layout_recall(&ls->ls_stid.sc_stateid);

	atomic_inc(&ls->ls_stid.sc_count);
	update_stateid(&ls->ls_stid.sc_stateid);
	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
	nfsd4_run_cb(&ls->ls_recall);

out_unlock:
	spin_unlock(&ls->ls_lock);
}
static u64
layout_end(struct nfsd4_layout_seg *seg)
{
	u64 end = seg->offset + seg->length;
	return end >= seg->offset ? end : NFS4_MAX_UINT64;
}
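/*
 * Example: a segment with length NFS4_MAX_UINT64 ("to EOF") makes
 * offset + length wrap around, so the comparison above detects the u64
 * overflow and clamps the end to NFS4_MAX_UINT64.
 */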
static void
layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
{
	if (end == NFS4_MAX_UINT64)
		lo->length = NFS4_MAX_UINT64;
	else
		lo->length = end - lo->offset;
}
static bool
layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
{
	if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
		return false;
	if (layout_end(&lo->lo_seg) <= s->offset)
		return false;
	if (layout_end(s) <= lo->lo_seg.offset)
		return false;
	return true;
}
static bool
layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
{
	if (lo->iomode != new->iomode)
		return false;
	if (layout_end(new) < lo->offset)
		return false;
	if (layout_end(lo) < new->offset)
		return false;

	lo->offset = min(lo->offset, new->offset);
	layout_update_len(lo, max(layout_end(lo), layout_end(new)));
	return true;
}
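/*
 * Worked example: merging lo = {offset 0, length 100} with
 * new = {offset 100, length 50} passes the checks above (touching
 * segments count as mergeable) and yields lo = {offset 0, length 150}:
 * min(0, 100) = 0 and max(layout_end(lo), layout_end(new)) = 150.
 */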
static __be32
nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
{
	struct nfs4_file *fp = ls->ls_stid.sc_file;
	struct nfs4_layout_stateid *l, *n;
	__be32 nfserr = nfs_ok;

	assert_spin_locked(&fp->fi_lock);

	list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
		if (l != ls) {
			nfsd4_recall_file_layout(l);
			nfserr = nfserr_recallconflict;
		}
	}

	return nfserr;
}
__be32
nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
{
	struct nfsd4_layout_seg *seg = &lgp->lg_seg;
	struct nfs4_file *fp = ls->ls_stid.sc_file;
	struct nfs4_layout *lp, *new = NULL;
	__be32 nfserr;

	spin_lock(&fp->fi_lock);
	nfserr = nfsd4_recall_conflict(ls);
	if (nfserr)
		goto out;
	spin_lock(&ls->ls_lock);
	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
		if (layouts_try_merge(&lp->lo_seg, seg))
			goto done;
	}
	spin_unlock(&ls->ls_lock);
	spin_unlock(&fp->fi_lock);

	new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
	if (!new)
		return nfserr_jukebox;
	memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
	new->lo_state = ls;

	spin_lock(&fp->fi_lock);
	nfserr = nfsd4_recall_conflict(ls);
	if (nfserr)
		goto out;
	spin_lock(&ls->ls_lock);
	list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
		if (layouts_try_merge(&lp->lo_seg, seg))
			goto done;
	}

	atomic_inc(&ls->ls_stid.sc_count);
	list_add_tail(&new->lo_perstate, &ls->ls_layouts);
	new = NULL;
done:
	update_stateid(&ls->ls_stid.sc_stateid);
	memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
	spin_unlock(&ls->ls_lock);
out:
	spin_unlock(&fp->fi_lock);
	if (new)
		kmem_cache_free(nfs4_layout_cache, new);
	return nfserr;
}
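/*
 * The conflict and merge checks run twice because the GFP_KERNEL
 * allocation can sleep and therefore cannot happen under fi_lock or
 * ls_lock: both locks are dropped, the new entry is allocated, and the
 * checks are repeated after the locks are retaken in case a recall or
 * a mergeable segment appeared in the meantime.
 */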
static void
nfsd4_free_layouts(struct list_head *reaplist)
{
	while (!list_empty(reaplist)) {
		struct nfs4_layout *lp = list_first_entry(reaplist,
				struct nfs4_layout, lo_perstate);

		list_del(&lp->lo_perstate);
		nfs4_put_stid(&lp->lo_state->ls_stid);
		kmem_cache_free(nfs4_layout_cache, lp);
	}
}
static void
nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
		struct list_head *reaplist)
{
	struct nfsd4_layout_seg *lo = &lp->lo_seg;
	u64 end = layout_end(lo);

	if (seg->offset <= lo->offset) {
		if (layout_end(seg) >= end) {
			list_move_tail(&lp->lo_perstate, reaplist);
			return;
		}
		lo->offset = layout_end(seg);
	} else {
		/* retain the whole layout segment on a split. */
		if (layout_end(seg) < end) {
			dprintk("%s: split not supported\n", __func__);
			return;
		}

		end = seg->offset;
	}

	layout_update_len(lo, end);
}
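/*
 * In short: a return covering the whole segment moves it to the
 * reaplist, a return overlapping the head advances lo->offset, and a
 * return overlapping the tail pulls the end in.  A return that falls
 * strictly inside the segment would split it in two, which is not
 * supported, so the segment is retained unchanged.
 */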
__be32
nfsd4_return_file_layouts(struct svc_rqst *rqstp,
		struct nfsd4_compound_state *cstate,
		struct nfsd4_layoutreturn *lrp)
{
	struct nfs4_layout_stateid *ls;
	struct nfs4_layout *lp, *n;
	LIST_HEAD(reaplist);
	__be32 nfserr;
	int found = 0;

	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
						false, lrp->lr_layout_type,
						&ls);
	if (nfserr) {
		trace_layout_return_lookup_fail(&lrp->lr_sid);
		return nfserr;
	}

	spin_lock(&ls->ls_lock);
	list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
		if (layouts_overlapping(lp, &lrp->lr_seg)) {
			nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
			found++;
		}
	}
	if (!list_empty(&ls->ls_layouts)) {
		if (found) {
			update_stateid(&ls->ls_stid.sc_stateid);
			memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
				sizeof(stateid_t));
		}
		lrp->lrs_present = 1;
	} else {
		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
		nfs4_unhash_stid(&ls->ls_stid);
		lrp->lrs_present = 0;
	}
	spin_unlock(&ls->ls_lock);

	nfs4_put_stid(&ls->ls_stid);
	nfsd4_free_layouts(&reaplist);
	return nfs_ok;
}
__be32
nfsd4_return_client_layouts(struct svc_rqst *rqstp,
		struct nfsd4_compound_state *cstate,
		struct nfsd4_layoutreturn *lrp)
{
	struct nfs4_layout_stateid *ls, *n;
	struct nfs4_client *clp = cstate->clp;
	struct nfs4_layout *lp, *t;
	LIST_HEAD(reaplist);

	lrp->lrs_present = 0;

	spin_lock(&clp->cl_lock);
	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
		if (lrp->lr_return_type == RETURN_FSID &&
		    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
				   &cstate->current_fh.fh_handle))
			continue;

		spin_lock(&ls->ls_lock);
		list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
			if (lrp->lr_seg.iomode == IOMODE_ANY ||
			    lrp->lr_seg.iomode == lp->lo_seg.iomode)
				list_move_tail(&lp->lo_perstate, &reaplist);
		}
		spin_unlock(&ls->ls_lock);
	}
	spin_unlock(&clp->cl_lock);

	nfsd4_free_layouts(&reaplist);
	return 0;
}
static void
nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
		struct list_head *reaplist)
{
	spin_lock(&ls->ls_lock);
	list_splice_init(&ls->ls_layouts, reaplist);
	spin_unlock(&ls->ls_lock);
}
void
nfsd4_return_all_client_layouts(struct nfs4_client *clp)
{
	struct nfs4_layout_stateid *ls, *n;
	LIST_HEAD(reaplist);

	spin_lock(&clp->cl_lock);
	list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
		nfsd4_return_all_layouts(ls, &reaplist);
	spin_unlock(&clp->cl_lock);

	nfsd4_free_layouts(&reaplist);
}
void
nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
{
	struct nfs4_layout_stateid *ls, *n;
	LIST_HEAD(reaplist);

	spin_lock(&fp->fi_lock);
	list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
		if (ls->ls_stid.sc_client == clp)
			nfsd4_return_all_layouts(ls, &reaplist);
	}
	spin_unlock(&fp->fi_lock);

	nfsd4_free_layouts(&reaplist);
}
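/*
 * The callback below delegates fencing to user space:
 * /sbin/nfsd-recall-failed is invoked with the unresponsive client's
 * address and the backing device's s_id; the actual fencing action
 * (e.g. cutting the client off from shared storage) is site-specific.
 * A missing or failing helper is only logged; nothing else is done.
 */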
static void
nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
{
	struct nfs4_client *clp = ls->ls_stid.sc_client;
	char addr_str[INET6_ADDRSTRLEN];
	static char *envp[] = {
		"HOME=/",
		"TERM=linux",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	char *argv[8];
	int error;

	rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));

	printk(KERN_WARNING
		"nfsd: client %s failed to respond to layout recall. "
		"  Fencing..\n", addr_str);

	argv[0] = "/sbin/nfsd-recall-failed";
	argv[1] = addr_str;
	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
	argv[3] = NULL;

	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	if (error) {
		printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
			addr_str, error);
	}
}
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
	struct nfs4_layout_stateid *ls =
		container_of(cb, struct nfs4_layout_stateid, ls_recall);

	switch (task->tk_status) {
	case 0:
		return 1;
	case -NFS4ERR_NOMATCHING_LAYOUT:
		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
		task->tk_status = 0;
		return 1;
	case -NFS4ERR_DELAY:
		/* Poll the client until it's done with the layout */
		/* FIXME: cap the number of retries.
		 * The pNFS standard states that we need to only expire
		 * the client after at least "lease time" (e.g. lease-time * 2)
		 * when failing to communicate a recall.
		 */
		rpc_delay(task, HZ/100); /* 10 milliseconds */
		return 0;
	default:
		/*
		 * Unknown error or non-responding client, we'll need to fence.
		 */
		nfsd4_cb_layout_fail(ls);
		return -1;
	}
}
static void
nfsd4_cb_layout_release(struct nfsd4_callback *cb)
{
	struct nfs4_layout_stateid *ls =
		container_of(cb, struct nfs4_layout_stateid, ls_recall);
	LIST_HEAD(reaplist);

	trace_layout_recall_release(&ls->ls_stid.sc_stateid);

	nfsd4_return_all_layouts(ls, &reaplist);
	nfsd4_free_layouts(&reaplist);
	nfs4_put_stid(&ls->ls_stid);
}
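/*
 * The release callback runs once the recall RPC is finished with,
 * successfully or not; it reaps whatever layouts are still outstanding
 * and drops the sc_count reference taken in nfsd4_recall_file_layout().
 */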
static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
	.done		= nfsd4_cb_layout_done,
	.release	= nfsd4_cb_layout_release,
};
static bool
nfsd4_layout_lm_break(struct file_lock *fl)
{
	/*
	 * We don't want the locks code to time out the lease for us;
	 * we'll remove it ourselves if a layout isn't returned in time.
	 */
	fl->fl_break_time = 0;
	nfsd4_recall_file_layout(fl->fl_owner);
	return false;
}
static int
nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
		struct list_head *dispose)
{
	BUG_ON(!(arg & F_UNLCK));
	return lease_modify(onlist, arg, dispose);
}
static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
	.lm_break	= nfsd4_layout_lm_break,
	.lm_change	= nfsd4_layout_lm_change,
};
int
nfsd4_init_pnfs(void)
{
	int i;

	for (i = 0; i < DEVID_HASH_SIZE; i++)
		INIT_LIST_HEAD(&nfsd_devid_hash[i]);

	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
			sizeof(struct nfs4_layout), 0, 0, NULL);
	if (!nfs4_layout_cache)
		return -ENOMEM;

	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
	if (!nfs4_layout_stateid_cache) {
		kmem_cache_destroy(nfs4_layout_cache);
		return -ENOMEM;
	}
	return 0;
}
void
nfsd4_exit_pnfs(void)
{
	int i;

	kmem_cache_destroy(nfs4_layout_cache);
	kmem_cache_destroy(nfs4_layout_stateid_cache);

	for (i = 0; i < DEVID_HASH_SIZE; i++) {
		struct nfsd4_deviceid_map *map, *n;

		list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
			kfree(map);
	}
}