/*
 * Copyright (c) 2014 Christoph Hellwig.
 */
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/sched.h>
#include <linux/sunrpc/addr.h>
/* Debug facility for this file; presumably consumed by dprintk(). */
#define NFSDDBG_FACILITY		NFSDDBG_PNFS
17 struct list_head lo_perstate
;
18 struct nfs4_layout_stateid
*lo_state
;
19 struct nfsd4_layout_seg lo_seg
;
22 static struct kmem_cache
*nfs4_layout_cache
;
23 static struct kmem_cache
*nfs4_layout_stateid_cache
;
25 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
;
26 static const struct lock_manager_operations nfsd4_layouts_lm_ops
;
28 const struct nfsd4_layout_ops
*nfsd4_layout_ops
[LAYOUT_TYPE_MAX
] = {
29 [LAYOUT_BLOCK_VOLUME
] = &bl_layout_ops
,
32 /* pNFS device ID to export fsid mapping */
33 #define DEVID_HASH_BITS 8
34 #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35 #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36 static u64 nfsd_devid_seq
= 1;
37 static struct list_head nfsd_devid_hash
[DEVID_HASH_SIZE
];
38 static DEFINE_SPINLOCK(nfsd_devid_lock
);
40 static inline u32
devid_hashfn(u64 idx
)
42 return jhash_2words(idx
, idx
>> 32, 0) & DEVID_HASH_MASK
;
46 nfsd4_alloc_devid_map(const struct svc_fh
*fhp
)
48 const struct knfsd_fh
*fh
= &fhp
->fh_handle
;
49 size_t fsid_len
= key_len(fh
->fh_fsid_type
);
50 struct nfsd4_deviceid_map
*map
, *old
;
53 map
= kzalloc(sizeof(*map
) + fsid_len
, GFP_KERNEL
);
57 map
->fsid_type
= fh
->fh_fsid_type
;
58 memcpy(&map
->fsid
, fh
->fh_fsid
, fsid_len
);
60 spin_lock(&nfsd_devid_lock
);
61 if (fhp
->fh_export
->ex_devid_map
)
64 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
65 list_for_each_entry(old
, &nfsd_devid_hash
[i
], hash
) {
66 if (old
->fsid_type
!= fh
->fh_fsid_type
)
68 if (memcmp(old
->fsid
, fh
->fh_fsid
,
69 key_len(old
->fsid_type
)))
72 fhp
->fh_export
->ex_devid_map
= old
;
77 map
->idx
= nfsd_devid_seq
++;
78 list_add_tail_rcu(&map
->hash
, &nfsd_devid_hash
[devid_hashfn(map
->idx
)]);
79 fhp
->fh_export
->ex_devid_map
= map
;
83 spin_unlock(&nfsd_devid_lock
);
87 struct nfsd4_deviceid_map
*
88 nfsd4_find_devid_map(int idx
)
90 struct nfsd4_deviceid_map
*map
, *ret
= NULL
;
93 list_for_each_entry_rcu(map
, &nfsd_devid_hash
[devid_hashfn(idx
)], hash
)
102 nfsd4_set_deviceid(struct nfsd4_deviceid
*id
, const struct svc_fh
*fhp
,
103 u32 device_generation
)
105 if (!fhp
->fh_export
->ex_devid_map
) {
106 nfsd4_alloc_devid_map(fhp
);
107 if (!fhp
->fh_export
->ex_devid_map
)
111 id
->fsid_idx
= fhp
->fh_export
->ex_devid_map
->idx
;
112 id
->generation
= device_generation
;
117 void nfsd4_setup_layout_type(struct svc_export
*exp
)
119 struct super_block
*sb
= exp
->ex_path
.mnt
->mnt_sb
;
121 if (!(exp
->ex_flags
& NFSEXP_PNFS
))
124 if (sb
->s_export_op
->get_uuid
&&
125 sb
->s_export_op
->map_blocks
&&
126 sb
->s_export_op
->commit_blocks
)
127 exp
->ex_layout_type
= LAYOUT_BLOCK_VOLUME
;
131 nfsd4_free_layout_stateid(struct nfs4_stid
*stid
)
133 struct nfs4_layout_stateid
*ls
= layoutstateid(stid
);
134 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
135 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
137 trace_layoutstate_free(&ls
->ls_stid
.sc_stateid
);
139 spin_lock(&clp
->cl_lock
);
140 list_del_init(&ls
->ls_perclnt
);
141 spin_unlock(&clp
->cl_lock
);
143 spin_lock(&fp
->fi_lock
);
144 list_del_init(&ls
->ls_perfile
);
145 spin_unlock(&fp
->fi_lock
);
147 vfs_setlease(ls
->ls_file
, F_UNLCK
, NULL
, (void **)&ls
);
151 atomic_dec(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
153 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
157 nfsd4_layout_setlease(struct nfs4_layout_stateid
*ls
)
159 struct file_lock
*fl
;
162 fl
= locks_alloc_lock();
166 fl
->fl_lmops
= &nfsd4_layouts_lm_ops
;
167 fl
->fl_flags
= FL_LAYOUT
;
168 fl
->fl_type
= F_RDLCK
;
169 fl
->fl_end
= OFFSET_MAX
;
171 fl
->fl_pid
= current
->tgid
;
172 fl
->fl_file
= ls
->ls_file
;
174 status
= vfs_setlease(fl
->fl_file
, fl
->fl_type
, &fl
, NULL
);
183 static struct nfs4_layout_stateid
*
184 nfsd4_alloc_layout_stateid(struct nfsd4_compound_state
*cstate
,
185 struct nfs4_stid
*parent
, u32 layout_type
)
187 struct nfs4_client
*clp
= cstate
->clp
;
188 struct nfs4_file
*fp
= parent
->sc_file
;
189 struct nfs4_layout_stateid
*ls
;
190 struct nfs4_stid
*stp
;
192 stp
= nfs4_alloc_stid(cstate
->clp
, nfs4_layout_stateid_cache
,
193 nfsd4_free_layout_stateid
);
200 ls
= layoutstateid(stp
);
201 INIT_LIST_HEAD(&ls
->ls_perclnt
);
202 INIT_LIST_HEAD(&ls
->ls_perfile
);
203 spin_lock_init(&ls
->ls_lock
);
204 INIT_LIST_HEAD(&ls
->ls_layouts
);
205 mutex_init(&ls
->ls_mutex
);
206 ls
->ls_layout_type
= layout_type
;
207 nfsd4_init_cb(&ls
->ls_recall
, clp
, &nfsd4_cb_layout_ops
,
208 NFSPROC4_CLNT_CB_LAYOUT
);
210 if (parent
->sc_type
== NFS4_DELEG_STID
)
211 ls
->ls_file
= get_file(fp
->fi_deleg_file
);
213 ls
->ls_file
= find_any_file(fp
);
214 BUG_ON(!ls
->ls_file
);
216 if (nfsd4_layout_setlease(ls
)) {
219 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
223 spin_lock(&clp
->cl_lock
);
224 stp
->sc_type
= NFS4_LAYOUT_STID
;
225 list_add(&ls
->ls_perclnt
, &clp
->cl_lo_states
);
226 spin_unlock(&clp
->cl_lock
);
228 spin_lock(&fp
->fi_lock
);
229 list_add(&ls
->ls_perfile
, &fp
->fi_lo_states
);
230 spin_unlock(&fp
->fi_lock
);
232 trace_layoutstate_alloc(&ls
->ls_stid
.sc_stateid
);
237 nfsd4_preprocess_layout_stateid(struct svc_rqst
*rqstp
,
238 struct nfsd4_compound_state
*cstate
, stateid_t
*stateid
,
239 bool create
, u32 layout_type
, struct nfs4_layout_stateid
**lsp
)
241 struct nfs4_layout_stateid
*ls
;
242 struct nfs4_stid
*stid
;
243 unsigned char typemask
= NFS4_LAYOUT_STID
;
247 typemask
|= (NFS4_OPEN_STID
| NFS4_LOCK_STID
| NFS4_DELEG_STID
);
249 status
= nfsd4_lookup_stateid(cstate
, stateid
, typemask
, &stid
,
250 net_generic(SVC_NET(rqstp
), nfsd_net_id
));
254 if (!fh_match(&cstate
->current_fh
.fh_handle
,
255 &stid
->sc_file
->fi_fhandle
)) {
256 status
= nfserr_bad_stateid
;
260 if (stid
->sc_type
!= NFS4_LAYOUT_STID
) {
261 ls
= nfsd4_alloc_layout_stateid(cstate
, stid
, layout_type
);
264 status
= nfserr_jukebox
;
267 mutex_lock(&ls
->ls_mutex
);
269 ls
= container_of(stid
, struct nfs4_layout_stateid
, ls_stid
);
271 status
= nfserr_bad_stateid
;
272 mutex_lock(&ls
->ls_mutex
);
273 if (stateid
->si_generation
> stid
->sc_stateid
.si_generation
)
274 goto out_unlock_stid
;
275 if (layout_type
!= ls
->ls_layout_type
)
276 goto out_unlock_stid
;
283 mutex_unlock(&ls
->ls_mutex
);
291 nfsd4_recall_file_layout(struct nfs4_layout_stateid
*ls
)
293 spin_lock(&ls
->ls_lock
);
297 ls
->ls_recalled
= true;
298 atomic_inc(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
299 if (list_empty(&ls
->ls_layouts
))
302 trace_layout_recall(&ls
->ls_stid
.sc_stateid
);
304 atomic_inc(&ls
->ls_stid
.sc_count
);
305 nfsd4_run_cb(&ls
->ls_recall
);
308 spin_unlock(&ls
->ls_lock
);
312 layout_end(struct nfsd4_layout_seg
*seg
)
314 u64 end
= seg
->offset
+ seg
->length
;
315 return end
>= seg
->offset
? end
: NFS4_MAX_UINT64
;
319 layout_update_len(struct nfsd4_layout_seg
*lo
, u64 end
)
321 if (end
== NFS4_MAX_UINT64
)
322 lo
->length
= NFS4_MAX_UINT64
;
324 lo
->length
= end
- lo
->offset
;
328 layouts_overlapping(struct nfs4_layout
*lo
, struct nfsd4_layout_seg
*s
)
330 if (s
->iomode
!= IOMODE_ANY
&& s
->iomode
!= lo
->lo_seg
.iomode
)
332 if (layout_end(&lo
->lo_seg
) <= s
->offset
)
334 if (layout_end(s
) <= lo
->lo_seg
.offset
)
340 layouts_try_merge(struct nfsd4_layout_seg
*lo
, struct nfsd4_layout_seg
*new)
342 if (lo
->iomode
!= new->iomode
)
344 if (layout_end(new) < lo
->offset
)
346 if (layout_end(lo
) < new->offset
)
349 lo
->offset
= min(lo
->offset
, new->offset
);
350 layout_update_len(lo
, max(layout_end(lo
), layout_end(new)));
355 nfsd4_recall_conflict(struct nfs4_layout_stateid
*ls
)
357 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
358 struct nfs4_layout_stateid
*l
, *n
;
359 __be32 nfserr
= nfs_ok
;
361 assert_spin_locked(&fp
->fi_lock
);
363 list_for_each_entry_safe(l
, n
, &fp
->fi_lo_states
, ls_perfile
) {
365 nfsd4_recall_file_layout(l
);
366 nfserr
= nfserr_recallconflict
;
374 nfsd4_insert_layout(struct nfsd4_layoutget
*lgp
, struct nfs4_layout_stateid
*ls
)
376 struct nfsd4_layout_seg
*seg
= &lgp
->lg_seg
;
377 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
378 struct nfs4_layout
*lp
, *new = NULL
;
381 spin_lock(&fp
->fi_lock
);
382 nfserr
= nfsd4_recall_conflict(ls
);
385 spin_lock(&ls
->ls_lock
);
386 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
387 if (layouts_try_merge(&lp
->lo_seg
, seg
))
390 spin_unlock(&ls
->ls_lock
);
391 spin_unlock(&fp
->fi_lock
);
393 new = kmem_cache_alloc(nfs4_layout_cache
, GFP_KERNEL
);
395 return nfserr_jukebox
;
396 memcpy(&new->lo_seg
, seg
, sizeof(lp
->lo_seg
));
399 spin_lock(&fp
->fi_lock
);
400 nfserr
= nfsd4_recall_conflict(ls
);
403 spin_lock(&ls
->ls_lock
);
404 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
405 if (layouts_try_merge(&lp
->lo_seg
, seg
))
409 atomic_inc(&ls
->ls_stid
.sc_count
);
410 list_add_tail(&new->lo_perstate
, &ls
->ls_layouts
);
413 nfs4_inc_and_copy_stateid(&lgp
->lg_sid
, &ls
->ls_stid
);
414 spin_unlock(&ls
->ls_lock
);
416 spin_unlock(&fp
->fi_lock
);
418 kmem_cache_free(nfs4_layout_cache
, new);
423 nfsd4_free_layouts(struct list_head
*reaplist
)
425 while (!list_empty(reaplist
)) {
426 struct nfs4_layout
*lp
= list_first_entry(reaplist
,
427 struct nfs4_layout
, lo_perstate
);
429 list_del(&lp
->lo_perstate
);
430 nfs4_put_stid(&lp
->lo_state
->ls_stid
);
431 kmem_cache_free(nfs4_layout_cache
, lp
);
436 nfsd4_return_file_layout(struct nfs4_layout
*lp
, struct nfsd4_layout_seg
*seg
,
437 struct list_head
*reaplist
)
439 struct nfsd4_layout_seg
*lo
= &lp
->lo_seg
;
440 u64 end
= layout_end(lo
);
442 if (seg
->offset
<= lo
->offset
) {
443 if (layout_end(seg
) >= end
) {
444 list_move_tail(&lp
->lo_perstate
, reaplist
);
447 lo
->offset
= layout_end(seg
);
449 /* retain the whole layout segment on a split. */
450 if (layout_end(seg
) < end
) {
451 dprintk("%s: split not supported\n", __func__
);
457 layout_update_len(lo
, end
);
461 nfsd4_return_file_layouts(struct svc_rqst
*rqstp
,
462 struct nfsd4_compound_state
*cstate
,
463 struct nfsd4_layoutreturn
*lrp
)
465 struct nfs4_layout_stateid
*ls
;
466 struct nfs4_layout
*lp
, *n
;
471 nfserr
= nfsd4_preprocess_layout_stateid(rqstp
, cstate
, &lrp
->lr_sid
,
472 false, lrp
->lr_layout_type
,
475 trace_layout_return_lookup_fail(&lrp
->lr_sid
);
479 spin_lock(&ls
->ls_lock
);
480 list_for_each_entry_safe(lp
, n
, &ls
->ls_layouts
, lo_perstate
) {
481 if (layouts_overlapping(lp
, &lrp
->lr_seg
)) {
482 nfsd4_return_file_layout(lp
, &lrp
->lr_seg
, &reaplist
);
486 if (!list_empty(&ls
->ls_layouts
)) {
488 nfs4_inc_and_copy_stateid(&lrp
->lr_sid
, &ls
->ls_stid
);
489 lrp
->lrs_present
= 1;
491 trace_layoutstate_unhash(&ls
->ls_stid
.sc_stateid
);
492 nfs4_unhash_stid(&ls
->ls_stid
);
493 lrp
->lrs_present
= 0;
495 spin_unlock(&ls
->ls_lock
);
497 mutex_unlock(&ls
->ls_mutex
);
498 nfs4_put_stid(&ls
->ls_stid
);
499 nfsd4_free_layouts(&reaplist
);
504 nfsd4_return_client_layouts(struct svc_rqst
*rqstp
,
505 struct nfsd4_compound_state
*cstate
,
506 struct nfsd4_layoutreturn
*lrp
)
508 struct nfs4_layout_stateid
*ls
, *n
;
509 struct nfs4_client
*clp
= cstate
->clp
;
510 struct nfs4_layout
*lp
, *t
;
513 lrp
->lrs_present
= 0;
515 spin_lock(&clp
->cl_lock
);
516 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
) {
517 if (ls
->ls_layout_type
!= lrp
->lr_layout_type
)
520 if (lrp
->lr_return_type
== RETURN_FSID
&&
521 !fh_fsid_match(&ls
->ls_stid
.sc_file
->fi_fhandle
,
522 &cstate
->current_fh
.fh_handle
))
525 spin_lock(&ls
->ls_lock
);
526 list_for_each_entry_safe(lp
, t
, &ls
->ls_layouts
, lo_perstate
) {
527 if (lrp
->lr_seg
.iomode
== IOMODE_ANY
||
528 lrp
->lr_seg
.iomode
== lp
->lo_seg
.iomode
)
529 list_move_tail(&lp
->lo_perstate
, &reaplist
);
531 spin_unlock(&ls
->ls_lock
);
533 spin_unlock(&clp
->cl_lock
);
535 nfsd4_free_layouts(&reaplist
);
540 nfsd4_return_all_layouts(struct nfs4_layout_stateid
*ls
,
541 struct list_head
*reaplist
)
543 spin_lock(&ls
->ls_lock
);
544 list_splice_init(&ls
->ls_layouts
, reaplist
);
545 spin_unlock(&ls
->ls_lock
);
549 nfsd4_return_all_client_layouts(struct nfs4_client
*clp
)
551 struct nfs4_layout_stateid
*ls
, *n
;
554 spin_lock(&clp
->cl_lock
);
555 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
)
556 nfsd4_return_all_layouts(ls
, &reaplist
);
557 spin_unlock(&clp
->cl_lock
);
559 nfsd4_free_layouts(&reaplist
);
563 nfsd4_return_all_file_layouts(struct nfs4_client
*clp
, struct nfs4_file
*fp
)
565 struct nfs4_layout_stateid
*ls
, *n
;
568 spin_lock(&fp
->fi_lock
);
569 list_for_each_entry_safe(ls
, n
, &fp
->fi_lo_states
, ls_perfile
) {
570 if (ls
->ls_stid
.sc_client
== clp
)
571 nfsd4_return_all_layouts(ls
, &reaplist
);
573 spin_unlock(&fp
->fi_lock
);
575 nfsd4_free_layouts(&reaplist
);
579 nfsd4_cb_layout_fail(struct nfs4_layout_stateid
*ls
)
581 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
582 char addr_str
[INET6_ADDRSTRLEN
];
583 static char *envp
[] = {
586 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
592 rpc_ntop((struct sockaddr
*)&clp
->cl_addr
, addr_str
, sizeof(addr_str
));
594 trace_layout_recall_fail(&ls
->ls_stid
.sc_stateid
);
597 "nfsd: client %s failed to respond to layout recall. "
598 " Fencing..\n", addr_str
);
600 argv
[0] = "/sbin/nfsd-recall-failed";
602 argv
[2] = ls
->ls_file
->f_path
.mnt
->mnt_sb
->s_id
;
605 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
607 printk(KERN_ERR
"nfsd: fence failed for client %s: %d!\n",
613 nfsd4_cb_layout_prepare(struct nfsd4_callback
*cb
)
615 struct nfs4_layout_stateid
*ls
=
616 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
618 mutex_lock(&ls
->ls_mutex
);
619 nfs4_inc_and_copy_stateid(&ls
->ls_recall_sid
, &ls
->ls_stid
);
620 mutex_unlock(&ls
->ls_mutex
);
624 nfsd4_cb_layout_done(struct nfsd4_callback
*cb
, struct rpc_task
*task
)
626 struct nfs4_layout_stateid
*ls
=
627 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
630 switch (task
->tk_status
) {
633 case -NFS4ERR_NOMATCHING_LAYOUT
:
634 trace_layout_recall_done(&ls
->ls_stid
.sc_stateid
);
638 /* Poll the client until it's done with the layout */
639 /* FIXME: cap number of retries.
640 * The pnfs standard states that we need to only expire
641 * the client after at-least "lease time" .eg lease-time * 2
642 * when failing to communicate a recall
644 rpc_delay(task
, HZ
/100); /* 10 mili-seconds */
648 * Unknown error or non-responding client, we'll need to fence.
650 nfsd4_cb_layout_fail(ls
);
656 nfsd4_cb_layout_release(struct nfsd4_callback
*cb
)
658 struct nfs4_layout_stateid
*ls
=
659 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
662 trace_layout_recall_release(&ls
->ls_stid
.sc_stateid
);
664 nfsd4_return_all_layouts(ls
, &reaplist
);
665 nfsd4_free_layouts(&reaplist
);
666 nfs4_put_stid(&ls
->ls_stid
);
669 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
= {
670 .prepare
= nfsd4_cb_layout_prepare
,
671 .done
= nfsd4_cb_layout_done
,
672 .release
= nfsd4_cb_layout_release
,
676 nfsd4_layout_lm_break(struct file_lock
*fl
)
679 * We don't want the locks code to timeout the lease for us;
680 * we'll remove it ourself if a layout isn't returned
683 fl
->fl_break_time
= 0;
684 nfsd4_recall_file_layout(fl
->fl_owner
);
689 nfsd4_layout_lm_change(struct file_lock
*onlist
, int arg
,
690 struct list_head
*dispose
)
692 BUG_ON(!(arg
& F_UNLCK
));
693 return lease_modify(onlist
, arg
, dispose
);
696 static const struct lock_manager_operations nfsd4_layouts_lm_ops
= {
697 .lm_break
= nfsd4_layout_lm_break
,
698 .lm_change
= nfsd4_layout_lm_change
,
702 nfsd4_init_pnfs(void)
706 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++)
707 INIT_LIST_HEAD(&nfsd_devid_hash
[i
]);
709 nfs4_layout_cache
= kmem_cache_create("nfs4_layout",
710 sizeof(struct nfs4_layout
), 0, 0, NULL
);
711 if (!nfs4_layout_cache
)
714 nfs4_layout_stateid_cache
= kmem_cache_create("nfs4_layout_stateid",
715 sizeof(struct nfs4_layout_stateid
), 0, 0, NULL
);
716 if (!nfs4_layout_stateid_cache
) {
717 kmem_cache_destroy(nfs4_layout_cache
);
724 nfsd4_exit_pnfs(void)
728 kmem_cache_destroy(nfs4_layout_cache
);
729 kmem_cache_destroy(nfs4_layout_stateid_cache
);
731 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
732 struct nfsd4_deviceid_map
*map
, *n
;
734 list_for_each_entry_safe(map
, n
, &nfsd_devid_hash
[i
], hash
)