2 * Copyright (c) 2014 Christoph Hellwig.
4 #include <linux/kmod.h>
5 #include <linux/file.h>
6 #include <linux/jhash.h>
7 #include <linux/sched.h>
8 #include <linux/sunrpc/addr.h>
/*
 * Debug facility for this file: NFSDDBG_PNFS.  (Presumably picked up by the
 * dprintk() call used further down -- confirm against the nfsd debug header.)
 */
14 #define NFSDDBG_FACILITY NFSDDBG_PNFS
/*
 * Members of one layout record:
 *   lo_perstate - list linkage; records are chained on ls->ls_layouts or on
 *                 a caller's reap list through this member.
 *   lo_state    - owning layout stateid; nfsd4_free_layouts() drops a stateid
 *                 reference through it.
 *   lo_seg      - the segment (iomode/offset/length) this record covers.
 */
17 struct list_head lo_perstate
;
18 struct nfs4_layout_stateid
*lo_state
;
19 struct nfsd4_layout_seg lo_seg
;
/* Slab caches; created in nfsd4_init_pnfs(), destroyed in nfsd4_exit_pnfs(). */
22 static struct kmem_cache
*nfs4_layout_cache
;
23 static struct kmem_cache
*nfs4_layout_stateid_cache
;
/* Forward declarations; both tables are initialised near the end of the file. */
25 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
;
26 static const struct lock_manager_operations nfsd4_layouts_lm_ops
;
/*
 * Layout-type dispatch table, sized LAYOUT_TYPE_MAX.  The only entry visible
 * here is LAYOUT_BLOCK_VOLUME, which points at bl_layout_ops.
 */
28 const struct nfsd4_layout_ops
*nfsd4_layout_ops
[LAYOUT_TYPE_MAX
] = {
29 [LAYOUT_BLOCK_VOLUME
] = &bl_layout_ops
,
32 /* pNFS device ID to export fsid mapping */
/* 8 hash bits give 256 buckets; DEVID_HASH_MASK reduces a hash to a bucket. */
33 #define DEVID_HASH_BITS 8
34 #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35 #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
/* Next index to hand out; nfsd4_alloc_devid_map() post-increments it. */
36 static u64 nfsd_devid_seq
= 1;
/* Bucket heads; each one is initialised in nfsd4_init_pnfs(). */
37 static struct list_head nfsd_devid_hash
[DEVID_HASH_SIZE
];
/* Held by nfsd4_alloc_devid_map() while it scans and inserts into the table. */
38 static DEFINE_SPINLOCK(nfsd_devid_lock
);
/*
 * Pick the hash bucket for a device-map index: jhash_2words() over idx and
 * idx >> 32 with initval 0, reduced with DEVID_HASH_MASK.
 */
40 static inline u32
devid_hashfn(u64 idx
)
42 return jhash_2words(idx
, idx
>> 32, 0) & DEVID_HASH_MASK
;
/*
 * Attach a device-id map to the export behind @fhp.
 *
 * A candidate map is allocated up front (zeroed, GFP_KERNEL) with room for
 * the fsid bytes after the structure, and filled with the fsid type and fsid
 * copied from the file handle.  Then, under nfsd_devid_lock:
 *   - the export is checked for an already attached map;
 *   - every bucket is scanned for an existing entry with the same fsid type
 *     and the same fsid bytes, and if one is found the export is pointed at
 *     it;
 *   - otherwise the candidate gets the next nfsd_devid_seq value, is added
 *     to its bucket with list_add_tail_rcu() and is attached to the export.
 *
 * NOTE(review): the allocation-failure check, the branch targets of the
 * tests below and the release of an unused candidate are not visible in
 * this listing -- confirm against the full source.
 */
46 nfsd4_alloc_devid_map(const struct svc_fh
*fhp
)
48 const struct knfsd_fh
*fh
= &fhp
->fh_handle
;
49 size_t fsid_len
= key_len(fh
->fh_fsid_type
);
50 struct nfsd4_deviceid_map
*map
, *old
;
53 map
= kzalloc(sizeof(*map
) + fsid_len
, GFP_KERNEL
);
57 map
->fsid_type
= fh
->fh_fsid_type
;
58 memcpy(&map
->fsid
, fh
->fh_fsid
, fsid_len
);
/* From here to the spin_unlock() below the hash table is protected. */
60 spin_lock(&nfsd_devid_lock
);
61 if (fhp
->fh_export
->ex_devid_map
)
/* Full-table scan: entries are bucketed by idx, not by fsid. */
64 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
65 list_for_each_entry(old
, &nfsd_devid_hash
[i
], hash
) {
66 if (old
->fsid_type
!= fh
->fh_fsid_type
)
68 if (memcmp(old
->fsid
, fh
->fh_fsid
,
69 key_len(old
->fsid_type
)))
72 fhp
->fh_export
->ex_devid_map
= old
;
/* Publish the new entry under a fresh index. */
77 map
->idx
= nfsd_devid_seq
++;
78 list_add_tail_rcu(&map
->hash
, &nfsd_devid_hash
[devid_hashfn(map
->idx
)]);
79 fhp
->fh_export
->ex_devid_map
= map
;
83 spin_unlock(&nfsd_devid_lock
);
/*
 * Look up a device-id map by index.  Walks the single bucket chosen by
 * devid_hashfn(idx) using the RCU list iterator; ret starts out NULL.
 *
 * NOTE(review): the match test, the RCU read-side locking and the return
 * statement are not visible in this listing.  Also note that @idx is an int
 * here while map->idx is assigned from the u64 nfsd_devid_seq counter and
 * devid_hashfn() takes a u64 -- confirm the narrowing is intended.
 */
87 struct nfsd4_deviceid_map
*
88 nfsd4_find_devid_map(int idx
)
90 struct nfsd4_deviceid_map
*map
, *ret
= NULL
;
93 list_for_each_entry_rcu(map
, &nfsd_devid_hash
[devid_hashfn(idx
)], hash
)
/*
 * Fill in @id for the export behind @fhp.
 *
 * If the export has no device-id map yet, nfsd4_alloc_devid_map() is called
 * and the pointer is tested again (the allocation may not have attached
 * one).  On success id->fsid_idx is taken from the export's map and
 * id->generation from @device_generation.
 *
 * NOTE(review): the return type and the value returned on either path are
 * not visible in this listing.
 */
102 nfsd4_set_deviceid(struct nfsd4_deviceid
*id
, const struct svc_fh
*fhp
,
103 u32 device_generation
)
105 if (!fhp
->fh_export
->ex_devid_map
) {
106 nfsd4_alloc_devid_map(fhp
);
107 if (!fhp
->fh_export
->ex_devid_map
)
111 id
->fsid_idx
= fhp
->fh_export
->ex_devid_map
->idx
;
112 id
->generation
= device_generation
;
/*
 * Decide which layout type, if any, an export offers.
 *
 * The export must carry the NFSEXP_PNFS flag (the action taken when it does
 * not is not visible here -- presumably an early return).  When the
 * superblock's export operations provide get_uuid, map_blocks and
 * commit_blocks, ex_layout_type is set to LAYOUT_BLOCK_VOLUME.
 *
 * NOTE(review): sb->s_export_op is dereferenced without a visible NULL
 * check -- confirm that callers guarantee it is set.
 */
117 void nfsd4_setup_layout_type(struct svc_export
*exp
)
119 struct super_block
*sb
= exp
->ex_path
.mnt
->mnt_sb
;
121 if (!(exp
->ex_flags
& NFSEXP_PNFS
))
124 if (sb
->s_export_op
->get_uuid
&&
125 sb
->s_export_op
->map_blocks
&&
126 sb
->s_export_op
->commit_blocks
)
127 exp
->ex_layout_type
= LAYOUT_BLOCK_VOLUME
;
/*
 * Destructor for a layout stateid; nfsd4_alloc_layout_stateid() installs it
 * as stp->sc_free.
 *
 * Steps visible here, in order:
 *   1. trace the free;
 *   2. unlink from the client's list under clp->cl_lock;
 *   3. unlink from the file's list under fp->fi_lock;
 *   4. drop the lease with vfs_setlease(F_UNLCK), passing the address of ls
 *      as the final argument;
 *   5. decrement the file's fi_lo_recalls counter;
 *   6. return the object to nfs4_layout_stateid_cache.
 *
 * NOTE(review): any condition guarding step 5 (nfsd4_recall_file_layout()
 * increments the counter only when it marks the stateid recalled) and the
 * release of ls->ls_file are not visible in this listing.
 */
131 nfsd4_free_layout_stateid(struct nfs4_stid
*stid
)
133 struct nfs4_layout_stateid
*ls
= layoutstateid(stid
);
134 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
135 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
137 trace_layoutstate_free(&ls
->ls_stid
.sc_stateid
);
139 spin_lock(&clp
->cl_lock
);
140 list_del_init(&ls
->ls_perclnt
);
141 spin_unlock(&clp
->cl_lock
);
143 spin_lock(&fp
->fi_lock
);
144 list_del_init(&ls
->ls_perfile
);
145 spin_unlock(&fp
->fi_lock
);
147 vfs_setlease(ls
->ls_file
, F_UNLCK
, NULL
, (void **)&ls
);
151 atomic_dec(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
153 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
/*
 * Take a lease on the file backing layout stateid @ls.
 *
 * A file_lock is allocated and set up as a read lease (F_RDLCK) flagged
 * FL_LAYOUT, ending at OFFSET_MAX, with the pid taken from current->tgid,
 * the file taken from ls->ls_file, and nfsd4_layouts_lm_ops as its lock
 * manager operations.  It is then handed to vfs_setlease() and the result
 * is stored in status.
 *
 * NOTE(review): the allocation-failure check, the fl_owner assignment
 * (nfsd4_layout_lm_break() reads fl->fl_owner) and the handling of status
 * are not visible in this listing.
 */
157 nfsd4_layout_setlease(struct nfs4_layout_stateid
*ls
)
159 struct file_lock
*fl
;
162 fl
= locks_alloc_lock();
166 fl
->fl_lmops
= &nfsd4_layouts_lm_ops
;
167 fl
->fl_flags
= FL_LAYOUT
;
168 fl
->fl_type
= F_RDLCK
;
169 fl
->fl_end
= OFFSET_MAX
;
171 fl
->fl_pid
= current
->tgid
;
172 fl
->fl_file
= ls
->ls_file
;
174 status
= vfs_setlease(fl
->fl_file
, fl
->fl_type
, &fl
, NULL
);
/*
 * Create a layout stateid for the client in @cstate, derived from the
 * existing stateid @parent, for layout type @layout_type.
 *
 * The stid comes from nfs4_alloc_stid() backed by nfs4_layout_stateid_cache
 * and gets nfsd4_free_layout_stateid() as its destructor.  The list heads,
 * spinlock, mutex, layout type and recall callback are initialised.  The
 * backing file is either a new reference on the delegation file (when the
 * parent is a delegation stateid) or whatever find_any_file() returns; a
 * NULL file is treated as a bug.  A lease is then taken; if that fails the
 * object is freed straight back to the cache.  On success the stid type is
 * set and the stateid is linked onto the client's and the file's lists,
 * each under its own spinlock.
 *
 * NOTE(review): the NULL check after nfs4_alloc_stid(), the else keyword
 * between the two ls_file assignments, the rest of the lease-failure path
 * and the return statements are not visible in this listing.
 */
183 static struct nfs4_layout_stateid
*
184 nfsd4_alloc_layout_stateid(struct nfsd4_compound_state
*cstate
,
185 struct nfs4_stid
*parent
, u32 layout_type
)
187 struct nfs4_client
*clp
= cstate
->clp
;
188 struct nfs4_file
*fp
= parent
->sc_file
;
189 struct nfs4_layout_stateid
*ls
;
190 struct nfs4_stid
*stp
;
192 stp
= nfs4_alloc_stid(cstate
->clp
, nfs4_layout_stateid_cache
);
195 stp
->sc_free
= nfsd4_free_layout_stateid
;
199 ls
= layoutstateid(stp
);
200 INIT_LIST_HEAD(&ls
->ls_perclnt
);
201 INIT_LIST_HEAD(&ls
->ls_perfile
);
202 spin_lock_init(&ls
->ls_lock
);
203 INIT_LIST_HEAD(&ls
->ls_layouts
);
204 mutex_init(&ls
->ls_mutex
);
205 ls
->ls_layout_type
= layout_type
;
206 nfsd4_init_cb(&ls
->ls_recall
, clp
, &nfsd4_cb_layout_ops
,
207 NFSPROC4_CLNT_CB_LAYOUT
);
/* Choose the struct file the lease will be taken on. */
209 if (parent
->sc_type
== NFS4_DELEG_STID
)
210 ls
->ls_file
= get_file(fp
->fi_deleg_file
);
212 ls
->ls_file
= find_any_file(fp
);
213 BUG_ON(!ls
->ls_file
);
215 if (nfsd4_layout_setlease(ls
)) {
218 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
/* The type is set under cl_lock, together with the per-client linkage. */
222 spin_lock(&clp
->cl_lock
);
223 stp
->sc_type
= NFS4_LAYOUT_STID
;
224 list_add(&ls
->ls_perclnt
, &clp
->cl_lo_states
);
225 spin_unlock(&clp
->cl_lock
);
227 spin_lock(&fp
->fi_lock
);
228 list_add(&ls
->ls_perfile
, &fp
->fi_lo_states
);
229 spin_unlock(&fp
->fi_lock
);
231 trace_layoutstate_alloc(&ls
->ls_stid
.sc_stateid
);
/*
 * Resolve the client-supplied @stateid to a layout stateid in *@lsp.
 *
 * The lookup mask starts as NFS4_LAYOUT_STID and can be widened to open,
 * lock and delegation stateids (the condition is not visible here --
 * presumably @create).  After the lookup:
 *   - the stateid's file handle must match the current file handle, else
 *     status becomes nfserr_bad_stateid;
 *   - if the stateid found is not a layout stateid, a new layout stateid is
 *     allocated from it (nfserr_jukebox is the status set around that
 *     allocation) and its ls_mutex is taken;
 *   - otherwise the enclosing layout stateid is recovered with
 *     container_of(), its ls_mutex is taken, and the request is rejected
 *     through out_unlock_stid when the client's generation is ahead of the
 *     server's or when @layout_type differs from the stateid's type.
 *
 * NOTE(review): the labels, the success path that stores to *@lsp and the
 * reference handling are not visible in this listing; the mutex_unlock()
 * at the end presumably belongs to the out_unlock_stid path.
 */
236 nfsd4_preprocess_layout_stateid(struct svc_rqst
*rqstp
,
237 struct nfsd4_compound_state
*cstate
, stateid_t
*stateid
,
238 bool create
, u32 layout_type
, struct nfs4_layout_stateid
**lsp
)
240 struct nfs4_layout_stateid
*ls
;
241 struct nfs4_stid
*stid
;
242 unsigned char typemask
= NFS4_LAYOUT_STID
;
246 typemask
|= (NFS4_OPEN_STID
| NFS4_LOCK_STID
| NFS4_DELEG_STID
);
248 status
= nfsd4_lookup_stateid(cstate
, stateid
, typemask
, &stid
,
249 net_generic(SVC_NET(rqstp
), nfsd_net_id
));
253 if (!fh_match(&cstate
->current_fh
.fh_handle
,
254 &stid
->sc_file
->fi_fhandle
)) {
255 status
= nfserr_bad_stateid
;
259 if (stid
->sc_type
!= NFS4_LAYOUT_STID
) {
260 ls
= nfsd4_alloc_layout_stateid(cstate
, stid
, layout_type
);
263 status
= nfserr_jukebox
;
266 mutex_lock(&ls
->ls_mutex
);
268 ls
= container_of(stid
, struct nfs4_layout_stateid
, ls_stid
);
270 status
= nfserr_bad_stateid
;
271 mutex_lock(&ls
->ls_mutex
);
272 if (stateid
->si_generation
> stid
->sc_stateid
.si_generation
)
273 goto out_unlock_stid
;
274 if (layout_type
!= ls
->ls_layout_type
)
275 goto out_unlock_stid
;
282 mutex_unlock(&ls
->ls_mutex
);
/*
 * Start a recall of the layouts held under @ls.
 *
 * Under ls->ls_lock: the stateid is marked recalled, the file's
 * fi_lo_recalls counter is incremented, the layout list is tested for
 * emptiness, the recall is traced, an extra reference is taken on the
 * stateid (sc_count) and the ls_recall callback is queued with
 * nfsd4_run_cb().
 *
 * NOTE(review): the test that prevents a second recall of an already
 * recalled stateid and the action taken when the layout list is empty are
 * not visible in this listing.
 */
290 nfsd4_recall_file_layout(struct nfs4_layout_stateid
*ls
)
292 spin_lock(&ls
->ls_lock
);
296 ls
->ls_recalled
= true;
297 atomic_inc(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
298 if (list_empty(&ls
->ls_layouts
))
301 trace_layout_recall(&ls
->ls_stid
.sc_stateid
);
303 atomic_inc(&ls
->ls_stid
.sc_count
);
304 nfsd4_run_cb(&ls
->ls_recall
);
307 spin_unlock(&ls
->ls_lock
);
/*
 * End offset of @seg, i.e. offset + length.  If the u64 addition wraps
 * (the sum is smaller than the offset) the result saturates to
 * NFS4_MAX_UINT64 instead.
 */
311 layout_end(struct nfsd4_layout_seg
*seg
)
313 u64 end
= seg
->offset
+ seg
->length
;
314 return end
>= seg
->offset
? end
: NFS4_MAX_UINT64
;
/*
 * Recompute lo->length from an end offset.  An @end of NFS4_MAX_UINT64 is
 * stored as a length of NFS4_MAX_UINT64; the other visible assignment sets
 * the length to @end minus lo->offset.
 *
 * NOTE(review): the else keyword between the two assignments is not
 * visible in this listing.
 */
318 layout_update_len(struct nfsd4_layout_seg
*lo
, u64 end
)
320 if (end
== NFS4_MAX_UINT64
)
321 lo
->length
= NFS4_MAX_UINT64
;
323 lo
->length
= end
- lo
->offset
;
/*
 * Test whether segment @s touches layout @lo.  Three disqualifying tests
 * are visible:
 *   - @s names a specific iomode (not IOMODE_ANY) that differs from the
 *     layout's iomode;
 *   - the layout ends at or before the start of @s;
 *   - @s ends at or before the start of the layout.
 *
 * NOTE(review): the values returned after each test and at the end are not
 * visible in this listing.
 */
327 layouts_overlapping(struct nfs4_layout
*lo
, struct nfsd4_layout_seg
*s
)
329 if (s
->iomode
!= IOMODE_ANY
&& s
->iomode
!= lo
->lo_seg
.iomode
)
331 if (layout_end(&lo
->lo_seg
) <= s
->offset
)
333 if (layout_end(s
) <= lo
->lo_seg
.offset
)
/*
 * Try to fold segment @new into the existing segment @lo.
 *
 * Three tests come first: differing iomodes, @new ending before @lo starts,
 * and @lo ending before @new starts.  The range comparisons are strict, so
 * segments that merely touch end-to-start pass them.  After the tests @lo
 * is widened to the union: its offset becomes the smaller of the two
 * offsets and its length is recomputed from the larger of the two end
 * offsets.
 *
 * NOTE(review): the values returned on the rejected and merged paths are
 * not visible in this listing; callers test the result as a boolean.
 */
339 layouts_try_merge(struct nfsd4_layout_seg
*lo
, struct nfsd4_layout_seg
*new)
341 if (lo
->iomode
!= new->iomode
)
343 if (layout_end(new) < lo
->offset
)
345 if (layout_end(lo
) < new->offset
)
348 lo
->offset
= min(lo
->offset
, new->offset
);
349 layout_update_len(lo
, max(layout_end(lo
), layout_end(new)));
/*
 * Walk the layout stateids on the file behind @ls, recalling layouts and
 * reporting nfserr_recallconflict when that happens; the result starts out
 * as nfs_ok.  The caller must hold fp->fi_lock (asserted).
 *
 * NOTE(review): the condition selecting which stateids on the list are
 * recalled, any early test on the file's recall state and the return
 * statement are not visible in this listing.
 */
354 nfsd4_recall_conflict(struct nfs4_layout_stateid
*ls
)
356 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
357 struct nfs4_layout_stateid
*l
, *n
;
358 __be32 nfserr
= nfs_ok
;
360 assert_spin_locked(&fp
->fi_lock
);
362 list_for_each_entry_safe(l
, n
, &fp
->fi_lo_states
, ls_perfile
) {
364 nfsd4_recall_file_layout(l
);
365 nfserr
= nfserr_recallconflict
;
/*
 * Record the segment granted by a LAYOUTGET (@lgp->lg_seg) under layout
 * stateid @ls.
 *
 * The work is done in two passes because the new record is allocated with
 * GFP_KERNEL while neither spinlock is held:
 *   pass 1 - under fp->fi_lock then ls->ls_lock: check for a recall
 *            conflict and try to merge the segment into an existing
 *            record; both locks are then dropped;
 *   alloc  - get a record from nfs4_layout_cache and copy the segment in
 *            (nfserr_jukebox is returned on the failure path);
 *   pass 2 - retake both locks and repeat the conflict check and the merge
 *            attempt, since the lists may have changed meanwhile.  If the
 *            record is still needed, a stateid reference is taken
 *            (sc_count) and the record is appended to ls->ls_layouts.
 * The stateid is then bumped and copied into lgp->lg_sid, the locks are
 * released, and a kmem_cache_free() of the record follows.
 *
 * NOTE(review): the branch targets after each conflict check and merge,
 * the assignment of new->lo_state, and the condition under which the final
 * kmem_cache_free() runs are not visible in this listing.
 */
373 nfsd4_insert_layout(struct nfsd4_layoutget
*lgp
, struct nfs4_layout_stateid
*ls
)
375 struct nfsd4_layout_seg
*seg
= &lgp
->lg_seg
;
376 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
377 struct nfs4_layout
*lp
, *new = NULL
;
/* Pass 1: no allocation yet. */
380 spin_lock(&fp
->fi_lock
);
381 nfserr
= nfsd4_recall_conflict(ls
);
384 spin_lock(&ls
->ls_lock
);
385 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
386 if (layouts_try_merge(&lp
->lo_seg
, seg
))
389 spin_unlock(&ls
->ls_lock
);
390 spin_unlock(&fp
->fi_lock
);
/* Both locks are dropped around the allocation. */
392 new = kmem_cache_alloc(nfs4_layout_cache
, GFP_KERNEL
);
394 return nfserr_jukebox
;
395 memcpy(&new->lo_seg
, seg
, sizeof(lp
->lo_seg
));
/* Pass 2: re-validate under the locks before inserting. */
398 spin_lock(&fp
->fi_lock
);
399 nfserr
= nfsd4_recall_conflict(ls
);
402 spin_lock(&ls
->ls_lock
);
403 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
404 if (layouts_try_merge(&lp
->lo_seg
, seg
))
408 atomic_inc(&ls
->ls_stid
.sc_count
);
409 list_add_tail(&new->lo_perstate
, &ls
->ls_layouts
);
412 nfs4_inc_and_copy_stateid(&lgp
->lg_sid
, &ls
->ls_stid
);
413 spin_unlock(&ls
->ls_lock
);
415 spin_unlock(&fp
->fi_lock
);
417 kmem_cache_free(nfs4_layout_cache
, new);
/*
 * Dispose of every layout record on @reaplist.  Each record is unlinked,
 * the reference on its owning stateid is dropped with nfs4_put_stid(), and
 * the record goes back to nfs4_layout_cache.  The callers in this file
 * invoke it after releasing their spinlocks.
 */
422 nfsd4_free_layouts(struct list_head
*reaplist
)
424 while (!list_empty(reaplist
)) {
425 struct nfs4_layout
*lp
= list_first_entry(reaplist
,
426 struct nfs4_layout
, lo_perstate
);
428 list_del(&lp
->lo_perstate
);
429 nfs4_put_stid(&lp
->lo_state
->ls_stid
);
430 kmem_cache_free(nfs4_layout_cache
, lp
);
/*
 * Apply a returned range @seg to one layout record @lp.
 *
 * end is the record's end offset before any change.
 *   - @seg starts at or before the record: if it also reaches the record's
 *     end, the whole record is moved to @reaplist; otherwise the record's
 *     start is advanced to the end of @seg.
 *   - In the other visible branch a return that stops short of the
 *     record's end would split the record; that is not supported, a debug
 *     message is logged and (per the existing comment) the whole segment
 *     is kept.
 * The length is finally recomputed from an end offset via
 * layout_update_len().
 *
 * NOTE(review): the early returns, the else branch boundaries and any
 * adjustment of end when only the tail is returned are not visible in this
 * listing.
 */
435 nfsd4_return_file_layout(struct nfs4_layout
*lp
, struct nfsd4_layout_seg
*seg
,
436 struct list_head
*reaplist
)
438 struct nfsd4_layout_seg
*lo
= &lp
->lo_seg
;
439 u64 end
= layout_end(lo
);
441 if (seg
->offset
<= lo
->offset
) {
442 if (layout_end(seg
) >= end
) {
443 list_move_tail(&lp
->lo_perstate
, reaplist
);
446 lo
->offset
= layout_end(seg
);
448 /* retain the whole layout segment on a split. */
449 if (layout_end(seg
) < end
) {
450 dprintk("%s: split not supported\n", __func__
);
456 layout_update_len(lo
, end
);
/*
 * Handle a file-scoped LAYOUTRETURN.
 *
 * The stateid in @lrp is resolved with nfsd4_preprocess_layout_stateid()
 * (create = false); a lookup failure is traced.  Under ls->ls_lock every
 * record overlapping lrp->lr_seg is trimmed or queued for reaping.  If
 * records remain, the stateid is bumped into lrp->lr_sid and lrs_present
 * is set to 1; the other visible branch traces, unhashes the stateid and
 * sets lrs_present to 0.  Afterwards ls_mutex is released, the reference
 * on the stateid is dropped and the reaped records are freed outside the
 * spinlock.
 *
 * NOTE(review): the remaining arguments of the preprocess call, the
 * declaration of reaplist, the else keyword between the two lrs_present
 * branches and the return statements are not visible in this listing.
 */
460 nfsd4_return_file_layouts(struct svc_rqst
*rqstp
,
461 struct nfsd4_compound_state
*cstate
,
462 struct nfsd4_layoutreturn
*lrp
)
464 struct nfs4_layout_stateid
*ls
;
465 struct nfs4_layout
*lp
, *n
;
470 nfserr
= nfsd4_preprocess_layout_stateid(rqstp
, cstate
, &lrp
->lr_sid
,
471 false, lrp
->lr_layout_type
,
474 trace_layout_return_lookup_fail(&lrp
->lr_sid
);
478 spin_lock(&ls
->ls_lock
);
479 list_for_each_entry_safe(lp
, n
, &ls
->ls_layouts
, lo_perstate
) {
480 if (layouts_overlapping(lp
, &lrp
->lr_seg
)) {
481 nfsd4_return_file_layout(lp
, &lrp
->lr_seg
, &reaplist
);
485 if (!list_empty(&ls
->ls_layouts
)) {
487 nfs4_inc_and_copy_stateid(&lrp
->lr_sid
, &ls
->ls_stid
);
488 lrp
->lrs_present
= 1;
490 trace_layoutstate_unhash(&ls
->ls_stid
.sc_stateid
);
491 nfs4_unhash_stid(&ls
->ls_stid
);
492 lrp
->lrs_present
= 0;
494 spin_unlock(&ls
->ls_lock
);
496 mutex_unlock(&ls
->ls_mutex
);
497 nfs4_put_stid(&ls
->ls_stid
);
498 nfsd4_free_layouts(&reaplist
);
/*
 * Handle a LAYOUTRETURN that is not scoped to a single file.
 *
 * lrs_present is cleared.  Under clp->cl_lock each layout stateid of the
 * client is examined: its layout type is compared with
 * lrp->lr_layout_type, and for RETURN_FSID requests its file handle is
 * compared by fsid with the current file handle.  For the stateids that
 * are processed, every record whose iomode matches the request (or every
 * record when the request says IOMODE_ANY) is moved to the reap list under
 * ls->ls_lock.  The reaped records are freed after cl_lock is dropped.
 *
 * NOTE(review): the actions following the two filter tests (presumably
 * continue) and the declaration of reaplist are not visible in this
 * listing.
 */
503 nfsd4_return_client_layouts(struct svc_rqst
*rqstp
,
504 struct nfsd4_compound_state
*cstate
,
505 struct nfsd4_layoutreturn
*lrp
)
507 struct nfs4_layout_stateid
*ls
, *n
;
508 struct nfs4_client
*clp
= cstate
->clp
;
509 struct nfs4_layout
*lp
, *t
;
512 lrp
->lrs_present
= 0;
514 spin_lock(&clp
->cl_lock
);
515 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
) {
516 if (ls
->ls_layout_type
!= lrp
->lr_layout_type
)
519 if (lrp
->lr_return_type
== RETURN_FSID
&&
520 !fh_fsid_match(&ls
->ls_stid
.sc_file
->fi_fhandle
,
521 &cstate
->current_fh
.fh_handle
))
524 spin_lock(&ls
->ls_lock
);
525 list_for_each_entry_safe(lp
, t
, &ls
->ls_layouts
, lo_perstate
) {
526 if (lrp
->lr_seg
.iomode
== IOMODE_ANY
||
527 lrp
->lr_seg
.iomode
== lp
->lo_seg
.iomode
)
528 list_move_tail(&lp
->lo_perstate
, &reaplist
);
530 spin_unlock(&ls
->ls_lock
);
532 spin_unlock(&clp
->cl_lock
);
534 nfsd4_free_layouts(&reaplist
);
/*
 * Move every layout record of @ls onto @reaplist in one step, under
 * ls->ls_lock, leaving ls->ls_layouts empty.  Freeing is left to the
 * caller.
 */
539 nfsd4_return_all_layouts(struct nfs4_layout_stateid
*ls
,
540 struct list_head
*reaplist
)
542 spin_lock(&ls
->ls_lock
);
543 list_splice_init(&ls
->ls_layouts
, reaplist
);
544 spin_unlock(&ls
->ls_lock
);
/*
 * Drop every layout held by client @clp.  Under clp->cl_lock the records
 * of each of the client's layout stateids are collected onto a local reap
 * list, which is freed after the lock is released.
 *
 * NOTE(review): the declaration of reaplist is not visible in this listing.
 */
548 nfsd4_return_all_client_layouts(struct nfs4_client
*clp
)
550 struct nfs4_layout_stateid
*ls
, *n
;
553 spin_lock(&clp
->cl_lock
);
554 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
)
555 nfsd4_return_all_layouts(ls
, &reaplist
);
556 spin_unlock(&clp
->cl_lock
);
558 nfsd4_free_layouts(&reaplist
);
/*
 * Drop every layout that client @clp holds on file @fp.  Under fp->fi_lock
 * the file's layout stateids are walked and the records of those owned by
 * @clp are collected onto a local reap list, which is freed after the lock
 * is released.
 *
 * NOTE(review): the declaration of reaplist is not visible in this listing.
 */
562 nfsd4_return_all_file_layouts(struct nfs4_client
*clp
, struct nfs4_file
*fp
)
564 struct nfs4_layout_stateid
*ls
, *n
;
567 spin_lock(&fp
->fi_lock
);
568 list_for_each_entry_safe(ls
, n
, &fp
->fi_lo_states
, ls_perfile
) {
569 if (ls
->ls_stid
.sc_client
== clp
)
570 nfsd4_return_all_layouts(ls
, &reaplist
);
572 spin_unlock(&fp
->fi_lock
);
574 nfsd4_free_layouts(&reaplist
);
/*
 * React to a client that did not answer a layout recall by fencing it.
 *
 * The client address is formatted with rpc_ntop(), the failure is traced
 * and logged, and the user-space helper /sbin/nfsd-recall-failed is run
 * through call_usermodehelper() with UMH_WAIT_PROC and a fixed environment
 * containing the PATH shown below.  argv[2] is the s_id of the superblock
 * backing ls->ls_file.  A second message is printed for the helper's error
 * result.
 *
 * NOTE(review): the other argv slots (argv[1] is presumably addr_str), the
 * remaining envp entries, the declarations of argv and error, and the test
 * guarding the final printk are not visible in this listing.
 */
578 nfsd4_cb_layout_fail(struct nfs4_layout_stateid
*ls
)
580 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
581 char addr_str
[INET6_ADDRSTRLEN
];
582 static char *envp
[] = {
585 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
591 rpc_ntop((struct sockaddr
*)&clp
->cl_addr
, addr_str
, sizeof(addr_str
));
593 trace_layout_recall_fail(&ls
->ls_stid
.sc_stateid
);
596 "nfsd: client %s failed to respond to layout recall. "
597 " Fencing..\n", addr_str
);
599 argv
[0] = "/sbin/nfsd-recall-failed";
601 argv
[2] = ls
->ls_file
->f_path
.mnt
->mnt_sb
->s_id
;
604 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
606 printk(KERN_ERR
"nfsd: fence failed for client %s: %d!\n",
/*
 * .prepare hook of nfsd4_cb_layout_ops.  Recovers the layout stateid that
 * embeds @cb, takes its ls_mutex, and bumps the stateid while copying it
 * into ls_recall_sid.
 *
 * NOTE(review): no mutex_unlock() is visible in this function; the
 * matching unlock visible in this file is in nfsd4_cb_layout_release().
 * That would mean the mutex is held for the lifetime of the callback and
 * possibly released from a different context than the one that took it --
 * confirm against the mutex ownership rules.
 */
612 nfsd4_cb_layout_prepare(struct nfsd4_callback
*cb
)
614 struct nfs4_layout_stateid
*ls
=
615 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
617 mutex_lock(&ls
->ls_mutex
);
618 nfs4_inc_and_copy_stateid(&ls
->ls_recall_sid
, &ls
->ls_stid
);
/*
 * .done hook of nfsd4_cb_layout_ops.  Dispatches on task->tk_status:
 *   - the -NFS4ERR_NOMATCHING_LAYOUT case (and any cases sharing it)
 *     traces the recall as done;
 *   - one case re-arms the RPC task with rpc_delay(task, HZ/100) so the
 *     client is polled again; per the existing FIXME the number of
 *     retries is not capped;
 *   - the remaining path treats the client as unresponsive and fences it
 *     through nfsd4_cb_layout_fail().
 *
 * NOTE(review): the other case labels and the value returned from each
 * branch are not visible in this listing.
 */
622 nfsd4_cb_layout_done(struct nfsd4_callback
*cb
, struct rpc_task
*task
)
624 struct nfs4_layout_stateid
*ls
=
625 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
628 switch (task
->tk_status
) {
631 case -NFS4ERR_NOMATCHING_LAYOUT
:
632 trace_layout_recall_done(&ls
->ls_stid
.sc_stateid
);
636 /* Poll the client until it's done with the layout */
637 /* FIXME: cap number of retries.
638 * The pnfs standard states that we need to only expire
639 * the client after at-least "lease time" .eg lease-time * 2
640 * when failing to communicate a recall
642 rpc_delay(task
, HZ
/100); /* 10 milliseconds */
646 * Unknown error or non-responding client, we'll need to fence.
648 nfsd4_cb_layout_fail(ls
);
/*
 * .release hook of nfsd4_cb_layout_ops.  Traces the release, unlocks
 * ls_mutex (the lock visible in nfsd4_cb_layout_prepare()), moves all of
 * the stateid's layout records to a local reap list and frees them, then
 * drops a stateid reference -- presumably the one taken with
 * atomic_inc(sc_count) in nfsd4_recall_file_layout(); confirm.
 *
 * NOTE(review): the declaration of reaplist is not visible in this listing.
 */
654 nfsd4_cb_layout_release(struct nfsd4_callback
*cb
)
656 struct nfs4_layout_stateid
*ls
=
657 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
660 trace_layout_recall_release(&ls
->ls_stid
.sc_stateid
);
662 mutex_unlock(&ls
->ls_mutex
);
663 nfsd4_return_all_layouts(ls
, &reaplist
);
664 nfsd4_free_layouts(&reaplist
);
665 nfs4_put_stid(&ls
->ls_stid
);
/*
 * Callback operations for layout recalls; nfsd4_alloc_layout_stateid()
 * passes this table to nfsd4_init_cb() for ls->ls_recall.
 */
668 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
= {
669 .prepare
= nfsd4_cb_layout_prepare
,
670 .done
= nfsd4_cb_layout_done
,
671 .release
= nfsd4_cb_layout_release
,
/*
 * .lm_break hook of nfsd4_layouts_lm_ops.  Clears fl_break_time so the
 * generic lease code does not time the lease out, then starts a layout
 * recall on the stateid stored in fl->fl_owner.
 *
 * NOTE(review): the return type and return value are not visible in this
 * listing.
 */
675 nfsd4_layout_lm_break(struct file_lock
*fl
)
678 * We don't want the locks code to timeout the lease for us;
679 * we'll remove it ourself if a layout isn't returned
682 fl
->fl_break_time
= 0;
683 nfsd4_recall_file_layout(fl
->fl_owner
);
/*
 * .lm_change hook of nfsd4_layouts_lm_ops.  Treats any @arg without the
 * F_UNLCK bit as a bug, then defers to lease_modify() and returns its
 * result.
 */
688 nfsd4_layout_lm_change(struct file_lock
*onlist
, int arg
,
689 struct list_head
*dispose
)
691 BUG_ON(!(arg
& F_UNLCK
));
692 return lease_modify(onlist
, arg
, dispose
);
/*
 * Lock-manager operations installed on layout leases by
 * nfsd4_layout_setlease() through fl->fl_lmops.
 */
695 static const struct lock_manager_operations nfsd4_layouts_lm_ops
= {
696 .lm_break
= nfsd4_layout_lm_break
,
697 .lm_change
= nfsd4_layout_lm_change
,
/*
 * Module-level setup for pNFS support: initialises every bucket of the
 * device-id hash table, then creates the slab cache for layout records and
 * the one for layout stateids.  If the second cache cannot be created the
 * first one is destroyed again.
 *
 * NOTE(review): the return type and the values returned on success and on
 * either failure are not visible in this listing.
 */
701 nfsd4_init_pnfs(void)
705 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++)
706 INIT_LIST_HEAD(&nfsd_devid_hash
[i
]);
708 nfs4_layout_cache
= kmem_cache_create("nfs4_layout",
709 sizeof(struct nfs4_layout
), 0, 0, NULL
);
710 if (!nfs4_layout_cache
)
713 nfs4_layout_stateid_cache
= kmem_cache_create("nfs4_layout_stateid",
714 sizeof(struct nfs4_layout_stateid
), 0, 0, NULL
);
715 if (!nfs4_layout_stateid_cache
) {
716 kmem_cache_destroy(nfs4_layout_cache
);
723 nfsd4_exit_pnfs(void)
727 kmem_cache_destroy(nfs4_layout_cache
);
728 kmem_cache_destroy(nfs4_layout_stateid_cache
);
730 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
731 struct nfsd4_deviceid_map
*map
, *n
;
733 list_for_each_entry_safe(map
, n
, &nfsd_devid_hash
[i
], hash
)