2 * Copyright (c) 2014 Christoph Hellwig.
4 #include <linux/kmod.h>
5 #include <linux/file.h>
6 #include <linux/jhash.h>
7 #include <linux/sched.h>
8 #include <linux/sunrpc/addr.h>
14 #define NFSDDBG_FACILITY NFSDDBG_PNFS
/*
 * Three member declarations: a list link (lo_perstate), a pointer to a
 * layout stateid (lo_state) and a layout segment (lo_seg).
 * Later code reaches these members through struct nfs4_layout pointers and
 * links lo_perstate onto a stateid's ls_layouts list.
 * NOTE(review): the enclosing struct header is not part of this listing;
 * presumably this is the body of struct nfs4_layout -- confirm.
 */
17 struct list_head lo_perstate
;
18 struct nfs4_layout_stateid
*lo_state
;
19 struct nfsd4_layout_seg lo_seg
;
/*
 * File-scope state:
 *  - nfs4_layout_cache / nfs4_layout_stateid_cache: slab caches that
 *    nfsd4_init_pnfs() creates for struct nfs4_layout and
 *    struct nfs4_layout_stateid objects.
 *  - nfsd4_cb_layout_ops / nfsd4_layouts_lm_ops: forward declarations of
 *    the callback and lock-manager operation tables that are initialised
 *    further down in this file.
 */
22 static struct kmem_cache
*nfs4_layout_cache
;
23 static struct kmem_cache
*nfs4_layout_stateid_cache
;
25 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
;
26 static const struct lock_manager_operations nfsd4_layouts_lm_ops
;
/*
 * Table of layout operation vectors with LAYOUT_TYPE_MAX slots, indexed by
 * layout type.  The only initialiser visible here installs bl_layout_ops in
 * the LAYOUT_BLOCK_VOLUME slot.
 * NOTE(review): the closing of the initialiser is not part of this listing.
 */
28 const struct nfsd4_layout_ops
*nfsd4_layout_ops
[LAYOUT_TYPE_MAX
] = {
29 [LAYOUT_BLOCK_VOLUME
] = &bl_layout_ops
,
/*
 * Device-ID map bookkeeping:
 *  - the hash table has 1 << 8 = 256 buckets, and DEVID_HASH_MASK reduces a
 *    hash value to a bucket index;
 *  - nfsd_devid_seq is the next map index to hand out (starts at 1 and is
 *    post-incremented in nfsd4_alloc_devid_map());
 *  - nfsd_devid_lock is taken by nfsd4_alloc_devid_map() around its
 *    search-and-insert sequence.
 */
32 /* pNFS device ID to export fsid mapping */
33 #define DEVID_HASH_BITS 8
34 #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35 #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36 static u64 nfsd_devid_seq
= 1;
37 static struct list_head nfsd_devid_hash
[DEVID_HASH_SIZE
];
38 static DEFINE_SPINLOCK(nfsd_devid_lock
);
/*
 * Map a 64-bit device-map index to a bucket number.
 * Feeds idx and idx >> 32 to jhash_2words() with an initial value of 0 and
 * masks the result with DEVID_HASH_MASK, so the return value is always in
 * the range 0 .. DEVID_HASH_SIZE - 1.
 */
40 static inline u32
devid_hashfn(u64 idx
)
42 return jhash_2words(idx
, idx
>> 32, 0) & DEVID_HASH_MASK
;
/*
 * Attach a device-ID map to the export behind @fhp.
 *
 * Visible steps:
 *  1. Allocate a zeroed map with room for the fsid (key_len() of the file
 *     handle's fsid type) and copy the fsid type and fsid bytes into it.
 *     The allocation uses GFP_KERNEL and happens before the spinlock is
 *     taken.
 *  2. Under nfsd_devid_lock, test whether the export already has a map.
 *  3. Scan every bucket for an existing entry with the same fsid type and
 *     the same fsid bytes; such an entry is stored in the export.
 *  4. Give the new map the next sequence number, add it to its bucket
 *     with list_add_tail_rcu() and store it in the export.
 *
 * NOTE(review): several lines of this function are absent from this listing
 * (allocation-failure handling, the branch targets of the tests in steps
 * 2-3, the declaration of i, and what happens to an unused allocation), so
 * those paths are not documented here.
 */
46 nfsd4_alloc_devid_map(const struct svc_fh
*fhp
)
48 const struct knfsd_fh
*fh
= &fhp
->fh_handle
;
49 size_t fsid_len
= key_len(fh
->fh_fsid_type
);
50 struct nfsd4_deviceid_map
*map
, *old
;
53 map
= kzalloc(sizeof(*map
) + fsid_len
, GFP_KERNEL
);
57 map
->fsid_type
= fh
->fh_fsid_type
;
58 memcpy(&map
->fsid
, fh
->fh_fsid
, fsid_len
);
60 spin_lock(&nfsd_devid_lock
);
61 if (fhp
->fh_export
->ex_devid_map
)
64 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
65 list_for_each_entry(old
, &nfsd_devid_hash
[i
], hash
) {
66 if (old
->fsid_type
!= fh
->fh_fsid_type
)
68 if (memcmp(old
->fsid
, fh
->fh_fsid
,
69 key_len(old
->fsid_type
)))
72 fhp
->fh_export
->ex_devid_map
= old
;
77 map
->idx
= nfsd_devid_seq
++;
78 list_add_tail_rcu(&map
->hash
, &nfsd_devid_hash
[devid_hashfn(map
->idx
)]);
79 fhp
->fh_export
->ex_devid_map
= map
;
83 spin_unlock(&nfsd_devid_lock
);
/*
 * Look up a device-ID map by index.
 * Walks the bucket selected by devid_hashfn(idx) with the RCU list
 * iterator; ret starts out as NULL.
 *
 * NOTE(review): the loop body, any RCU read-side locking and the return
 * statement are absent from this listing.
 * NOTE(review): idx is an int here, while map indices come from the u64
 * nfsd_devid_seq and devid_hashfn() takes a u64 -- confirm that callers
 * never need more than the int range.
 */
87 struct nfsd4_deviceid_map
*
88 nfsd4_find_devid_map(int idx
)
90 struct nfsd4_deviceid_map
*map
, *ret
= NULL
;
93 list_for_each_entry_rcu(map
, &nfsd_devid_hash
[devid_hashfn(idx
)], hash
)
/*
 * Fill in device ID @id for the export behind @fhp.
 * If the export has no device-ID map yet, one is set up through
 * nfsd4_alloc_devid_map() and the pointer is tested again.  The map's idx
 * is then stored in id->fsid_idx and @device_generation in id->generation.
 *
 * NOTE(review): the return type, the handling of a still-missing map and
 * the return statements are absent from this listing.
 */
102 nfsd4_set_deviceid(struct nfsd4_deviceid
*id
, const struct svc_fh
*fhp
,
103 u32 device_generation
)
105 if (!fhp
->fh_export
->ex_devid_map
) {
106 nfsd4_alloc_devid_map(fhp
);
107 if (!fhp
->fh_export
->ex_devid_map
)
111 id
->fsid_idx
= fhp
->fh_export
->ex_devid_map
->idx
;
112 id
->generation
= device_generation
;
/*
 * Choose the layout type for export @exp.
 * Tests the NFSEXP_PNFS export flag, then sets ex_layout_type to
 * LAYOUT_BLOCK_VOLUME when the superblock's export operations provide all
 * of get_uuid, map_blocks and commit_blocks.
 *
 * NOTE(review): the statement executed when NFSEXP_PNFS is clear is absent
 * from this listing (presumably an early return).
 * NOTE(review): sb->s_export_op is dereferenced without a NULL check in
 * the visible lines -- confirm it is always set for exportable mounts.
 */
117 void nfsd4_setup_layout_type(struct svc_export
*exp
)
119 struct super_block
*sb
= exp
->ex_path
.mnt
->mnt_sb
;
121 if (!(exp
->ex_flags
& NFSEXP_PNFS
))
124 if (sb
->s_export_op
->get_uuid
&&
125 sb
->s_export_op
->map_blocks
&&
126 sb
->s_export_op
->commit_blocks
)
127 exp
->ex_layout_type
= LAYOUT_BLOCK_VOLUME
;
/*
 * Destructor for a layout stateid; nfsd4_alloc_layout_stateid() installs it
 * as the stid's sc_free hook.
 *
 * Visible steps: emit the layoutstate_free trace event, unlink the stateid
 * from the client's list under cl_lock and from the file's list under
 * fi_lock, drop the lease with vfs_setlease(F_UNLCK) passing &ls as the
 * final argument, decrement the file's fi_lo_recalls counter, and return
 * the object to nfs4_layout_stateid_cache.
 *
 * NOTE(review): lines between the lease removal and the final free are
 * partly absent from this listing, so any condition guarding the
 * fi_lo_recalls decrement and any release of ls_file cannot be seen here.
 */
131 nfsd4_free_layout_stateid(struct nfs4_stid
*stid
)
133 struct nfs4_layout_stateid
*ls
= layoutstateid(stid
);
134 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
135 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
137 trace_layoutstate_free(&ls
->ls_stid
.sc_stateid
);
139 spin_lock(&clp
->cl_lock
);
140 list_del_init(&ls
->ls_perclnt
);
141 spin_unlock(&clp
->cl_lock
);
143 spin_lock(&fp
->fi_lock
);
144 list_del_init(&ls
->ls_perfile
);
145 spin_unlock(&fp
->fi_lock
);
147 vfs_setlease(ls
->ls_file
, F_UNLCK
, NULL
, (void **)&ls
);
151 atomic_dec(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
153 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
/*
 * Install a layout lease on the file backing @ls.
 * Allocates a file_lock, marks it as a read-type (F_RDLCK) FL_LAYOUT lease
 * that extends to OFFSET_MAX, uses nfsd4_layouts_lm_ops as its
 * lock-manager operations, records the current tgid and the stateid's file,
 * and hands it to vfs_setlease(); the result is kept in status.
 *
 * NOTE(review): the return type, allocation-failure handling, any
 * assignment of fl_owner and the handling of status are absent from this
 * listing.
 */
157 nfsd4_layout_setlease(struct nfs4_layout_stateid
*ls
)
159 struct file_lock
*fl
;
162 fl
= locks_alloc_lock();
166 fl
->fl_lmops
= &nfsd4_layouts_lm_ops
;
167 fl
->fl_flags
= FL_LAYOUT
;
168 fl
->fl_type
= F_RDLCK
;
169 fl
->fl_end
= OFFSET_MAX
;
171 fl
->fl_pid
= current
->tgid
;
172 fl
->fl_file
= ls
->ls_file
;
174 status
= vfs_setlease(fl
->fl_file
, fl
->fl_type
, &fl
, NULL
);
/*
 * Create a layout stateid for the client in @cstate, derived from the
 * existing stateid @parent, for layout type @layout_type.
 *
 * Visible steps:
 *  - allocate a generic stid from nfs4_layout_stateid_cache and install
 *    nfsd4_free_layout_stateid() as its destructor;
 *  - initialise the list heads, the ls_lock spinlock, the layout type and
 *    the CB_LAYOUT recall callback;
 *  - pick the backing file: a new reference on the delegation file when
 *    @parent is a delegation stateid, and find_any_file() for the other
 *    assignment (the joining "else" line is absent here); the BUG_ON
 *    asserts that a file was found;
 *  - install the layout lease; when nfsd4_layout_setlease() returns
 *    non-zero, the object is handed back to the slab cache;
 *  - publish the stateid: set sc_type to NFS4_LAYOUT_STID and add it to
 *    the client's cl_lo_states under cl_lock, then to the file's
 *    fi_lo_states under fi_lock, and emit the layoutstate_alloc trace event.
 *
 * NOTE(review): allocation-failure handling, the remaining cleanup on lease
 * failure and the return statements are absent from this listing.
 */
183 static struct nfs4_layout_stateid
*
184 nfsd4_alloc_layout_stateid(struct nfsd4_compound_state
*cstate
,
185 struct nfs4_stid
*parent
, u32 layout_type
)
187 struct nfs4_client
*clp
= cstate
->clp
;
188 struct nfs4_file
*fp
= parent
->sc_file
;
189 struct nfs4_layout_stateid
*ls
;
190 struct nfs4_stid
*stp
;
192 stp
= nfs4_alloc_stid(cstate
->clp
, nfs4_layout_stateid_cache
);
195 stp
->sc_free
= nfsd4_free_layout_stateid
;
199 ls
= layoutstateid(stp
);
200 INIT_LIST_HEAD(&ls
->ls_perclnt
);
201 INIT_LIST_HEAD(&ls
->ls_perfile
);
202 spin_lock_init(&ls
->ls_lock
);
203 INIT_LIST_HEAD(&ls
->ls_layouts
);
204 ls
->ls_layout_type
= layout_type
;
205 nfsd4_init_cb(&ls
->ls_recall
, clp
, &nfsd4_cb_layout_ops
,
206 NFSPROC4_CLNT_CB_LAYOUT
);
208 if (parent
->sc_type
== NFS4_DELEG_STID
)
209 ls
->ls_file
= get_file(fp
->fi_deleg_file
);
211 ls
->ls_file
= find_any_file(fp
);
212 BUG_ON(!ls
->ls_file
);
214 if (nfsd4_layout_setlease(ls
)) {
217 kmem_cache_free(nfs4_layout_stateid_cache
, ls
);
221 spin_lock(&clp
->cl_lock
);
222 stp
->sc_type
= NFS4_LAYOUT_STID
;
223 list_add(&ls
->ls_perclnt
, &clp
->cl_lo_states
);
224 spin_unlock(&clp
->cl_lock
);
226 spin_lock(&fp
->fi_lock
);
227 list_add(&ls
->ls_perfile
, &fp
->fi_lo_states
);
228 spin_unlock(&fp
->fi_lock
);
230 trace_layoutstate_alloc(&ls
->ls_stid
.sc_stateid
);
/*
 * Resolve the client-supplied @stateid to a layout stateid.
 *
 * Visible steps:
 *  - look the stateid up with a type mask that always contains
 *    NFS4_LAYOUT_STID and can be widened to open, lock and delegation
 *    stateids (the condition for widening is absent from this listing;
 *    presumably it is @create);
 *  - set status to nfserr_bad_stateid when the current file handle does
 *    not match the stateid's file;
 *  - when the stateid found is not a layout stateid, build a new layout
 *    stateid from it with nfsd4_alloc_layout_stateid(); nfserr_jukebox is
 *    assigned next to that call (presumably for allocation failure);
 *  - otherwise convert the stid with container_of(), preset status to
 *    nfserr_bad_stateid, and compare the client's generation against the
 *    server's and @layout_type against the stateid's layout type.
 *
 * NOTE(review): the return type, the branch targets of the checks, the
 * store through @lsp and the return statement are absent from this listing.
 */
235 nfsd4_preprocess_layout_stateid(struct svc_rqst
*rqstp
,
236 struct nfsd4_compound_state
*cstate
, stateid_t
*stateid
,
237 bool create
, u32 layout_type
, struct nfs4_layout_stateid
**lsp
)
239 struct nfs4_layout_stateid
*ls
;
240 struct nfs4_stid
*stid
;
241 unsigned char typemask
= NFS4_LAYOUT_STID
;
245 typemask
|= (NFS4_OPEN_STID
| NFS4_LOCK_STID
| NFS4_DELEG_STID
);
247 status
= nfsd4_lookup_stateid(cstate
, stateid
, typemask
, &stid
,
248 net_generic(SVC_NET(rqstp
), nfsd_net_id
));
252 if (!fh_match(&cstate
->current_fh
.fh_handle
,
253 &stid
->sc_file
->fi_fhandle
)) {
254 status
= nfserr_bad_stateid
;
258 if (stid
->sc_type
!= NFS4_LAYOUT_STID
) {
259 ls
= nfsd4_alloc_layout_stateid(cstate
, stid
, layout_type
);
262 status
= nfserr_jukebox
;
266 ls
= container_of(stid
, struct nfs4_layout_stateid
, ls_stid
);
268 status
= nfserr_bad_stateid
;
269 if (stateid
->si_generation
> stid
->sc_stateid
.si_generation
)
271 if (layout_type
!= ls
->ls_layout_type
)
/*
 * Start a layout recall for @ls.
 * Under ls_lock: mark the stateid as recalled, bump the file's
 * fi_lo_recalls counter, test whether any layouts are outstanding, emit
 * the layout_recall trace event, take an extra reference on the stid
 * (sc_count), advance the stateid with update_stateid(), snapshot it into
 * ls_recall_sid and queue the recall callback with nfsd4_run_cb().
 *
 * NOTE(review): lines are absent from this listing around the start of the
 * locked region and after the list_empty() test, so any early-exit
 * conditions are not documented here.
 */
285 nfsd4_recall_file_layout(struct nfs4_layout_stateid
*ls
)
287 spin_lock(&ls
->ls_lock
);
291 ls
->ls_recalled
= true;
292 atomic_inc(&ls
->ls_stid
.sc_file
->fi_lo_recalls
);
293 if (list_empty(&ls
->ls_layouts
))
296 trace_layout_recall(&ls
->ls_stid
.sc_stateid
);
298 atomic_inc(&ls
->ls_stid
.sc_count
);
299 update_stateid(&ls
->ls_stid
.sc_stateid
);
300 memcpy(&ls
->ls_recall_sid
, &ls
->ls_stid
.sc_stateid
, sizeof(stateid_t
));
301 nfsd4_run_cb(&ls
->ls_recall
);
304 spin_unlock(&ls
->ls_lock
);
/*
 * Return the end offset (offset + length) of segment @seg.
 * If the addition wraps around, detected by the sum being smaller than the
 * offset, NFS4_MAX_UINT64 is returned instead.
 */
308 layout_end(struct nfsd4_layout_seg
*seg
)
310 u64 end
= seg
->offset
+ seg
->length
;
311 return end
>= seg
->offset
? end
: NFS4_MAX_UINT64
;
/*
 * Recompute lo->length so that the segment ends at @end.
 * An @end of NFS4_MAX_UINT64 gives a length of NFS4_MAX_UINT64; the other
 * assignment sets the length to end - lo->offset.
 * NOTE(review): the line joining the two assignments (presumably "else")
 * is absent from this listing.
 */
315 layout_update_len(struct nfsd4_layout_seg
*lo
, u64 end
)
317 if (end
== NFS4_MAX_UINT64
)
318 lo
->length
= NFS4_MAX_UINT64
;
320 lo
->length
= end
- lo
->offset
;
/*
 * Overlap test between an existing layout @lo and a segment @s.
 * Three conditions are evaluated in order:
 *  - @s names a specific iomode (not IOMODE_ANY) that differs from the
 *    layout's iomode;
 *  - the layout ends at or before the start of @s;
 *  - @s ends at or before the start of the layout.
 * NOTE(review): the return type and the statements executed for each
 * condition are absent from this listing; presumably each condition means
 * "no overlap".
 */
324 layouts_overlapping(struct nfs4_layout
*lo
, struct nfsd4_layout_seg
*s
)
326 if (s
->iomode
!= IOMODE_ANY
&& s
->iomode
!= lo
->lo_seg
.iomode
)
328 if (layout_end(&lo
->lo_seg
) <= s
->offset
)
330 if (layout_end(s
) <= lo
->lo_seg
.offset
)
/*
 * Try to fold segment @new into the existing segment @lo.
 * Three conditions are checked first: differing iomodes, @new ending
 * before @lo starts, and @lo ending before @new starts.  The visible
 * update then grows @lo to the union of both ranges: the offset becomes
 * the smaller of the two offsets, and the length is recomputed from the
 * larger of the two end offsets.
 * NOTE(review): the return type and the statements executed for the three
 * conditions are absent from this listing; presumably they reject the merge.
 */
336 layouts_try_merge(struct nfsd4_layout_seg
*lo
, struct nfsd4_layout_seg
*new)
338 if (lo
->iomode
!= new->iomode
)
340 if (layout_end(new) < lo
->offset
)
342 if (layout_end(lo
) < new->offset
)
345 lo
->offset
= min(lo
->offset
, new->offset
);
346 layout_update_len(lo
, max(layout_end(lo
), layout_end(new)));
/*
 * Recall layout stateids on the file of @ls.
 * The caller must hold fp->fi_lock (asserted).  Walks the file's
 * fi_lo_states list; nfsd4_recall_file_layout() is invoked on an entry and
 * nfserr is set to nfserr_recallconflict.  nfserr starts out as nfs_ok.
 * nfsd4_recall_file_layout() takes the entry's ls_lock, so ls_lock nests
 * inside fi_lock on this path.
 * NOTE(review): the return type, the test that selects which entries are
 * recalled and the return statement are absent from this listing.
 */
351 nfsd4_recall_conflict(struct nfs4_layout_stateid
*ls
)
353 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
354 struct nfs4_layout_stateid
*l
, *n
;
355 __be32 nfserr
= nfs_ok
;
357 assert_spin_locked(&fp
->fi_lock
);
359 list_for_each_entry_safe(l
, n
, &fp
->fi_lo_states
, ls_perfile
) {
361 nfsd4_recall_file_layout(l
);
362 nfserr
= nfserr_recallconflict
;
/*
 * Record the segment granted by LAYOUTGET (@lgp->lg_seg) on stateid @ls.
 *
 * The work is done in two passes because the new nfs4_layout is allocated
 * with GFP_KERNEL, which the code does outside the spinlocks:
 *  pass 1 (fi_lock, then ls_lock): check for a recall conflict and try to
 *         merge the segment into a layout already on ls_layouts;
 *  then both locks are dropped, a new layout is allocated
 *         (nfserr_jukebox is returned on the failure path) and the
 *         segment is copied into it;
 *  pass 2 (fi_lock, then ls_lock): repeat the conflict check and the
 *         merge attempt, since the lists can have changed while unlocked;
 *         then take a reference on the stid, append the new layout
 *         to ls_layouts, advance the stateid and copy it into lgp->lg_sid.
 * The final kmem_cache_free() releases the new layout.
 *
 * NOTE(review): the return type, the branch targets after the conflict
 * checks and successful merges, the assignment of new->lo_state and the
 * conditions under which the final free runs are absent from this listing.
 */
370 nfsd4_insert_layout(struct nfsd4_layoutget
*lgp
, struct nfs4_layout_stateid
*ls
)
372 struct nfsd4_layout_seg
*seg
= &lgp
->lg_seg
;
373 struct nfs4_file
*fp
= ls
->ls_stid
.sc_file
;
374 struct nfs4_layout
*lp
, *new = NULL
;
377 spin_lock(&fp
->fi_lock
);
378 nfserr
= nfsd4_recall_conflict(ls
);
381 spin_lock(&ls
->ls_lock
);
382 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
383 if (layouts_try_merge(&lp
->lo_seg
, seg
))
386 spin_unlock(&ls
->ls_lock
);
387 spin_unlock(&fp
->fi_lock
);
389 new = kmem_cache_alloc(nfs4_layout_cache
, GFP_KERNEL
);
391 return nfserr_jukebox
;
392 memcpy(&new->lo_seg
, seg
, sizeof(lp
->lo_seg
));
395 spin_lock(&fp
->fi_lock
);
396 nfserr
= nfsd4_recall_conflict(ls
);
399 spin_lock(&ls
->ls_lock
);
400 list_for_each_entry(lp
, &ls
->ls_layouts
, lo_perstate
) {
401 if (layouts_try_merge(&lp
->lo_seg
, seg
))
405 atomic_inc(&ls
->ls_stid
.sc_count
);
406 list_add_tail(&new->lo_perstate
, &ls
->ls_layouts
);
409 update_stateid(&ls
->ls_stid
.sc_stateid
);
410 memcpy(&lgp
->lg_sid
, &ls
->ls_stid
.sc_stateid
, sizeof(stateid_t
));
411 spin_unlock(&ls
->ls_lock
);
413 spin_unlock(&fp
->fi_lock
);
415 kmem_cache_free(nfs4_layout_cache
, new);
/*
 * Dispose of every layout on @reaplist.
 * Repeatedly takes the first entry, unlinks it, drops the stid reference
 * held through lp->lo_state with nfs4_put_stid() and returns the layout to
 * nfs4_layout_cache.  The list is empty when the loop finishes.
 * Callers in this file invoke it after releasing their spinlocks.
 */
420 nfsd4_free_layouts(struct list_head
*reaplist
)
422 while (!list_empty(reaplist
)) {
423 struct nfs4_layout
*lp
= list_first_entry(reaplist
,
424 struct nfs4_layout
, lo_perstate
);
426 list_del(&lp
->lo_perstate
);
427 nfs4_put_stid(&lp
->lo_state
->ls_stid
);
428 kmem_cache_free(nfs4_layout_cache
, lp
);
/*
 * Apply a returned range @seg to one layout @lp.
 * end is the layout's end offset before any change.
 *  - Returned range starts at or before the layout: if it also reaches the
 *    layout's end, the whole layout is moved to @reaplist; the other
 *    visible statement moves the layout's start up to the end of the
 *    returned range.
 *  - A returned range that ends before the layout's end is reported via
 *    dprintk as an unsupported split (the existing comment says the whole
 *    segment is retained in that case).
 *  - layout_update_len() recomputes the length for the retained part.
 * NOTE(review): the return type and several control-flow lines (returns,
 * else branches, closing braces) are absent from this listing, so the
 * exact nesting of the statements above cannot be confirmed here.
 */
433 nfsd4_return_file_layout(struct nfs4_layout
*lp
, struct nfsd4_layout_seg
*seg
,
434 struct list_head
*reaplist
)
436 struct nfsd4_layout_seg
*lo
= &lp
->lo_seg
;
437 u64 end
= layout_end(lo
);
439 if (seg
->offset
<= lo
->offset
) {
440 if (layout_end(seg
) >= end
) {
441 list_move_tail(&lp
->lo_perstate
, reaplist
);
444 lo
->offset
= layout_end(seg
);
446 /* retain the whole layout segment on a split. */
447 if (layout_end(seg
) < end
) {
448 dprintk("%s: split not supported\n", __func__
);
454 layout_update_len(lo
, end
);
/*
 * Handle a LAYOUTRETURN that targets a single file.
 *
 * Visible steps:
 *  - resolve lrp->lr_sid with nfsd4_preprocess_layout_stateid() (create is
 *    false); the layout_return_lookup_fail trace event belongs to the
 *    failure path;
 *  - under ls_lock, pass every layout that overlaps lrp->lr_seg to
 *    nfsd4_return_file_layout(), collecting fully returned layouts on
 *    reaplist;
 *  - if layouts remain, advance the stateid, copy it back into lrp->lr_sid
 *    and set lrs_present to 1; the other visible statements emit the
 *    layoutstate_unhash trace event, unhash the stid and set lrs_present to 0;
 *  - after unlocking, drop the stid reference and free the reaped layouts.
 *
 * NOTE(review): the return type, the declarations of nfserr and reaplist,
 * the tail of the preprocess call, the else line and the return statement
 * are absent from this listing.
 */
458 nfsd4_return_file_layouts(struct svc_rqst
*rqstp
,
459 struct nfsd4_compound_state
*cstate
,
460 struct nfsd4_layoutreturn
*lrp
)
462 struct nfs4_layout_stateid
*ls
;
463 struct nfs4_layout
*lp
, *n
;
468 nfserr
= nfsd4_preprocess_layout_stateid(rqstp
, cstate
, &lrp
->lr_sid
,
469 false, lrp
->lr_layout_type
,
472 trace_layout_return_lookup_fail(&lrp
->lr_sid
);
476 spin_lock(&ls
->ls_lock
);
477 list_for_each_entry_safe(lp
, n
, &ls
->ls_layouts
, lo_perstate
) {
478 if (layouts_overlapping(lp
, &lrp
->lr_seg
)) {
479 nfsd4_return_file_layout(lp
, &lrp
->lr_seg
, &reaplist
);
483 if (!list_empty(&ls
->ls_layouts
)) {
485 update_stateid(&ls
->ls_stid
.sc_stateid
);
486 memcpy(&lrp
->lr_sid
, &ls
->ls_stid
.sc_stateid
,
489 lrp
->lrs_present
= 1;
491 trace_layoutstate_unhash(&ls
->ls_stid
.sc_stateid
);
492 nfs4_unhash_stid(&ls
->ls_stid
);
493 lrp
->lrs_present
= 0;
495 spin_unlock(&ls
->ls_lock
);
497 nfs4_put_stid(&ls
->ls_stid
);
498 nfsd4_free_layouts(&reaplist
);
/*
 * Handle a LAYOUTRETURN that is not tied to one file.
 *
 * Sets lrs_present to 0, then walks the client's cl_lo_states list under
 * cl_lock.  Two filters are tested per stateid: a layout type that differs
 * from lrp->lr_layout_type and, for RETURN_FSID, a file handle whose fsid
 * does not match the current file handle.  Under ls_lock, every layout
 * whose iomode matches the request (or any layout when the request says
 * IOMODE_ANY) is moved to reaplist.  The reaped layouts are freed after
 * cl_lock has been dropped.
 *
 * NOTE(review): the return type, the reaplist declaration, the statements
 * executed when a filter matches (presumably "continue") and the return
 * statement are absent from this listing.
 */
503 nfsd4_return_client_layouts(struct svc_rqst
*rqstp
,
504 struct nfsd4_compound_state
*cstate
,
505 struct nfsd4_layoutreturn
*lrp
)
507 struct nfs4_layout_stateid
*ls
, *n
;
508 struct nfs4_client
*clp
= cstate
->clp
;
509 struct nfs4_layout
*lp
, *t
;
512 lrp
->lrs_present
= 0;
514 spin_lock(&clp
->cl_lock
);
515 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
) {
516 if (ls
->ls_layout_type
!= lrp
->lr_layout_type
)
519 if (lrp
->lr_return_type
== RETURN_FSID
&&
520 !fh_fsid_match(&ls
->ls_stid
.sc_file
->fi_fhandle
,
521 &cstate
->current_fh
.fh_handle
))
524 spin_lock(&ls
->ls_lock
);
525 list_for_each_entry_safe(lp
, t
, &ls
->ls_layouts
, lo_perstate
) {
526 if (lrp
->lr_seg
.iomode
== IOMODE_ANY
||
527 lrp
->lr_seg
.iomode
== lp
->lo_seg
.iomode
)
528 list_move_tail(&lp
->lo_perstate
, &reaplist
);
530 spin_unlock(&ls
->ls_lock
);
532 spin_unlock(&clp
->cl_lock
);
534 nfsd4_free_layouts(&reaplist
);
/*
 * Move every layout held by @ls onto @reaplist.
 * The splice is done under ls_lock and leaves ls_layouts empty and
 * reinitialised.  Nothing is freed here; the callers in this file pass the
 * list to nfsd4_free_layouts() afterwards.
 */
539 nfsd4_return_all_layouts(struct nfs4_layout_stateid
*ls
,
540 struct list_head
*reaplist
)
542 spin_lock(&ls
->ls_lock
);
543 list_splice_init(&ls
->ls_layouts
, reaplist
);
544 spin_unlock(&ls
->ls_lock
);
/*
 * Drop every layout held by client @clp.
 * Under cl_lock, the layouts of each stateid on cl_lo_states are collected
 * on reaplist; they are freed once the lock has been released.
 * NOTE(review): the return type and the reaplist declaration are absent
 * from this listing.
 */
548 nfsd4_return_all_client_layouts(struct nfs4_client
*clp
)
550 struct nfs4_layout_stateid
*ls
, *n
;
553 spin_lock(&clp
->cl_lock
);
554 list_for_each_entry_safe(ls
, n
, &clp
->cl_lo_states
, ls_perclnt
)
555 nfsd4_return_all_layouts(ls
, &reaplist
);
556 spin_unlock(&clp
->cl_lock
);
558 nfsd4_free_layouts(&reaplist
);
/*
 * Drop every layout that client @clp holds on file @fp.
 * Under fi_lock, walks the file's fi_lo_states list and collects the
 * layouts of the stateids owned by @clp; the collected layouts are freed
 * after the lock has been released.
 * NOTE(review): the return type and the reaplist declaration are absent
 * from this listing.
 */
562 nfsd4_return_all_file_layouts(struct nfs4_client
*clp
, struct nfs4_file
*fp
)
564 struct nfs4_layout_stateid
*ls
, *n
;
567 spin_lock(&fp
->fi_lock
);
568 list_for_each_entry_safe(ls
, n
, &fp
->fi_lo_states
, ls_perfile
) {
569 if (ls
->ls_stid
.sc_client
== clp
)
570 nfsd4_return_all_layouts(ls
, &reaplist
);
572 spin_unlock(&fp
->fi_lock
);
574 nfsd4_free_layouts(&reaplist
);
/*
 * Fence a client that failed a layout recall.
 * Formats the client's address with rpc_ntop(), emits the
 * layout_recall_fail trace event, logs the "Fencing.." message and runs
 * the user-space helper /sbin/nfsd-recall-failed through
 * call_usermodehelper(), waiting for it to finish (UMH_WAIT_PROC).
 * argv[2] is the s_id of the superblock backing the stateid's file; the
 * "fence failed" message is printed with KERN_ERR.
 * NOTE(review): the return type, the rest of envp, the declarations of
 * argv and error, argv[1], the argv terminator and the condition guarding
 * the final printk are absent from this listing.
 */
578 nfsd4_cb_layout_fail(struct nfs4_layout_stateid
*ls
)
580 struct nfs4_client
*clp
= ls
->ls_stid
.sc_client
;
581 char addr_str
[INET6_ADDRSTRLEN
];
582 static char *envp
[] = {
585 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
591 rpc_ntop((struct sockaddr
*)&clp
->cl_addr
, addr_str
, sizeof(addr_str
));
593 trace_layout_recall_fail(&ls
->ls_stid
.sc_stateid
);
596 "nfsd: client %s failed to respond to layout recall. "
597 " Fencing..\n", addr_str
);
599 argv
[0] = "/sbin/nfsd-recall-failed";
601 argv
[2] = ls
->ls_file
->f_path
.mnt
->mnt_sb
->s_id
;
604 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
606 printk(KERN_ERR
"nfsd: fence failed for client %s: %d!\n",
/*
 * Completion handler for the layout recall callback (the .done hook of
 * nfsd4_cb_layout_ops).
 * Recovers the layout stateid from @cb with container_of() and switches on
 * the RPC task status:
 *  - the -NFS4ERR_NOMATCHING_LAYOUT case emits the layout_recall_done
 *    trace event;
 *  - a polling path re-arms the task with rpc_delay(task, HZ/100);
 *  - the fallback path fences the client via nfsd4_cb_layout_fail().
 * NOTE(review): the return type, the other case labels, the return values
 * and the closing line of the FIXME comment are absent from this listing.
 */
612 nfsd4_cb_layout_done(struct nfsd4_callback
*cb
, struct rpc_task
*task
)
614 struct nfs4_layout_stateid
*ls
=
615 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
618 switch (task
->tk_status
) {
621 case -NFS4ERR_NOMATCHING_LAYOUT
:
622 trace_layout_recall_done(&ls
->ls_stid
.sc_stateid
);
626 /* Poll the client until it's done with the layout */
627 /* FIXME: cap number of retries.
628 * The pnfs standard states that we need to only expire
629 * the client after at-least "lease time" .eg lease-time * 2
630 * when failing to communicate a recall
632 rpc_delay(task
, HZ
/100); /* 10 mili-seconds */
636 * Unknown error or non-responding client, we'll need to fence.
638 nfsd4_cb_layout_fail(ls
);
/*
 * Release handler for the layout recall callback (the .release hook of
 * nfsd4_cb_layout_ops).
 * Emits the layout_recall_release trace event, strips all layouts from the
 * stateid and frees them, then drops a stid reference with nfs4_put_stid();
 * nfsd4_recall_file_layout() increments sc_count before queueing this
 * callback.
 * NOTE(review): the return type and the reaplist declaration are absent
 * from this listing.
 */
644 nfsd4_cb_layout_release(struct nfsd4_callback
*cb
)
646 struct nfs4_layout_stateid
*ls
=
647 container_of(cb
, struct nfs4_layout_stateid
, ls_recall
);
650 trace_layout_recall_release(&ls
->ls_stid
.sc_stateid
);
652 nfsd4_return_all_layouts(ls
, &reaplist
);
653 nfsd4_free_layouts(&reaplist
);
654 nfs4_put_stid(&ls
->ls_stid
);
/*
 * Callback operations for layout recalls; nfsd4_alloc_layout_stateid()
 * passes this table to nfsd4_init_cb().
 */
657 static struct nfsd4_callback_ops nfsd4_cb_layout_ops
= {
658 .done
= nfsd4_cb_layout_done
,
659 .release
= nfsd4_cb_layout_release
,
/*
 * lm_break hook for layout leases.
 * Clears fl_break_time so that the generic lease code does not time the
 * lease out, then starts a layout recall on the stateid stored in
 * fl->fl_owner.
 * NOTE(review): the return type, the comment delimiters and the return
 * statement are absent from this listing.
 */
663 nfsd4_layout_lm_break(struct file_lock
*fl
)
666 * We don't want the locks code to timeout the lease for us;
667 * we'll remove it ourself if a layout isn't returned
670 fl
->fl_break_time
= 0;
671 nfsd4_recall_file_layout(fl
->fl_owner
);
/*
 * lm_change hook for layout leases.
 * The BUG_ON fires unless @arg has the F_UNLCK bits set; the call is then
 * delegated to lease_modify() with the same arguments, and its result is
 * returned.
 */
676 nfsd4_layout_lm_change(struct file_lock
*onlist
, int arg
,
677 struct list_head
*dispose
)
679 BUG_ON(!(arg
& F_UNLCK
));
680 return lease_modify(onlist
, arg
, dispose
);
/*
 * Lock-manager operations attached to layout leases;
 * nfsd4_layout_setlease() stores a pointer to this table in fl_lmops.
 */
683 static const struct lock_manager_operations nfsd4_layouts_lm_ops
= {
684 .lm_break
= nfsd4_layout_lm_break
,
685 .lm_change
= nfsd4_layout_lm_change
,
/*
 * Set up the pNFS layout state kept in this file.
 * Initialises every bucket of nfsd_devid_hash, then creates the
 * "nfs4_layout" and "nfs4_layout_stateid" slab caches.  If the second
 * cache cannot be created, the first one is destroyed again.
 * NOTE(review): the return type, the declaration of i and the return
 * statements are absent from this listing.
 */
689 nfsd4_init_pnfs(void)
693 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++)
694 INIT_LIST_HEAD(&nfsd_devid_hash
[i
]);
696 nfs4_layout_cache
= kmem_cache_create("nfs4_layout",
697 sizeof(struct nfs4_layout
), 0, 0, NULL
);
698 if (!nfs4_layout_cache
)
701 nfs4_layout_stateid_cache
= kmem_cache_create("nfs4_layout_stateid",
702 sizeof(struct nfs4_layout_stateid
), 0, 0, NULL
);
703 if (!nfs4_layout_stateid_cache
) {
704 kmem_cache_destroy(nfs4_layout_cache
);
711 nfsd4_exit_pnfs(void)
715 kmem_cache_destroy(nfs4_layout_cache
);
716 kmem_cache_destroy(nfs4_layout_stateid_cache
);
718 for (i
= 0; i
< DEVID_HASH_SIZE
; i
++) {
719 struct nfsd4_deviceid_map
*map
, *n
;
721 list_for_each_entry_safe(map
, n
, &nfsd_devid_hash
[i
], hash
)