/*
 * Device operations for the pnfs nfs4 flexfile layout driver.
 *
 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 *
 * Tao Peng <bergwolf@primarydata.com>
 */
9 #include <linux/nfs_fs.h>
10 #include <linux/vmalloc.h>
11 #include <linux/module.h>
12 #include <linux/sunrpc/addr.h>
14 #include "../internal.h"
15 #include "../nfs4session.h"
16 #include "flexfilelayout.h"
18 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
20 static unsigned int dataserver_timeo
= NFS4_DEF_DS_TIMEO
;
21 static unsigned int dataserver_retrans
= NFS4_DEF_DS_RETRANS
;
23 void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds
*mirror_ds
)
26 nfs4_put_deviceid_node(&mirror_ds
->id_node
);
29 void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds
*mirror_ds
)
31 nfs4_print_deviceid(&mirror_ds
->id_node
.deviceid
);
32 nfs4_pnfs_ds_put(mirror_ds
->ds
);
33 kfree_rcu(mirror_ds
, id_node
.rcu
);
36 /* Decode opaque device data and construct new_ds using it */
37 struct nfs4_ff_layout_ds
*
38 nfs4_ff_alloc_deviceid_node(struct nfs_server
*server
, struct pnfs_device
*pdev
,
41 struct xdr_stream stream
;
44 struct list_head dsaddrs
;
45 struct nfs4_pnfs_ds_addr
*da
;
46 struct nfs4_ff_layout_ds
*new_ds
= NULL
;
47 struct nfs4_ff_ds_version
*ds_versions
= NULL
;
53 /* set up xdr stream */
54 scratch
= alloc_page(gfp_flags
);
58 new_ds
= kzalloc(sizeof(struct nfs4_ff_layout_ds
), gfp_flags
);
62 nfs4_init_deviceid_node(&new_ds
->id_node
,
65 INIT_LIST_HEAD(&dsaddrs
);
67 xdr_init_decode_pages(&stream
, &buf
, pdev
->pages
, pdev
->pglen
);
68 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
71 p
= xdr_inline_decode(&stream
, 4);
73 goto out_err_drain_dsaddrs
;
74 mp_count
= be32_to_cpup(p
);
75 dprintk("%s: multipath ds count %d\n", __func__
, mp_count
);
77 for (i
= 0; i
< mp_count
; i
++) {
79 da
= nfs4_decode_mp_ds_addr(server
->nfs_client
->cl_net
,
82 list_add_tail(&da
->da_node
, &dsaddrs
);
84 if (list_empty(&dsaddrs
)) {
85 dprintk("%s: no suitable DS addresses found\n",
88 goto out_err_drain_dsaddrs
;
92 p
= xdr_inline_decode(&stream
, 4);
94 goto out_err_drain_dsaddrs
;
95 version_count
= be32_to_cpup(p
);
96 dprintk("%s: version count %d\n", __func__
, version_count
);
98 ds_versions
= kzalloc(version_count
* sizeof(struct nfs4_ff_ds_version
),
103 for (i
= 0; i
< version_count
; i
++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p
= xdr_inline_decode(&stream
, 20);
108 goto out_err_drain_dsaddrs
;
109 ds_versions
[i
].version
= be32_to_cpup(p
++);
110 ds_versions
[i
].minor_version
= be32_to_cpup(p
++);
111 ds_versions
[i
].rsize
= nfs_block_size(be32_to_cpup(p
++), NULL
);
112 ds_versions
[i
].wsize
= nfs_block_size(be32_to_cpup(p
++), NULL
);
113 ds_versions
[i
].tightly_coupled
= be32_to_cpup(p
);
115 if (ds_versions
[i
].rsize
> NFS_MAX_FILE_IO_SIZE
)
116 ds_versions
[i
].rsize
= NFS_MAX_FILE_IO_SIZE
;
117 if (ds_versions
[i
].wsize
> NFS_MAX_FILE_IO_SIZE
)
118 ds_versions
[i
].wsize
= NFS_MAX_FILE_IO_SIZE
;
120 if (ds_versions
[i
].version
!= 3 || ds_versions
[i
].minor_version
!= 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__
,
122 i
, ds_versions
[i
].version
,
123 ds_versions
[i
].minor_version
);
124 ret
= -EPROTONOSUPPORT
;
125 goto out_err_drain_dsaddrs
;
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__
, i
, ds_versions
[i
].version
,
130 ds_versions
[i
].minor_version
,
131 ds_versions
[i
].rsize
,
132 ds_versions
[i
].wsize
,
133 ds_versions
[i
].tightly_coupled
);
136 new_ds
->ds_versions
= ds_versions
;
137 new_ds
->ds_versions_cnt
= version_count
;
139 new_ds
->ds
= nfs4_pnfs_ds_add(&dsaddrs
, gfp_flags
);
141 goto out_err_drain_dsaddrs
;
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs
)) {
145 da
= list_first_entry(&dsaddrs
,
146 struct nfs4_pnfs_ds_addr
,
148 list_del_init(&da
->da_node
);
149 kfree(da
->da_remotestr
);
153 __free_page(scratch
);
156 out_err_drain_dsaddrs
:
157 while (!list_empty(&dsaddrs
)) {
158 da
= list_first_entry(&dsaddrs
, struct nfs4_pnfs_ds_addr
,
160 list_del_init(&da
->da_node
);
161 kfree(da
->da_remotestr
);
167 __free_page(scratch
);
171 dprintk("%s ERROR: returning %d\n", __func__
, ret
);
175 static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment
*lseg
,
176 struct nfs4_deviceid_node
*devid
)
178 nfs4_mark_deviceid_unavailable(devid
);
179 if (!ff_layout_has_available_ds(lseg
))
180 pnfs_error_mark_layout_for_return(lseg
->pls_layout
->plh_inode
,
184 static bool ff_layout_mirror_valid(struct pnfs_layout_segment
*lseg
,
185 struct nfs4_ff_layout_mirror
*mirror
)
187 if (mirror
== NULL
|| mirror
->mirror_ds
== NULL
) {
188 pnfs_error_mark_layout_for_return(lseg
->pls_layout
->plh_inode
,
192 if (mirror
->mirror_ds
->ds
== NULL
) {
193 struct nfs4_deviceid_node
*devid
;
194 devid
= &mirror
->mirror_ds
->id_node
;
195 ff_layout_mark_devid_invalid(lseg
, devid
);
202 end_offset(u64 start
, u64 len
)
207 return end
>= start
? end
: NFS4_MAX_UINT64
;
210 static void extend_ds_error(struct nfs4_ff_layout_ds_err
*err
,
211 u64 offset
, u64 length
)
215 end
= max_t(u64
, end_offset(err
->offset
, err
->length
),
216 end_offset(offset
, length
));
217 err
->offset
= min_t(u64
, err
->offset
, offset
);
218 err
->length
= end
- err
->offset
;
221 static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err
*err
, u64 offset
,
222 u64 length
, int status
, enum nfs_opnum4 opnum
,
223 nfs4_stateid
*stateid
,
224 struct nfs4_deviceid
*deviceid
)
226 return err
->status
== status
&& err
->opnum
== opnum
&&
227 nfs4_stateid_match(&err
->stateid
, stateid
) &&
228 !memcmp(&err
->deviceid
, deviceid
, sizeof(*deviceid
)) &&
229 end_offset(err
->offset
, err
->length
) >= offset
&&
230 err
->offset
<= end_offset(offset
, length
);
233 static bool merge_ds_error(struct nfs4_ff_layout_ds_err
*old
,
234 struct nfs4_ff_layout_ds_err
*new)
236 if (!ds_error_can_merge(old
, new->offset
, new->length
, new->status
,
237 new->opnum
, &new->stateid
, &new->deviceid
))
240 extend_ds_error(old
, new->offset
, new->length
);
245 ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout
*flo
,
246 struct nfs4_ff_layout_ds_err
*dserr
)
248 struct nfs4_ff_layout_ds_err
*err
;
250 list_for_each_entry(err
, &flo
->error_list
, list
) {
251 if (merge_ds_error(err
, dserr
)) {
256 list_add(&dserr
->list
, &flo
->error_list
);
261 ff_layout_update_ds_error(struct nfs4_flexfile_layout
*flo
, u64 offset
,
262 u64 length
, int status
, enum nfs_opnum4 opnum
,
263 nfs4_stateid
*stateid
, struct nfs4_deviceid
*deviceid
)
266 struct nfs4_ff_layout_ds_err
*err
;
268 list_for_each_entry(err
, &flo
->error_list
, list
) {
269 if (ds_error_can_merge(err
, offset
, length
, status
, opnum
,
270 stateid
, deviceid
)) {
272 extend_ds_error(err
, offset
, length
);
280 int ff_layout_track_ds_error(struct nfs4_flexfile_layout
*flo
,
281 struct nfs4_ff_layout_mirror
*mirror
, u64 offset
,
282 u64 length
, int status
, enum nfs_opnum4 opnum
,
285 struct nfs4_ff_layout_ds_err
*dserr
;
291 if (mirror
->mirror_ds
== NULL
)
294 spin_lock(&flo
->generic_hdr
.plh_inode
->i_lock
);
295 if (ff_layout_update_ds_error(flo
, offset
, length
, status
, opnum
,
297 &mirror
->mirror_ds
->id_node
.deviceid
)) {
298 spin_unlock(&flo
->generic_hdr
.plh_inode
->i_lock
);
301 spin_unlock(&flo
->generic_hdr
.plh_inode
->i_lock
);
302 dserr
= kmalloc(sizeof(*dserr
), gfp_flags
);
306 INIT_LIST_HEAD(&dserr
->list
);
307 dserr
->offset
= offset
;
308 dserr
->length
= length
;
309 dserr
->status
= status
;
310 dserr
->opnum
= opnum
;
311 nfs4_stateid_copy(&dserr
->stateid
, &mirror
->stateid
);
312 memcpy(&dserr
->deviceid
, &mirror
->mirror_ds
->id_node
.deviceid
,
313 NFS4_DEVICEID4_SIZE
);
315 spin_lock(&flo
->generic_hdr
.plh_inode
->i_lock
);
316 needfree
= ff_layout_add_ds_error_locked(flo
, dserr
);
317 spin_unlock(&flo
->generic_hdr
.plh_inode
->i_lock
);
324 /* currently we only support AUTH_NONE and AUTH_SYS */
325 static rpc_authflavor_t
326 nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror
*mirror
)
328 if (mirror
->uid
== (u32
)-1)
329 return RPC_AUTH_NULL
;
330 return RPC_AUTH_UNIX
;
333 /* fetch cred for NFSv3 DS */
334 static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror
*mirror
,
335 struct nfs4_pnfs_ds
*ds
)
337 if (ds
->ds_clp
&& !mirror
->cred
&&
338 mirror
->mirror_ds
->ds_versions
[0].version
== 3) {
339 struct rpc_auth
*auth
= ds
->ds_clp
->cl_rpcclient
->cl_auth
;
340 struct rpc_cred
*cred
;
341 struct auth_cred acred
= {
342 .uid
= make_kuid(&init_user_ns
, mirror
->uid
),
343 .gid
= make_kgid(&init_user_ns
, mirror
->gid
),
346 /* AUTH_NULL ignores acred */
347 cred
= auth
->au_ops
->lookup_cred(auth
, &acred
, 0);
349 dprintk("%s: lookup_cred failed with %ld\n",
350 __func__
, PTR_ERR(cred
));
351 return PTR_ERR(cred
);
353 if (cmpxchg(&mirror
->cred
, NULL
, cred
))
361 nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment
*lseg
, u32 mirror_idx
)
363 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, mirror_idx
);
364 struct nfs_fh
*fh
= NULL
;
366 if (!ff_layout_mirror_valid(lseg
, mirror
)) {
367 pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
368 __func__
, mirror_idx
);
372 /* FIXME: For now assume there is only 1 version available for the DS */
373 fh
= &mirror
->fh_versions
[0];
378 /* Upon return, either ds is connected, or ds is NULL */
379 struct nfs4_pnfs_ds
*
380 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
383 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
384 struct nfs4_pnfs_ds
*ds
= NULL
;
385 struct nfs4_deviceid_node
*devid
;
386 struct inode
*ino
= lseg
->pls_layout
->plh_inode
;
387 struct nfs_server
*s
= NFS_SERVER(ino
);
388 unsigned int max_payload
;
389 rpc_authflavor_t flavor
;
391 if (!ff_layout_mirror_valid(lseg
, mirror
)) {
392 pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
397 devid
= &mirror
->mirror_ds
->id_node
;
398 if (ff_layout_test_devid_unavailable(devid
))
401 ds
= mirror
->mirror_ds
->ds
;
402 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
405 goto out_update_creds
;
407 flavor
= nfs4_ff_layout_choose_authflavor(mirror
);
409 /* FIXME: For now we assume the server sent only one version of NFS
412 nfs4_pnfs_ds_connect(s
, ds
, devid
, dataserver_timeo
,
414 mirror
->mirror_ds
->ds_versions
[0].version
,
415 mirror
->mirror_ds
->ds_versions
[0].minor_version
,
418 /* connect success, check rsize/wsize limit */
421 nfs_block_size(rpc_max_payload(ds
->ds_clp
->cl_rpcclient
),
423 if (mirror
->mirror_ds
->ds_versions
[0].rsize
> max_payload
)
424 mirror
->mirror_ds
->ds_versions
[0].rsize
= max_payload
;
425 if (mirror
->mirror_ds
->ds_versions
[0].wsize
> max_payload
)
426 mirror
->mirror_ds
->ds_versions
[0].wsize
= max_payload
;
428 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg
->pls_layout
),
429 mirror
, lseg
->pls_range
.offset
,
430 lseg
->pls_range
.length
, NFS4ERR_NXIO
,
431 OP_ILLEGAL
, GFP_NOIO
);
433 pnfs_error_mark_layout_for_return(ino
, lseg
);
434 if (ff_layout_has_available_ds(lseg
))
435 pnfs_set_retry_layoutget(lseg
->pls_layout
);
437 pnfs_clear_retry_layoutget(lseg
->pls_layout
);
440 if (ff_layout_has_available_ds(lseg
))
441 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE
,
442 &lseg
->pls_layout
->plh_flags
);
444 pnfs_error_mark_layout_for_return(ino
, lseg
);
445 pnfs_clear_retry_layoutget(lseg
->pls_layout
);
450 if (ff_layout_update_mirror_cred(mirror
, ds
))
457 ff_layout_get_ds_cred(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
458 struct rpc_cred
*mdscred
)
460 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
461 struct rpc_cred
*cred
= ERR_PTR(-EINVAL
);
463 if (!nfs4_ff_layout_prepare_ds(lseg
, ds_idx
, true))
466 if (mirror
&& mirror
->cred
)
/*
 * Find or create a DS rpc client with the MDS server rpc client auth flavor
 * in the nfs_client cl_ds_clients list.
 */
479 nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
480 struct nfs_client
*ds_clp
, struct inode
*inode
)
482 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
484 switch (mirror
->mirror_ds
->ds_versions
[0].version
) {
486 /* For NFSv3 DS, flavor is set when creating DS connections */
487 return ds_clp
->cl_rpcclient
;
489 return nfs4_find_or_create_ds_client(ds_clp
, inode
);
495 static bool is_range_intersecting(u64 offset1
, u64 length1
,
496 u64 offset2
, u64 length2
)
498 u64 end1
= end_offset(offset1
, length1
);
499 u64 end2
= end_offset(offset2
, length2
);
501 return (end1
== NFS4_MAX_UINT64
|| end1
> offset2
) &&
502 (end2
== NFS4_MAX_UINT64
|| end2
> offset1
);
505 /* called with inode i_lock held */
506 int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout
*flo
,
507 struct xdr_stream
*xdr
, int *count
,
508 const struct pnfs_layout_range
*range
)
510 struct nfs4_ff_layout_ds_err
*err
, *n
;
513 list_for_each_entry_safe(err
, n
, &flo
->error_list
, list
) {
514 if (!is_range_intersecting(err
->offset
, err
->length
,
515 range
->offset
, range
->length
))
517 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
518 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
519 * + status(4) + opnum(4)
521 p
= xdr_reserve_space(xdr
,
522 28 + NFS4_STATEID_SIZE
+ NFS4_DEVICEID4_SIZE
);
525 p
= xdr_encode_hyper(p
, err
->offset
);
526 p
= xdr_encode_hyper(p
, err
->length
);
527 p
= xdr_encode_opaque_fixed(p
, &err
->stateid
,
530 *p
++ = cpu_to_be32(1);
531 p
= xdr_encode_opaque_fixed(p
, &err
->deviceid
,
532 NFS4_DEVICEID4_SIZE
);
533 *p
++ = cpu_to_be32(err
->status
);
534 *p
++ = cpu_to_be32(err
->opnum
);
536 list_del(&err
->list
);
537 dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
538 __func__
, err
->offset
, err
->length
, err
->status
,
546 static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
548 struct nfs4_ff_layout_mirror
*mirror
;
549 struct nfs4_deviceid_node
*devid
;
552 for (idx
= 0; idx
< FF_LAYOUT_MIRROR_COUNT(lseg
); idx
++) {
553 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
554 if (mirror
&& mirror
->mirror_ds
) {
555 devid
= &mirror
->mirror_ds
->id_node
;
556 if (!ff_layout_test_devid_unavailable(devid
))
564 static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
566 struct nfs4_ff_layout_mirror
*mirror
;
567 struct nfs4_deviceid_node
*devid
;
570 for (idx
= 0; idx
< FF_LAYOUT_MIRROR_COUNT(lseg
); idx
++) {
571 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
572 if (!mirror
|| !mirror
->mirror_ds
)
574 devid
= &mirror
->mirror_ds
->id_node
;
575 if (ff_layout_test_devid_unavailable(devid
))
579 return FF_LAYOUT_MIRROR_COUNT(lseg
) != 0;
582 bool ff_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
584 if (lseg
->pls_range
.iomode
== IOMODE_READ
)
585 return ff_read_layout_has_available_ds(lseg
);
586 /* Note: RW layout needs all mirrors available */
587 return ff_rw_layout_has_available_ds(lseg
);
590 module_param(dataserver_retrans
, uint
, 0644);
591 MODULE_PARM_DESC(dataserver_retrans
, "The number of times the NFSv4.1 client "
592 "retries a request before it attempts further "
593 " recovery action.");
594 module_param(dataserver_timeo
, uint
, 0644);
595 MODULE_PARM_DESC(dataserver_timeo
, "The time (in tenths of a second) the "
596 "NFSv4.1 client waits for a response from a "
597 " data server before it retries an NFS request.");