// SPDX-License-Identifier: GPL-2.0
/*
 * Device operations for the pnfs nfs4 file layout driver.
 *
 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 *
 * Tao Peng <bergwolf@primarydata.com>
 */
10 #include <linux/nfs_fs.h>
11 #include <linux/vmalloc.h>
12 #include <linux/module.h>
13 #include <linux/sunrpc/addr.h>
15 #include "../internal.h"
16 #include "../nfs4session.h"
17 #include "flexfilelayout.h"
19 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
 * Module-tunable RPC settings for data-server connections; exposed via the
 * module_param() declarations at the bottom of this file.
 *
 * NOTE(review): dataserver_timeo is initialised from NFS_DEF_TCP_RETRANS,
 * which by its name is a retransmit-count constant rather than a timeout
 * constant — confirm against upstream whether NFS_DEF_TCP_TIMEO was intended.
 */
static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
static unsigned int dataserver_retrans;
/* Forward declaration; the definition follows the read/rw helpers below. */
static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
26 void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds
*mirror_ds
)
28 if (!IS_ERR_OR_NULL(mirror_ds
))
29 nfs4_put_deviceid_node(&mirror_ds
->id_node
);
32 void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds
*mirror_ds
)
34 nfs4_print_deviceid(&mirror_ds
->id_node
.deviceid
);
35 nfs4_pnfs_ds_put(mirror_ds
->ds
);
36 kfree(mirror_ds
->ds_versions
);
37 kfree_rcu(mirror_ds
, id_node
.rcu
);
40 /* Decode opaque device data and construct new_ds using it */
41 struct nfs4_ff_layout_ds
*
42 nfs4_ff_alloc_deviceid_node(struct nfs_server
*server
, struct pnfs_device
*pdev
,
45 struct xdr_stream stream
;
48 struct list_head dsaddrs
;
49 struct nfs4_pnfs_ds_addr
*da
;
50 struct nfs4_ff_layout_ds
*new_ds
= NULL
;
51 struct nfs4_ff_ds_version
*ds_versions
= NULL
;
57 /* set up xdr stream */
58 scratch
= alloc_page(gfp_flags
);
62 new_ds
= kzalloc(sizeof(struct nfs4_ff_layout_ds
), gfp_flags
);
66 nfs4_init_deviceid_node(&new_ds
->id_node
,
69 INIT_LIST_HEAD(&dsaddrs
);
71 xdr_init_decode_pages(&stream
, &buf
, pdev
->pages
, pdev
->pglen
);
72 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
75 p
= xdr_inline_decode(&stream
, 4);
77 goto out_err_drain_dsaddrs
;
78 mp_count
= be32_to_cpup(p
);
79 dprintk("%s: multipath ds count %d\n", __func__
, mp_count
);
81 for (i
= 0; i
< mp_count
; i
++) {
83 da
= nfs4_decode_mp_ds_addr(server
->nfs_client
->cl_net
,
86 list_add_tail(&da
->da_node
, &dsaddrs
);
88 if (list_empty(&dsaddrs
)) {
89 dprintk("%s: no suitable DS addresses found\n",
92 goto out_err_drain_dsaddrs
;
96 p
= xdr_inline_decode(&stream
, 4);
98 goto out_err_drain_dsaddrs
;
99 version_count
= be32_to_cpup(p
);
100 dprintk("%s: version count %d\n", __func__
, version_count
);
102 ds_versions
= kcalloc(version_count
,
103 sizeof(struct nfs4_ff_ds_version
),
108 for (i
= 0; i
< version_count
; i
++) {
109 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
110 * tightly_coupled(4) */
111 p
= xdr_inline_decode(&stream
, 20);
113 goto out_err_drain_dsaddrs
;
114 ds_versions
[i
].version
= be32_to_cpup(p
++);
115 ds_versions
[i
].minor_version
= be32_to_cpup(p
++);
116 ds_versions
[i
].rsize
= nfs_block_size(be32_to_cpup(p
++), NULL
);
117 ds_versions
[i
].wsize
= nfs_block_size(be32_to_cpup(p
++), NULL
);
118 ds_versions
[i
].tightly_coupled
= be32_to_cpup(p
);
120 if (ds_versions
[i
].rsize
> NFS_MAX_FILE_IO_SIZE
)
121 ds_versions
[i
].rsize
= NFS_MAX_FILE_IO_SIZE
;
122 if (ds_versions
[i
].wsize
> NFS_MAX_FILE_IO_SIZE
)
123 ds_versions
[i
].wsize
= NFS_MAX_FILE_IO_SIZE
;
126 * check for valid major/minor combination.
127 * currently we support dataserver which talk:
128 * v3, v4.0, v4.1, v4.2
130 if (!((ds_versions
[i
].version
== 3 && ds_versions
[i
].minor_version
== 0) ||
131 (ds_versions
[i
].version
== 4 && ds_versions
[i
].minor_version
< 3))) {
132 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__
,
133 i
, ds_versions
[i
].version
,
134 ds_versions
[i
].minor_version
);
135 ret
= -EPROTONOSUPPORT
;
136 goto out_err_drain_dsaddrs
;
139 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
140 __func__
, i
, ds_versions
[i
].version
,
141 ds_versions
[i
].minor_version
,
142 ds_versions
[i
].rsize
,
143 ds_versions
[i
].wsize
,
144 ds_versions
[i
].tightly_coupled
);
147 new_ds
->ds_versions
= ds_versions
;
148 new_ds
->ds_versions_cnt
= version_count
;
150 new_ds
->ds
= nfs4_pnfs_ds_add(&dsaddrs
, gfp_flags
);
152 goto out_err_drain_dsaddrs
;
154 /* If DS was already in cache, free ds addrs */
155 while (!list_empty(&dsaddrs
)) {
156 da
= list_first_entry(&dsaddrs
,
157 struct nfs4_pnfs_ds_addr
,
159 list_del_init(&da
->da_node
);
160 kfree(da
->da_remotestr
);
164 __free_page(scratch
);
167 out_err_drain_dsaddrs
:
168 while (!list_empty(&dsaddrs
)) {
169 da
= list_first_entry(&dsaddrs
, struct nfs4_pnfs_ds_addr
,
171 list_del_init(&da
->da_node
);
172 kfree(da
->da_remotestr
);
178 __free_page(scratch
);
182 dprintk("%s ERROR: returning %d\n", __func__
, ret
);
186 static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment
*lseg
,
187 struct nfs4_deviceid_node
*devid
)
189 nfs4_delete_deviceid(devid
->ld
, devid
->nfs_client
, &devid
->deviceid
);
190 if (!ff_layout_has_available_ds(lseg
))
191 pnfs_error_mark_layout_for_return(lseg
->pls_layout
->plh_inode
,
195 static bool ff_layout_mirror_valid(struct pnfs_layout_segment
*lseg
,
196 struct nfs4_ff_layout_mirror
*mirror
,
199 if (mirror
== NULL
|| IS_ERR(mirror
->mirror_ds
))
201 if (mirror
->mirror_ds
== NULL
) {
203 struct nfs4_deviceid_node
*node
;
204 struct pnfs_layout_hdr
*lh
= lseg
->pls_layout
;
205 struct nfs4_ff_layout_ds
*mirror_ds
= ERR_PTR(-ENODEV
);
207 node
= nfs4_find_get_deviceid(NFS_SERVER(lh
->plh_inode
),
208 &mirror
->devid
, lh
->plh_lc_cred
,
211 mirror_ds
= FF_LAYOUT_MIRROR_DS(node
);
213 /* check for race with another call to this function */
214 if (cmpxchg(&mirror
->mirror_ds
, NULL
, mirror_ds
) &&
215 mirror_ds
!= ERR_PTR(-ENODEV
))
216 nfs4_put_deviceid_node(node
);
221 if (IS_ERR(mirror
->mirror_ds
))
224 if (mirror
->mirror_ds
->ds
== NULL
) {
225 struct nfs4_deviceid_node
*devid
;
226 devid
= &mirror
->mirror_ds
->id_node
;
227 ff_layout_mark_devid_invalid(lseg
, devid
);
232 pnfs_error_mark_layout_for_return(lseg
->pls_layout
->plh_inode
, lseg
);
236 static void extend_ds_error(struct nfs4_ff_layout_ds_err
*err
,
237 u64 offset
, u64 length
)
241 end
= max_t(u64
, pnfs_end_offset(err
->offset
, err
->length
),
242 pnfs_end_offset(offset
, length
));
243 err
->offset
= min_t(u64
, err
->offset
, offset
);
244 err
->length
= end
- err
->offset
;
248 ff_ds_error_match(const struct nfs4_ff_layout_ds_err
*e1
,
249 const struct nfs4_ff_layout_ds_err
*e2
)
253 if (e1
->opnum
!= e2
->opnum
)
254 return e1
->opnum
< e2
->opnum
? -1 : 1;
255 if (e1
->status
!= e2
->status
)
256 return e1
->status
< e2
->status
? -1 : 1;
257 ret
= memcmp(e1
->stateid
.data
, e2
->stateid
.data
,
258 sizeof(e1
->stateid
.data
));
261 ret
= memcmp(&e1
->deviceid
, &e2
->deviceid
, sizeof(e1
->deviceid
));
264 if (pnfs_end_offset(e1
->offset
, e1
->length
) < e2
->offset
)
266 if (e1
->offset
> pnfs_end_offset(e2
->offset
, e2
->length
))
268 /* If ranges overlap or are contiguous, they are the same */
273 ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout
*flo
,
274 struct nfs4_ff_layout_ds_err
*dserr
)
276 struct nfs4_ff_layout_ds_err
*err
, *tmp
;
277 struct list_head
*head
= &flo
->error_list
;
280 /* Do insertion sort w/ merges */
281 list_for_each_entry_safe(err
, tmp
, &flo
->error_list
, list
) {
282 match
= ff_ds_error_match(err
, dserr
);
286 /* Add entry "dserr" _before_ entry "err" */
290 /* Entries match, so merge "err" into "dserr" */
291 extend_ds_error(dserr
, err
->offset
, err
->length
);
292 list_replace(&err
->list
, &dserr
->list
);
297 list_add_tail(&dserr
->list
, head
);
300 int ff_layout_track_ds_error(struct nfs4_flexfile_layout
*flo
,
301 struct nfs4_ff_layout_mirror
*mirror
, u64 offset
,
302 u64 length
, int status
, enum nfs_opnum4 opnum
,
305 struct nfs4_ff_layout_ds_err
*dserr
;
310 if (mirror
->mirror_ds
== NULL
)
313 dserr
= kmalloc(sizeof(*dserr
), gfp_flags
);
317 INIT_LIST_HEAD(&dserr
->list
);
318 dserr
->offset
= offset
;
319 dserr
->length
= length
;
320 dserr
->status
= status
;
321 dserr
->opnum
= opnum
;
322 nfs4_stateid_copy(&dserr
->stateid
, &mirror
->stateid
);
323 memcpy(&dserr
->deviceid
, &mirror
->mirror_ds
->id_node
.deviceid
,
324 NFS4_DEVICEID4_SIZE
);
326 spin_lock(&flo
->generic_hdr
.plh_inode
->i_lock
);
327 ff_layout_add_ds_error_locked(flo
, dserr
);
328 spin_unlock(&flo
->generic_hdr
.plh_inode
->i_lock
);
333 static struct rpc_cred
*
334 ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror
*mirror
, u32 iomode
)
336 struct rpc_cred
*cred
, __rcu
**pcred
;
338 if (iomode
== IOMODE_READ
)
339 pcred
= &mirror
->ro_cred
;
341 pcred
= &mirror
->rw_cred
;
345 cred
= rcu_dereference(*pcred
);
349 cred
= get_rpccred_rcu(cred
);
356 nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment
*lseg
, u32 mirror_idx
)
358 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, mirror_idx
);
359 struct nfs_fh
*fh
= NULL
;
361 if (!ff_layout_mirror_valid(lseg
, mirror
, false)) {
362 pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
363 __func__
, mirror_idx
);
367 /* FIXME: For now assume there is only 1 version available for the DS */
368 fh
= &mirror
->fh_versions
[0];
374 nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment
*lseg
,
376 nfs4_stateid
*stateid
)
378 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, mirror_idx
);
380 if (!ff_layout_mirror_valid(lseg
, mirror
, false)) {
381 pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
382 __func__
, mirror_idx
);
386 nfs4_stateid_copy(stateid
, &mirror
->stateid
);
393 * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
394 * @lseg: the layout segment we're operating on
395 * @ds_idx: index of the DS to use
396 * @fail_return: return layout on connect failure?
398 * Try to prepare a DS connection to accept an RPC call. This involves
399 * selecting a mirror to use and connecting the client to it if it's not
402 * Since we only need a single functioning mirror to satisfy a read, we don't
403 * want to return the layout if there is one. For writes though, any down
404 * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
405 * between the two cases.
407 * Returns a pointer to a connected DS object on success or NULL on failure.
409 struct nfs4_pnfs_ds
*
410 nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
413 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
414 struct nfs4_pnfs_ds
*ds
= NULL
;
415 struct nfs4_deviceid_node
*devid
;
416 struct inode
*ino
= lseg
->pls_layout
->plh_inode
;
417 struct nfs_server
*s
= NFS_SERVER(ino
);
418 unsigned int max_payload
;
421 if (!ff_layout_mirror_valid(lseg
, mirror
, true)) {
422 pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
427 devid
= &mirror
->mirror_ds
->id_node
;
428 if (ff_layout_test_devid_unavailable(devid
))
431 ds
= mirror
->mirror_ds
->ds
;
432 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
437 /* FIXME: For now we assume the server sent only one version of NFS
440 status
= nfs4_pnfs_ds_connect(s
, ds
, devid
, dataserver_timeo
,
442 mirror
->mirror_ds
->ds_versions
[0].version
,
443 mirror
->mirror_ds
->ds_versions
[0].minor_version
);
445 /* connect success, check rsize/wsize limit */
448 nfs_block_size(rpc_max_payload(ds
->ds_clp
->cl_rpcclient
),
450 if (mirror
->mirror_ds
->ds_versions
[0].rsize
> max_payload
)
451 mirror
->mirror_ds
->ds_versions
[0].rsize
= max_payload
;
452 if (mirror
->mirror_ds
->ds_versions
[0].wsize
> max_payload
)
453 mirror
->mirror_ds
->ds_versions
[0].wsize
= max_payload
;
457 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg
->pls_layout
),
458 mirror
, lseg
->pls_range
.offset
,
459 lseg
->pls_range
.length
, NFS4ERR_NXIO
,
460 OP_ILLEGAL
, GFP_NOIO
);
461 if (fail_return
|| !ff_layout_has_available_ds(lseg
))
462 pnfs_error_mark_layout_for_return(ino
, lseg
);
469 ff_layout_get_ds_cred(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
470 struct rpc_cred
*mdscred
)
472 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
473 struct rpc_cred
*cred
;
475 if (mirror
&& !mirror
->mirror_ds
->ds_versions
[0].tightly_coupled
) {
476 cred
= ff_layout_get_mirror_cred(mirror
, lseg
->pls_range
.iomode
);
478 cred
= get_rpccred(mdscred
);
480 cred
= get_rpccred(mdscred
);
486 * Find or create a DS rpc client with th MDS server rpc client auth flavor
487 * in the nfs_client cl_ds_clients list.
490 nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment
*lseg
, u32 ds_idx
,
491 struct nfs_client
*ds_clp
, struct inode
*inode
)
493 struct nfs4_ff_layout_mirror
*mirror
= FF_LAYOUT_COMP(lseg
, ds_idx
);
495 switch (mirror
->mirror_ds
->ds_versions
[0].version
) {
497 /* For NFSv3 DS, flavor is set when creating DS connections */
498 return ds_clp
->cl_rpcclient
;
500 return nfs4_find_or_create_ds_client(ds_clp
, inode
);
506 void ff_layout_free_ds_ioerr(struct list_head
*head
)
508 struct nfs4_ff_layout_ds_err
*err
;
510 while (!list_empty(head
)) {
511 err
= list_first_entry(head
,
512 struct nfs4_ff_layout_ds_err
,
514 list_del(&err
->list
);
519 /* called with inode i_lock held */
520 int ff_layout_encode_ds_ioerr(struct xdr_stream
*xdr
, const struct list_head
*head
)
522 struct nfs4_ff_layout_ds_err
*err
;
525 list_for_each_entry(err
, head
, list
) {
526 /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
527 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
528 * + status(4) + opnum(4)
530 p
= xdr_reserve_space(xdr
,
531 28 + NFS4_STATEID_SIZE
+ NFS4_DEVICEID4_SIZE
);
534 p
= xdr_encode_hyper(p
, err
->offset
);
535 p
= xdr_encode_hyper(p
, err
->length
);
536 p
= xdr_encode_opaque_fixed(p
, &err
->stateid
,
539 *p
++ = cpu_to_be32(1);
540 p
= xdr_encode_opaque_fixed(p
, &err
->deviceid
,
541 NFS4_DEVICEID4_SIZE
);
542 *p
++ = cpu_to_be32(err
->status
);
543 *p
++ = cpu_to_be32(err
->opnum
);
544 dprintk("%s: offset %llu length %llu status %d op %d\n",
545 __func__
, err
->offset
, err
->length
, err
->status
,
553 unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr
*lo
,
554 const struct pnfs_layout_range
*range
,
555 struct list_head
*head
,
558 struct nfs4_flexfile_layout
*flo
= FF_LAYOUT_FROM_HDR(lo
);
559 struct inode
*inode
= lo
->plh_inode
;
560 struct nfs4_ff_layout_ds_err
*err
, *n
;
561 unsigned int ret
= 0;
563 spin_lock(&inode
->i_lock
);
564 list_for_each_entry_safe(err
, n
, &flo
->error_list
, list
) {
565 if (!pnfs_is_range_intersecting(err
->offset
,
566 pnfs_end_offset(err
->offset
, err
->length
),
568 pnfs_end_offset(range
->offset
, range
->length
)))
572 list_move(&err
->list
, head
);
576 spin_unlock(&inode
->i_lock
);
580 unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr
*lo
,
581 const struct pnfs_layout_range
*range
,
582 struct list_head
*head
,
587 ret
= do_layout_fetch_ds_ioerr(lo
, range
, head
, maxnum
);
588 /* If we're over the max, discard all remaining entries */
591 do_layout_fetch_ds_ioerr(lo
, range
, &discard
, -1);
592 ff_layout_free_ds_ioerr(&discard
);
597 static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
599 struct nfs4_ff_layout_mirror
*mirror
;
600 struct nfs4_deviceid_node
*devid
;
603 for (idx
= 0; idx
< FF_LAYOUT_MIRROR_COUNT(lseg
); idx
++) {
604 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
606 if (!mirror
->mirror_ds
)
608 if (IS_ERR(mirror
->mirror_ds
))
610 devid
= &mirror
->mirror_ds
->id_node
;
611 if (!ff_layout_test_devid_unavailable(devid
))
619 static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
621 struct nfs4_ff_layout_mirror
*mirror
;
622 struct nfs4_deviceid_node
*devid
;
625 for (idx
= 0; idx
< FF_LAYOUT_MIRROR_COUNT(lseg
); idx
++) {
626 mirror
= FF_LAYOUT_COMP(lseg
, idx
);
627 if (!mirror
|| IS_ERR(mirror
->mirror_ds
))
629 if (!mirror
->mirror_ds
)
631 devid
= &mirror
->mirror_ds
->id_node
;
632 if (ff_layout_test_devid_unavailable(devid
))
636 return FF_LAYOUT_MIRROR_COUNT(lseg
) != 0;
639 static bool ff_layout_has_available_ds(struct pnfs_layout_segment
*lseg
)
641 if (lseg
->pls_range
.iomode
== IOMODE_READ
)
642 return ff_read_layout_has_available_ds(lseg
);
643 /* Note: RW layout needs all mirrors available */
644 return ff_rw_layout_has_available_ds(lseg
);
647 bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment
*lseg
)
649 return ff_layout_no_fallback_to_mds(lseg
) ||
650 ff_layout_has_available_ds(lseg
);
653 bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment
*lseg
)
655 return lseg
->pls_range
.iomode
== IOMODE_RW
&&
656 ff_layout_no_read_on_rw(lseg
);
659 module_param(dataserver_retrans
, uint
, 0644);
660 MODULE_PARM_DESC(dataserver_retrans
, "The number of times the NFSv4.1 client "
661 "retries a request before it attempts further "
662 " recovery action.");
663 module_param(dataserver_timeo
, uint
, 0644);
664 MODULE_PARM_DESC(dataserver_timeo
, "The time (in tenths of a second) the "
665 "NFSv4.1 client waits for a response from a "
666 " data server before it retries an NFS request.");