// SPDX-License-Identifier: GPL-2.0-only
/*
 * Module for pnfs flexfile layout driver.
 *
 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 *
 * Tao Peng <bergwolf@primarydata.com>
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/sched/mm.h>

#include <linux/sunrpc/metrics.h>

#include "flexfilelayout.h"
#include "../nfs4session.h"
#include "../nfs4idmap.h"
#include "../internal.h"
#include "../delegation.h"
#include "../nfs4trace.h"
#include "../iostat.h"
#include "../nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20

enum nfs4_ff_op_type {
        NFS4_FF_OP_LAYOUTSTATS,
        NFS4_FF_OP_LAYOUTRETURN,
};

static unsigned short io_maxretrans;

static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
                struct nfs_pgio_header *hdr);
static int
ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
                struct nfs42_layoutstat_devinfo *devinfo,
                int dev_limit, enum nfs4_ff_op_type type);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
                const struct nfs42_layoutstat_devinfo *devinfo,
                struct nfs4_ff_layout_mirror *mirror);
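
/*
 * Layout header allocation and teardown. The flexfile layout header embeds
 * the generic pnfs layout header, so the containing structure is freed via
 * kfree_rcu() on generic_hdr.plh_rcu once the last reference is dropped.
 */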
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
        struct nfs4_flexfile_layout *ffl;

        ffl = kzalloc(sizeof(*ffl), gfp_flags);
        if (ffl) {
                pnfs_init_ds_commit_info(&ffl->commit_info);
                INIT_LIST_HEAD(&ffl->error_list);
                INIT_LIST_HEAD(&ffl->mirrors);
                ffl->last_report_time = ktime_get();
                ffl->commit_info.ops = &ff_layout_commit_ops;
                return &ffl->generic_hdr;
        } else
                return NULL;
}

static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
        struct nfs4_ff_layout_ds_err *err, *n;

        list_for_each_entry_safe(err, n, &ffl->error_list, list) {
                list_del(&err->list);
                kfree(err);
        }
        kfree_rcu(ffl, generic_hdr.plh_rcu);
}

static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return -ENOBUFS;
        stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
        memcpy(stateid->data, p, NFS4_STATEID_SIZE);
        dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
                p[0], p[1], p[2], p[3]);
        return 0;
}

static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
        if (unlikely(!p))
                return -ENOBUFS;
        memcpy(devid, p, NFS4_DEVICEID4_SIZE);
        nfs4_print_deviceid(devid);
        return 0;
}

static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                return -ENOBUFS;
        fh->size = be32_to_cpup(p++);
        if (fh->size > NFS_MAXFHSIZE) {
                printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
                       fh->size);
                return -EOVERFLOW;
        }
        /* fh.data */
        p = xdr_inline_decode(xdr, fh->size);
        if (unlikely(!p))
                return -ENOBUFS;
        memcpy(&fh->data, p, fh->size);
        dprintk("%s: fh len %d\n", __func__, fh->size);

        return 0;
}
/*
 * Currently only stringified uids and gids are accepted.
 * I.e., kerberos is not supported to the DSes, so no principals.
 *
 * That means that one common function will suffice, but when
 * principals are added, this should be split to accommodate
 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
 */
static int
decode_name(struct xdr_stream *xdr, u32 *id)
{
        __be32 *p;
        int len;

        /* opaque_length(4)*/
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                return -ENOBUFS;
        len = be32_to_cpup(p++);
        if (len < 0)
                return -EINVAL;

        dprintk("%s: len %u\n", __func__, len);

        /* opaque body */
        p = xdr_inline_decode(xdr, len);
        if (unlikely(!p))
                return -ENOBUFS;

        if (!nfs_map_string_to_numeric((char *)p, len, id))
                return -EINVAL;

        return 0;
}
static struct nfsd_file *
ff_local_open_fh(struct nfs_client *clp, const struct cred *cred,
                 struct nfs_fh *fh, fmode_t mode)
{
        if (mode & FMODE_WRITE) {
                /*
                 * Always request read and write access since this corresponds
                 * to a rw layout.
                 */
                mode |= FMODE_READ;
        }

        return nfs_local_open_fh(clp, cred, fh, mode);
}

static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
                const struct nfs4_ff_layout_mirror *m2)
{
        int i, j;

        if (m1->fh_versions_cnt != m2->fh_versions_cnt)
                return false;
        for (i = 0; i < m1->fh_versions_cnt; i++) {
                bool found_fh = false;
                for (j = 0; j < m2->fh_versions_cnt; j++) {
                        if (nfs_compare_fh(&m1->fh_versions[i],
                                        &m2->fh_versions[j]) == 0) {
                                found_fh = true;
                                break;
                        }
                }
                if (!found_fh)
                        return false;
        }
        return true;
}
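
/*
 * Mirrors that point at the same data server (same deviceid and filehandle
 * list) are shared between layout segments: ff_layout_add_mirror() either
 * links the new mirror into the per-layout list, or discards it in favour of
 * an existing entry whose refcount it takes.
 */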
static struct nfs4_ff_layout_mirror *
ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
                struct nfs4_ff_layout_mirror *mirror)
{
        struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
        struct nfs4_ff_layout_mirror *pos;
        struct inode *inode = lo->plh_inode;

        spin_lock(&inode->i_lock);
        list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
                if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
                        continue;
                if (!ff_mirror_match_fh(mirror, pos))
                        continue;
                if (refcount_inc_not_zero(&pos->ref)) {
                        spin_unlock(&inode->i_lock);
                        return pos;
                }
        }
        list_add(&mirror->mirrors, &ff_layout->mirrors);
        mirror->layout = lo;
        spin_unlock(&inode->i_lock);
        return mirror;
}

static void
ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
{
        struct inode *inode;

        if (mirror->layout == NULL)
                return;
        inode = mirror->layout->plh_inode;
        spin_lock(&inode->i_lock);
        list_del(&mirror->mirrors);
        spin_unlock(&inode->i_lock);
        mirror->layout = NULL;
}

static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
{
        struct nfs4_ff_layout_mirror *mirror;

        mirror = kzalloc(sizeof(*mirror), gfp_flags);
        if (mirror != NULL) {
                spin_lock_init(&mirror->lock);
                refcount_set(&mirror->ref, 1);
                INIT_LIST_HEAD(&mirror->mirrors);
        }
        return mirror;
}

static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
        const struct cred *cred;

        ff_layout_remove_mirror(mirror);
        kfree(mirror->fh_versions);
        cred = rcu_access_pointer(mirror->ro_cred);
        put_cred(cred);
        cred = rcu_access_pointer(mirror->rw_cred);
        put_cred(cred);
        nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
        kfree(mirror);
}

static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
{
        if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
                ff_layout_free_mirror(mirror);
}

static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
        u32 i;

        for (i = 0; i < fls->mirror_array_cnt; i++)
                ff_layout_put_mirror(fls->mirror_array[i]);
}

static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
{
        if (fls) {
                ff_layout_free_mirror_array(fls);
                kfree(fls);
        }
}
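
/*
 * Segment comparison and merge helpers used when inserting a freshly
 * returned lseg into the layout: two segments are only mergeable when they
 * reference the same mirror set.
 */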
static bool
ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
                struct pnfs_layout_segment *l2)
{
        const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
        const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
        u32 i;

        if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
                return false;
        for (i = 0; i < fl1->mirror_array_cnt; i++) {
                if (fl1->mirror_array[i] != fl2->mirror_array[i])
                        return false;
        }
        return true;
}
static bool
ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
                const struct pnfs_layout_range *l2)
{
        u64 end1, end2;

        if (l1->iomode != l2->iomode)
                return l1->iomode != IOMODE_READ;
        end1 = pnfs_calc_offset_end(l1->offset, l1->length);
        end2 = pnfs_calc_offset_end(l2->offset, l2->length);
        if (end1 < l2->offset)
                return false;
        if (end2 < l1->offset)
                return true;
        return l2->offset <= l1->offset;
}

static bool
ff_lseg_merge(struct pnfs_layout_segment *new,
                struct pnfs_layout_segment *old)
{
        u64 new_end, old_end;

        if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
                return false;
        if (new->pls_range.iomode != old->pls_range.iomode)
                return false;
        old_end = pnfs_calc_offset_end(old->pls_range.offset,
                        old->pls_range.length);
        if (old_end < new->pls_range.offset)
                return false;
        new_end = pnfs_calc_offset_end(new->pls_range.offset,
                        new->pls_range.length);
        if (new_end < old->pls_range.offset)
                return false;
        if (!ff_lseg_match_mirrors(new, old))
                return false;

        /* Mergeable: copy info from 'old' to 'new' */
        if (new_end < old_end)
                new_end = old_end;
        if (new->pls_range.offset < old->pls_range.offset)
                new->pls_range.offset = old->pls_range.offset;
        new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
                        new_end);
        if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
                set_bit(NFS_LSEG_ROC, &new->pls_flags);
        return true;
}

static void
ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg,
                struct list_head *free_me)
{
        pnfs_generic_layout_insert_lseg(lo, lseg,
                        ff_lseg_range_is_after,
                        ff_lseg_merge,
                        free_me);
}

static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
        int i, j;

        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
                        if (fls->mirror_array[i]->efficiency <
                            fls->mirror_array[j]->efficiency)
                                swap(fls->mirror_array[i],
                                     fls->mirror_array[j]);
        }
}
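
/*
 * ff_layout_alloc_lseg() decodes the ff_layout4 body returned by LAYOUTGET
 * (flex files layout, RFC 8435): a stripe unit and mirror count, then for
 * each mirror a data server count, deviceid, efficiency, stateid, filehandle
 * list and synthetic uid/gid, optionally followed by a flags word and a
 * layoutstats report interval. Failure to decode the two trailing words is
 * not treated as an error.
 */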
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                     struct nfs4_layoutget_res *lgr,
                     gfp_t gfp_flags)
{
        struct pnfs_layout_segment *ret;
        struct nfs4_ff_layout_segment *fls = NULL;
        struct xdr_stream stream;
        struct xdr_buf buf;
        struct page *scratch;
        u64 stripe_unit;
        u32 mirror_array_cnt;
        __be32 *p;
        int i, rc;

        dprintk("--> %s\n", __func__);
        scratch = alloc_page(gfp_flags);
        if (!scratch)
                return ERR_PTR(-ENOMEM);

        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
                              lgr->layoutp->len);
        xdr_set_scratch_page(&stream, scratch);

        /* stripe unit and mirror_array_cnt */
        rc = -EIO;
        p = xdr_inline_decode(&stream, 8 + 4);
        if (!p)
                goto out_err_free;

        p = xdr_decode_hyper(p, &stripe_unit);
        mirror_array_cnt = be32_to_cpup(p++);
        dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
                stripe_unit, mirror_array_cnt);

        if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
            mirror_array_cnt == 0)
                goto out_err_free;

        rc = -ENOMEM;
        fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
                        gfp_flags);
        if (!fls)
                goto out_err_free;

        fls->mirror_array_cnt = mirror_array_cnt;
        fls->stripe_unit = stripe_unit;

        for (i = 0; i < fls->mirror_array_cnt; i++) {
                struct nfs4_ff_layout_mirror *mirror;
                struct cred *kcred;
                const struct cred __rcu *cred;
                kuid_t uid;
                kgid_t gid;
                u32 ds_count, fh_count, id;
                int j;

                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                ds_count = be32_to_cpup(p);

                /* FIXME: allow for striping? */
                if (ds_count != 1)
                        goto out_err_free;

                fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
                if (fls->mirror_array[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                fls->mirror_array[i]->ds_count = ds_count;

                /* deviceid */
                rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
                if (rc)
                        goto out_err_free;

                /* efficiency */
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fls->mirror_array[i]->efficiency = be32_to_cpup(p);

                /* stateid */
                rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
                if (rc)
                        goto out_err_free;

                /* fh */
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fh_count = be32_to_cpup(p);

                fls->mirror_array[i]->fh_versions =
                        kcalloc(fh_count, sizeof(struct nfs_fh),
                                gfp_flags);
                if (fls->mirror_array[i]->fh_versions == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                for (j = 0; j < fh_count; j++) {
                        rc = decode_nfs_fh(&stream,
                                           &fls->mirror_array[i]->fh_versions[j]);
                        if (rc)
                                goto out_err_free;
                }

                fls->mirror_array[i]->fh_versions_cnt = fh_count;

                /* user */
                rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;

                uid = make_kuid(&init_user_ns, id);

                /* group */
                rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;

                gid = make_kgid(&init_user_ns, id);

                if (gfp_flags & __GFP_FS)
                        kcred = prepare_kernel_cred(&init_task);
                else {
                        unsigned int nofs_flags = memalloc_nofs_save();
                        kcred = prepare_kernel_cred(&init_task);
                        memalloc_nofs_restore(nofs_flags);
                }
                rc = -ENOMEM;
                if (!kcred)
                        goto out_err_free;
                kcred->fsuid = uid;
                kcred->fsgid = gid;
                cred = RCU_INITIALIZER(kcred);

                if (lgr->range.iomode == IOMODE_READ)
                        rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
                else
                        rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);

                mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
                if (mirror != fls->mirror_array[i]) {
                        /* swap cred ptrs so free_mirror will clean up old */
                        if (lgr->range.iomode == IOMODE_READ) {
                                cred = xchg(&mirror->ro_cred, cred);
                                rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
                        } else {
                                cred = xchg(&mirror->rw_cred, cred);
                                rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
                        }
                        ff_layout_free_mirror(fls->mirror_array[i]);
                        fls->mirror_array[i] = mirror;
                }

                dprintk("%s: iomode %s uid %u gid %u\n", __func__,
                        lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
                        from_kuid(&init_user_ns, uid),
                        from_kgid(&init_user_ns, gid));
        }

        p = xdr_inline_decode(&stream, 4);
        if (!p)
                goto out_sort_mirrors;
        fls->flags = be32_to_cpup(p);

        p = xdr_inline_decode(&stream, 4);
        if (!p)
                goto out_sort_mirrors;
        for (i = 0; i < fls->mirror_array_cnt; i++)
                fls->mirror_array[i]->report_interval = be32_to_cpup(p);

out_sort_mirrors:
        ff_layout_sort_mirrors(fls);
        ret = &fls->generic_hdr;
        dprintk("<-- %s (success)\n", __func__);
out_free_page:
        __free_page(scratch);
        return ret;
out_err_free:
        _ff_layout_free_lseg(fls);
        ret = ERR_PTR(rc);
        dprintk("<-- %s (%d)\n", __func__, rc);
        goto out_free_page;
}
static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);

        dprintk("--> %s\n", __func__);

        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_flexfile_layout *ffl;
                struct inode *inode;

                ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
                inode = ffl->generic_hdr.plh_inode;
                spin_lock(&inode->i_lock);
                pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
                spin_unlock(&inode->i_lock);
        }
        _ff_layout_free_lseg(fls);
}

static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
        /* first IO request? */
        if (atomic_inc_return(&timer->n_ops) == 1) {
                timer->start_time = now;
        }
}

static ktime_t
nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
        ktime_t start;

        if (atomic_dec_return(&timer->n_ops) < 0)
                WARN_ON_ONCE(1);

        start = timer->start_time;
        timer->start_time = now;
        return ktime_sub(now, start);
}
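
/*
 * LAYOUTSTATS accounting. nfs4_ff_layoutstat_start_io() returns true when
 * the report interval, tracked in milliseconds, has elapsed and a
 * LAYOUTSTATS report should be pushed to the MDS.
 */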
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
                            struct nfs4_ff_layoutstat *layoutstat,
                            ktime_t now)
{
        s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
        struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);

        nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
        if (!mirror->start_time)
                mirror->start_time = now;
        if (mirror->report_interval != 0)
                report_interval = (s64)mirror->report_interval * 1000LL;
        else if (layoutstats_timer != 0)
                report_interval = (s64)layoutstats_timer * 1000LL;
        if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
                        report_interval) {
                ffl->last_report_time = now;
                return true;
        }

        return false;
}

static void
nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
                __u64 requested)
{
        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;

        iostat->ops_requested++;
        iostat->bytes_requested += requested;
}

static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
                __u64 requested,
                __u64 completed,
                ktime_t time_completed,
                ktime_t time_started)
{
        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
        ktime_t completion_time = ktime_sub(time_completed, time_started);
        ktime_t timer;

        iostat->ops_completed++;
        iostat->bytes_completed += completed;
        iostat->bytes_not_delivered += requested - completed;

        timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
        iostat->total_busy_time =
                        ktime_add(iostat->total_busy_time, timer);
        iostat->aggregate_completion_time =
                        ktime_add(iostat->aggregate_completion_time,
                                        completion_time);
}
static void
nfs4_ff_layout_stat_io_start_read(struct inode *inode,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested, ktime_t now)
{
        bool report;

        spin_lock(&mirror->lock);
        report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
        nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
        spin_unlock(&mirror->lock);

        if (report)
                pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}

static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested,
                __u64 completed)
{
        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
                        requested, completed,
                        ktime_get(), task->tk_start);
        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
        spin_unlock(&mirror->lock);
}

static void
nfs4_ff_layout_stat_io_start_write(struct inode *inode,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested, ktime_t now)
{
        bool report;

        spin_lock(&mirror->lock);
        report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat, now);
        nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
        spin_unlock(&mirror->lock);

        if (report)
                pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
}

static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested,
                __u64 completed,
                enum nfs3_stable_how committed)
{
        if (committed == NFS_UNSTABLE)
                requested = completed = 0;

        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
                        requested, completed, ktime_get(), task->tk_start);
        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
        spin_unlock(&mirror->lock);
}

static void
ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
{
        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

        if (devid)
                nfs4_mark_deviceid_unavailable(devid);
}

static void
ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
{
        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

        if (devid)
                nfs4_mark_deviceid_available(devid);
}
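
/*
 * Read DS selection: walk the efficiency-sorted mirror array starting at
 * start_idx, first looking for a mirror whose deviceid has not been marked
 * unavailable, then falling back to any mirror we can connect to.
 */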
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
                             u32 start_idx, u32 *best_idx,
                             bool check_device)
{
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_pnfs_ds *ds;
        u32 idx;

        /* mirrors are initially sorted by efficiency */
        for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
                mirror = FF_LAYOUT_COMP(lseg, idx);
                ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
                if (!ds)
                        continue;

                if (check_device &&
                    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
                        continue;

                *best_idx = idx;
                return ds;
        }

        return NULL;
}

static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
                                 u32 start_idx, u32 *best_idx)
{
        return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
}

static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
                                   u32 start_idx, u32 *best_idx)
{
        return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
}

static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
                                  u32 start_idx, u32 *best_idx)
{
        struct nfs4_pnfs_ds *ds;

        ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
        if (ds)
                return ds;
        return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
}

static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
                          u32 *best_idx)
{
        struct pnfs_layout_segment *lseg = pgio->pg_lseg;
        struct nfs4_pnfs_ds *ds;

        ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
                                               best_idx);
        if (ds || !pgio->pg_mirror_idx)
                return ds;
        return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
}

static void
ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
                      struct nfs_page *req,
                      bool strict_iomode)
{
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg =
                pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
                                   req_offset(req), req->wb_bytes, IOMODE_READ,
                                   strict_iomode, nfs_io_gfp_mask());
        if (IS_ERR(pgio->pg_lseg)) {
                pgio->pg_error = PTR_ERR(pgio->pg_lseg);
                pgio->pg_lseg = NULL;
        }
}
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                       struct nfs_page *req)
{
        struct nfs_pgio_mirror *pgm;
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_pnfs_ds *ds;
        u32 ds_idx;

retry:
        pnfs_generic_pg_check_layout(pgio, req);
        /* Use full layout for now */
        if (!pgio->pg_lseg) {
                ff_layout_pg_get_read(pgio, req, false);
                if (!pgio->pg_lseg)
                        goto out_nolseg;
        }
        if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
                ff_layout_pg_get_read(pgio, req, true);
                if (!pgio->pg_lseg)
                        goto out_nolseg;
        }

        ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
        if (!ds) {
                if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
                        goto out_mds;
                pnfs_generic_pg_cleanup(pgio);
                /* Sleep for 1 second before retrying */
                ssleep(1);
                goto retry;
        }

        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
        pgm = &pgio->pg_mirrors[0];
        pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;

        pgio->pg_mirror_idx = ds_idx;

        if (NFS_SERVER(pgio->pg_inode)->flags &
                        (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
                pgio->pg_maxretrans = io_maxretrans;
        return;
out_nolseg:
        if (pgio->pg_error < 0)
                return;
out_mds:
        trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
                        0, NFS4_MAX_UINT64, IOMODE_READ,
                        NFS_I(pgio->pg_inode)->layout,
                        pgio->pg_lseg);
        pgio->pg_maxretrans = 0;
        nfs_pageio_reset_read_mds(pgio);
}

static void
ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
{
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs_pgio_mirror *pgm;
        struct nfs4_pnfs_ds *ds;
        u32 i;

retry:
        pnfs_generic_pg_check_layout(pgio, req);
        if (!pgio->pg_lseg) {
                pgio->pg_lseg =
                        pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
                                           req_offset(req), req->wb_bytes,
                                           IOMODE_RW, false, nfs_io_gfp_mask());
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
                        pgio->pg_lseg = NULL;
                        return;
                }
        }
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;

        /* Use a direct mapping of ds_idx to pgio mirror_idx */
        if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
                goto out_eagain;

        for (i = 0; i < pgio->pg_mirror_count; i++) {
                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
                if (!ds) {
                        if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
                                goto out_mds;
                        pnfs_generic_pg_cleanup(pgio);
                        /* Sleep for 1 second before retrying */
                        ssleep(1);
                        goto retry;
                }
                pgm = &pgio->pg_mirrors[i];
                pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
        }

        if (NFS_SERVER(pgio->pg_inode)->flags &
                        (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
                pgio->pg_maxretrans = io_maxretrans;
        return;
out_eagain:
        pnfs_generic_pg_cleanup(pgio);
        pgio->pg_error = -EAGAIN;
        return;
out_mds:
        trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
                        0, NFS4_MAX_UINT64, IOMODE_RW,
                        NFS_I(pgio->pg_inode)->layout,
                        pgio->pg_lseg);
        pgio->pg_maxretrans = 0;
        nfs_pageio_reset_write_mds(pgio);
        pgio->pg_error = -EAGAIN;
}

static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                                    struct nfs_page *req)
{
        if (!pgio->pg_lseg) {
                pgio->pg_lseg =
                        pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
                                           req_offset(req), req->wb_bytes,
                                           IOMODE_RW, false, nfs_io_gfp_mask());
                if (IS_ERR(pgio->pg_lseg)) {
                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
                        pgio->pg_lseg = NULL;
                        goto out;
                }
        }
        if (pgio->pg_lseg)
                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);

        trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
                        0, NFS4_MAX_UINT64, IOMODE_RW,
                        NFS_I(pgio->pg_inode)->layout,
                        pgio->pg_lseg);
        /* no lseg means that pnfs is not in use, so no mirroring here */
        nfs_pageio_reset_write_mds(pgio);
out:
        return 1;
}

static u32
ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
{
        u32 old = desc->pg_mirror_idx;

        desc->pg_mirror_idx = idx;
        return old;
}

static struct nfs_pgio_mirror *
ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
{
        return &desc->pg_mirrors[idx];
}

static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
        .pg_init = ff_layout_pg_init_read,
        .pg_test = pnfs_generic_pg_test,
        .pg_doio = pnfs_generic_pg_readpages,
        .pg_cleanup = pnfs_generic_pg_cleanup,
};

static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
        .pg_init = ff_layout_pg_init_write,
        .pg_test = pnfs_generic_pg_test,
        .pg_doio = pnfs_generic_pg_writepages,
        .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
        .pg_cleanup = pnfs_generic_pg_cleanup,
        .pg_get_mirror = ff_layout_pg_get_mirror_write,
        .pg_set_mirror = ff_layout_pg_set_mirror_write,
};
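
/*
 * I/O resend paths: a failed request is either rescheduled through pNFS
 * (possibly via another mirror for reads) or redirected to the MDS after
 * marking the layout for return.
 */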
static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
{
        struct rpc_task *task = &hdr->task;

        pnfs_layoutcommit_inode(hdr->inode, false);

        if (retry_pnfs) {
                dprintk("%s Reset task %5u for i/o through pNFS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                hdr->completion_ops->reschedule_io(hdr);
                return;
        }

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                trace_pnfs_mds_fallback_write_done(hdr->inode,
                                hdr->args.offset, hdr->args.count,
                                IOMODE_RW, NFS_I(hdr->inode)->layout,
                                hdr->lseg);
                task->tk_status = pnfs_write_done_resend_to_mds(hdr);
        }
}

static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
        u32 idx = hdr->pgio_mirror_idx + 1;
        u32 new_idx = 0;

        if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
                ff_layout_send_layouterror(hdr->lseg);
        else
                pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
        pnfs_read_resend_pnfs(hdr, new_idx);
}

static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
{
        struct rpc_task *task = &hdr->task;

        pnfs_layoutcommit_inode(hdr->inode, false);
        pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                trace_pnfs_mds_fallback_read_done(hdr->inode,
                                hdr->args.offset, hdr->args.count,
                                IOMODE_READ, NFS_I(hdr->inode)->layout,
                                hdr->lseg);
                task->tk_status = pnfs_read_done_resend_to_mds(hdr);
        }
}
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
                                           struct nfs4_state *state,
                                           struct nfs_client *clp,
                                           struct pnfs_layout_segment *lseg,
                                           u32 idx)
{
        struct pnfs_layout_hdr *lo = lseg->pls_layout;
        struct inode *inode = lo->plh_inode;
        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
        struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;

        switch (task->tk_status) {
        case -NFS4ERR_BADSESSION:
        case -NFS4ERR_BADSLOT:
        case -NFS4ERR_BAD_HIGH_SLOT:
        case -NFS4ERR_DEADSESSION:
        case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
        case -NFS4ERR_SEQ_FALSE_RETRY:
        case -NFS4ERR_SEQ_MISORDERED:
                dprintk("%s ERROR %d, Reset session. Exchangeid "
                        "flags 0x%x\n", __func__, task->tk_status,
                        clp->cl_exchange_flags);
                nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
                break;
        case -NFS4ERR_DELAY:
        case -NFS4ERR_GRACE:
                rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
                break;
        case -NFS4ERR_RETRY_UNCACHED_REP:
                break;
        /* Invalidate Layout errors */
        case -NFS4ERR_PNFS_NO_LAYOUT:
        case -ESTALE:           /* mapped NFS4ERR_STALE */
        case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
        case -EISDIR:           /* mapped NFS4ERR_ISDIR */
        case -NFS4ERR_FHEXPIRED:
        case -NFS4ERR_WRONG_TYPE:
                dprintk("%s Invalid layout error %d\n", __func__,
                        task->tk_status);
                /*
                 * Destroy layout so new i/o will get a new layout.
                 * Layout will not be destroyed until all current lseg
                 * references are put. Mark layout as invalid to resend failed
                 * i/o and all i/o waiting on the slot table to the MDS until
                 * layout is destroyed and a new valid layout is obtained.
                 */
                pnfs_destroy_layout(NFS_I(inode));
                rpc_wake_up(&tbl->slot_tbl_waitq);
                goto reset;
        /* RPC connection errors */
        case -ECONNREFUSED:
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
        case -EIO:
        case -ETIMEDOUT:
        case -EPIPE:
        case -EPROTO:
        case -ENODEV:
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_delete_deviceid(devid->ld, devid->nfs_client,
                                &devid->deviceid);
                rpc_wake_up(&tbl->slot_tbl_waitq);
                fallthrough;
        default:
                if (ff_layout_avoid_mds_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
reset:
                dprintk("%s Retry through MDS. Error %d\n", __func__,
                        task->tk_status);
                return -NFS4ERR_RESET_TO_MDS;
        }
        task->tk_status = 0;
        return 0;
}

/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
                                           struct pnfs_layout_segment *lseg,
                                           u32 idx)
{
        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

        switch (task->tk_status) {
        /* File access problems. Don't mark the device as unavailable */
        case -EACCES:
        case -ESTALE:
        case -EISDIR:
        case -EBADHANDLE:
        case -ELOOP:
        case -ENOSPC:
                break;
        case -EJUKEBOX:
                nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
                goto out_retry;
        default:
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_delete_deviceid(devid->ld, devid->nfs_client,
                                &devid->deviceid);
        }
        /* FIXME: Need to prevent infinite looping here. */
        return -NFS4ERR_RESET_TO_PNFS;
out_retry:
        task->tk_status = 0;
        rpc_restart_call_prepare(task);
        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
        return -EAGAIN;
}

static int ff_layout_async_handle_error(struct rpc_task *task,
                                        struct nfs4_state *state,
                                        struct nfs_client *clp,
                                        struct pnfs_layout_segment *lseg,
                                        u32 idx)
{
        int vers = clp->cl_nfs_mod->rpc_vers->number;

        if (task->tk_status >= 0) {
                ff_layout_mark_ds_reachable(lseg, idx);
                return 0;
        }

        /* Handle the case of an invalid layout segment */
        if (!pnfs_is_valid_lseg(lseg))
                return -NFS4ERR_RESET_TO_PNFS;

        switch (vers) {
        case 3:
                return ff_layout_async_handle_error_v3(task, lseg, idx);
        case 4:
                return ff_layout_async_handle_error_v4(task, state, clp,
                                                       lseg, idx);
        default:
                /* should never happen */
                WARN_ON_ONCE(1);
                return 0;
        }
}
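
/*
 * Map transport-level errno values to NFS4ERR_NXIO/NFS4ERR_ACCESS and
 * record them for later LAYOUTERROR/LAYOUTRETURN reporting; an NXIO error
 * additionally marks the deviceid unavailable.
 */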
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                                        u32 idx, u64 offset, u64 length,
                                        u32 *op_status, int opnum, int error)
{
        struct nfs4_ff_layout_mirror *mirror;
        u32 status = *op_status;
        int err;

        if (status == 0) {
                switch (error) {
                case -ETIMEDOUT:
                case -EPFNOSUPPORT:
                case -EPROTONOSUPPORT:
                case -EOPNOTSUPP:
                case -EINVAL:
                case -ECONNREFUSED:
                case -ECONNRESET:
                case -EHOSTDOWN:
                case -EHOSTUNREACH:
                case -ENETUNREACH:
                case -EADDRINUSE:
                case -ENOBUFS:
                case -EPIPE:
                case -EPERM:
                case -EPROTO:
                case -ENODEV:
                        *op_status = status = NFS4ERR_NXIO;
                        break;
                case -EACCES:
                        *op_status = status = NFS4ERR_ACCESS;
                        break;
                default:
                        return;
                }
        }

        mirror = FF_LAYOUT_COMP(lseg, idx);
        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
                                       mirror, offset, length, status, opnum,
                                       nfs_io_gfp_mask());

        switch (status) {
        case NFS4ERR_DELAY:
        case NFS4ERR_GRACE:
                break;
        case NFS4ERR_NXIO:
                ff_layout_mark_ds_unreachable(lseg, idx);
                /*
                 * Don't return the layout if this is a read and we still
                 * have layouts to try
                 */
                if (opnum == OP_READ)
                        break;
                fallthrough;
        default:
                pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
                                                  lseg);
        }

        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
/* NFS_PROTO call done callback routines */
static int ff_layout_read_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
{
        int err;

        if (task->tk_status < 0) {
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
                                            &hdr->res.op_status, OP_READ,
                                            task->tk_status);
                trace_ff_layout_read_error(hdr);
        }

        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);

        trace_nfs4_pnfs_read(hdr, err);
        clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
        clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
                set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
                set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
                return task->tk_status;
        case -EAGAIN:
                goto out_eagain;
        }

        return 0;
out_eagain:
        rpc_restart_call_prepare(task);
        return -EAGAIN;
}

static bool
ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
{
        return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
}

/*
 * We reference the rpc_cred of the first WRITE that triggers the need for
 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
 * rfc5661 is not clear about which credential should be used.
 *
 * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
 * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
 * we always send layoutcommit after DS writes.
 */
static void
ff_layout_set_layoutcommit(struct inode *inode,
                struct pnfs_layout_segment *lseg,
                loff_t end_offset)
{
        if (!ff_layout_need_layoutcommit(lseg))
                return;

        pnfs_set_layoutcommit(inode, lseg, end_offset);
        dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
                (unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
                struct nfs_pgio_header *hdr)
{
        if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
                return;
        nfs4_ff_layout_stat_io_start_read(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count,
                        task->tk_start);
}

static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
                struct nfs_pgio_header *hdr)
{
        if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
                return;
        nfs4_ff_layout_stat_io_end_read(task,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count,
                        hdr->res.count);
        set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}

static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
{
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
        }
        if (!pnfs_is_valid_lseg(hdr->lseg)) {
                rpc_exit(task, -EAGAIN);
                return -EAGAIN;
        }

        ff_layout_read_record_layoutstats_start(task, hdr);
        return 0;
}

/*
 * Call ops for the async read/write cases
 * In the case of dense layouts, the offset needs to be reset to its
 * original value.
 */
static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_read_prepare_common(task, hdr))
                return;

        rpc_call_start(task);
}

static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (nfs4_setup_sequence(hdr->ds_clp,
                                &hdr->args.seq_args,
                                &hdr->res.seq_res,
                                task))
                return;

        ff_layout_read_prepare_common(task, hdr);
}

static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
                return;
        }

        /* Note this may cause RPC to be resent */
        hdr->mds_ops->rpc_call_done(task, hdr);
}

static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        ff_layout_read_record_layoutstats_done(task, hdr);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}

static void ff_layout_read_release(void *data)
{
        struct nfs_pgio_header *hdr = data;

        ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
        if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
                ff_layout_resend_pnfs_read(hdr);
        else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
                ff_layout_reset_read(hdr);
        pnfs_generic_rw_release(data);
}
static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
{
        loff_t end_offs = 0;
        int err;

        if (task->tk_status < 0) {
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
                                            &hdr->res.op_status, OP_WRITE,
                                            task->tk_status);
                trace_ff_layout_write_error(hdr);
        }

        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);

        trace_nfs4_pnfs_write(hdr, err);
        clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
        clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
                set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
                set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
                return task->tk_status;
        case -EAGAIN:
                return -EAGAIN;
        }

        if (hdr->res.verf->committed == NFS_FILE_SYNC ||
            hdr->res.verf->committed == NFS_DATA_SYNC)
                end_offs = hdr->mds_offset + (loff_t)hdr->res.count;

        /* Note: if the write is unstable, don't set end_offs until commit */
        ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);

        /* zero out fattr since we don't care DS attr at all */
        hdr->fattr.valid = 0;
        if (task->tk_status >= 0)
                nfs_writeback_update_inode(hdr);

        return 0;
}

static int ff_layout_commit_done_cb(struct rpc_task *task,
                                     struct nfs_commit_data *data)
{
        int err;

        if (task->tk_status < 0) {
                ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
                                            data->args.offset, data->args.count,
                                            &data->res.op_status, OP_COMMIT,
                                            task->tk_status);
                trace_ff_layout_commit_error(data);
        }

        err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
                                           data->lseg, data->ds_commit_index);

        trace_nfs4_pnfs_commit_ds(data, err);
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -NFS4ERR_RESET_TO_MDS:
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }

        ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);

        return 0;
}
static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
                struct nfs_pgio_header *hdr)
{
        if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
                return;
        nfs4_ff_layout_stat_io_start_write(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count,
                        task->tk_start);
}

static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
                struct nfs_pgio_header *hdr)
{
        if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
                return;
        nfs4_ff_layout_stat_io_end_write(task,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count, hdr->res.count,
                        hdr->res.verf->committed);
        set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
}

static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
{
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
        }
        if (!pnfs_is_valid_lseg(hdr->lseg)) {
                rpc_exit(task, -EAGAIN);
                return -EAGAIN;
        }

        ff_layout_write_record_layoutstats_start(task, hdr);
        return 0;
}

static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_write_prepare_common(task, hdr))
                return;

        rpc_call_start(task);
}

static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (nfs4_setup_sequence(hdr->ds_clp,
                                &hdr->args.seq_args,
                                &hdr->res.seq_res,
                                task))
                return;

        ff_layout_write_prepare_common(task, hdr);
}

static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
                return;
        }

        /* Note this may cause RPC to be resent */
        hdr->mds_ops->rpc_call_done(task, hdr);
}

static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        ff_layout_write_record_layoutstats_done(task, hdr);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}

static void ff_layout_write_release(void *data)
{
        struct nfs_pgio_header *hdr = data;

        ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
        if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
                ff_layout_send_layouterror(hdr->lseg);
                ff_layout_reset_write(hdr, true);
        } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
                ff_layout_reset_write(hdr, false);
        pnfs_generic_rw_release(data);
}
static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
                struct nfs_commit_data *cdata)
{
        if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
                return;
        nfs4_ff_layout_stat_io_start_write(cdata->inode,
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
                        0, task->tk_start);
}

static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
                struct nfs_commit_data *cdata)
{
        struct nfs_page *req;
        __u64 count = 0;

        if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
                return;

        if (task->tk_status == 0) {
                list_for_each_entry(req, &cdata->pages, wb_list)
                        count += req->wb_bytes;
        }
        nfs4_ff_layout_stat_io_end_write(task,
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
                        count, count, NFS_FILE_SYNC);
        set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
}

static int ff_layout_commit_prepare_common(struct rpc_task *task,
                                           struct nfs_commit_data *cdata)
{
        if (!pnfs_is_valid_lseg(cdata->lseg)) {
                rpc_exit(task, -EAGAIN);
                return -EAGAIN;
        }

        ff_layout_commit_record_layoutstats_start(task, cdata);
        return 0;
}

static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
        if (ff_layout_commit_prepare_common(task, data))
                return;

        rpc_call_start(task);
}

static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *wdata = data;

        if (nfs4_setup_sequence(wdata->ds_clp,
                                &wdata->args.seq_args,
                                &wdata->res.seq_res,
                                task))
                return;
        ff_layout_commit_prepare_common(task, data);
}

static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
        pnfs_generic_write_commit_done(task, data);
}

static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *cdata = data;

        ff_layout_commit_record_layoutstats_done(task, cdata);
        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}

static void ff_layout_commit_release(void *data)
{
        struct nfs_commit_data *cdata = data;

        ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
        pnfs_generic_commit_release(data);
}
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_read_prepare_v3,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
        .rpc_release = ff_layout_read_release,
};

static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_read_prepare_v4,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
        .rpc_release = ff_layout_read_release,
};

static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_write_prepare_v3,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
        .rpc_release = ff_layout_write_release,
};

static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_write_prepare_v4,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
        .rpc_release = ff_layout_write_release,
};

static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = ff_layout_commit_release,
};

static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = ff_layout_commit_release,
};
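
/*
 * The v3 and v4 call-op tables above differ only in rpc_call_prepare: the
 * v4 variants must drive the NFSv4.1 session sequencing before the call is
 * started.
 */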
static enum pnfs_try_status
ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
{
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
        struct nfsd_file *localio;
        struct nfs4_ff_layout_mirror *mirror;
        const struct cred *ds_cred;
        loff_t offset = hdr->args.offset;
        u32 idx = hdr->pgio_mirror_idx;
        int vers;
        struct nfs_fh *fh;

        dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
                __func__, hdr->inode->i_ino,
                hdr->args.pgbase, (size_t)hdr->args.count, offset);

        mirror = FF_LAYOUT_COMP(lseg, idx);
        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
        if (!ds)
                goto out_failed;

        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
                                                   hdr->inode);
        if (IS_ERR(ds_clnt))
                goto out_failed;

        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
        if (!ds_cred)
                goto out_failed;

        vers = nfs4_ff_layout_ds_version(mirror);

        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
                ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);

        hdr->pgio_done_cb = ff_layout_read_done_cb;
        refcount_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
        fh = nfs4_ff_layout_select_ds_fh(mirror);
        if (fh)
                hdr->args.fh = fh;

        nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);

        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
         */
        hdr->args.offset = offset;
        hdr->mds_offset = offset;

        /* Start IO accounting for local read */
        localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, FMODE_READ);
        if (localio) {
                hdr->task.tk_start = ktime_get();
                ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
        }

        /* Perform an asynchronous read to ds */
        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
                          vers == 3 ? &ff_layout_read_call_ops_v3 :
                                      &ff_layout_read_call_ops_v4,
                          0, RPC_TASK_SOFTCONN, localio);
        put_cred(ds_cred);
        return PNFS_ATTEMPTED;

out_failed:
        if (ff_layout_avoid_mds_available_ds(lseg))
                return PNFS_TRY_AGAIN;
        trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
                        hdr->args.offset, hdr->args.count,
                        IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
        return PNFS_NOT_ATTEMPTED;
}
/* Perform async writes. */
static enum pnfs_try_status
ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
{
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
        struct nfsd_file *localio;
        struct nfs4_ff_layout_mirror *mirror;
        const struct cred *ds_cred;
        loff_t offset = hdr->args.offset;
        int vers;
        struct nfs_fh *fh;
        u32 idx = hdr->pgio_mirror_idx;

        mirror = FF_LAYOUT_COMP(lseg, idx);
        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
        if (!ds)
                goto out_failed;

        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
                                                   hdr->inode);
        if (IS_ERR(ds_clnt))
                goto out_failed;

        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
        if (!ds_cred)
                goto out_failed;

        vers = nfs4_ff_layout_ds_version(mirror);

        dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
                __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
                offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
                vers);

        hdr->pgio_done_cb = ff_layout_write_done_cb;
        refcount_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
        hdr->ds_commit_idx = idx;
        fh = nfs4_ff_layout_select_ds_fh(mirror);
        if (fh)
                hdr->args.fh = fh;

        nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);

        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
         */
        hdr->args.offset = offset;

        /* Start IO accounting for local write */
        localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
                                   FMODE_READ|FMODE_WRITE);
        if (localio) {
                hdr->task.tk_start = ktime_get();
                ff_layout_write_record_layoutstats_start(&hdr->task, hdr);
        }

        /* Perform an asynchronous write */
        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
                          vers == 3 ? &ff_layout_write_call_ops_v3 :
                                      &ff_layout_write_call_ops_v4,
                          sync, RPC_TASK_SOFTCONN, localio);
        put_cred(ds_cred);
        return PNFS_ATTEMPTED;

out_failed:
        if (ff_layout_avoid_mds_available_ds(lseg))
                return PNFS_TRY_AGAIN;
        trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
                        hdr->args.offset, hdr->args.count,
                        IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
        return PNFS_NOT_ATTEMPTED;
}
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
{
        return i;
}

static struct nfs_fh *
select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
{
        struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);

        /* FIXME: Assume that there is only one NFS version available
         * for the fh.
         */
        return &flseg->mirror_array[i]->fh_versions[0];
}

static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
{
        struct pnfs_layout_segment *lseg = data->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
        struct nfsd_file *localio;
        struct nfs4_ff_layout_mirror *mirror;
        const struct cred *ds_cred;
        u32 idx;
        int vers, ret;
        struct nfs_fh *fh;

        if (!lseg || !(pnfs_is_valid_lseg(lseg) ||
            test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
                goto out_err;

        idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
        mirror = FF_LAYOUT_COMP(lseg, idx);
        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
        if (!ds)
                goto out_err;

        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
                                                   data->inode);
        if (IS_ERR(ds_clnt))
                goto out_err;

        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
        if (!ds_cred)
                goto out_err;

        vers = nfs4_ff_layout_ds_version(mirror);

        dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
                data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
                vers);
        data->commit_done_cb = ff_layout_commit_done_cb;
        data->cred = ds_cred;
        refcount_inc(&ds->ds_clp->cl_count);
        data->ds_clp = ds->ds_clp;
        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
        if (fh)
                data->args.fh = fh;

        /* Start IO accounting for local commit */
        localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
                                   FMODE_READ|FMODE_WRITE);
        if (localio) {
                data->task.tk_start = ktime_get();
                ff_layout_commit_record_layoutstats_start(&data->task, data);
        }

        ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                  vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                              &ff_layout_commit_call_ops_v4,
                                  how, RPC_TASK_SOFTCONN, localio);
        put_cred(ds_cred);
        return ret;
out_err:
        pnfs_generic_prepare_to_resend_writes(data);
        pnfs_generic_commit_release(data);
        return -EAGAIN;
}

static int
ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                           int how, struct nfs_commit_info *cinfo)
{
        return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
                                            ff_layout_initiate_commit);
}
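
/*
 * RPC task matching used by ff_layout_cancel_io() below to single out
 * in-flight reads, writes and commits that reference a given lseg.
 */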
static bool ff_layout_match_rw(const struct rpc_task *task,
                               const struct nfs_pgio_header *hdr,
                               const struct pnfs_layout_segment *lseg)
{
        return hdr->lseg == lseg;
}

static bool ff_layout_match_commit(const struct rpc_task *task,
                                   const struct nfs_commit_data *cdata,
                                   const struct pnfs_layout_segment *lseg)
{
        return cdata->lseg == lseg;
}

static bool ff_layout_match_io(const struct rpc_task *task, const void *data)
{
        const struct rpc_call_ops *ops = task->tk_ops;

        if (ops == &ff_layout_read_call_ops_v3 ||
            ops == &ff_layout_read_call_ops_v4 ||
            ops == &ff_layout_write_call_ops_v3 ||
            ops == &ff_layout_write_call_ops_v4)
                return ff_layout_match_rw(task, task->tk_calldata, data);
        if (ops == &ff_layout_commit_call_ops_v3 ||
            ops == &ff_layout_commit_call_ops_v4)
                return ff_layout_match_commit(task, task->tk_calldata, data);
        return false;
}

static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
{
        struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_ff_layout_ds *mirror_ds;
        struct nfs4_pnfs_ds *ds;
        struct nfs_client *ds_clp;
        struct rpc_clnt *clnt;
        u32 idx;

        for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
                mirror = flseg->mirror_array[idx];
                mirror_ds = mirror->mirror_ds;
                if (IS_ERR_OR_NULL(mirror_ds))
                        continue;
                ds = mirror->mirror_ds->ds;
                if (!ds)
                        continue;
                ds_clp = ds->ds_clp;
                if (!ds_clp)
                        continue;
                clnt = ds_clp->cl_rpcclient;
                if (!clnt)
                        continue;

                if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg))
                        continue;
                rpc_clnt_disconnect(clnt);
        }
}
static struct pnfs_ds_commit_info *
ff_layout_get_ds_info(struct inode *inode)
{
        struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;

        if (layout == NULL)
                return NULL;

        return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
}

static void
ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
                struct pnfs_layout_segment *lseg)
{
        struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
        struct inode *inode = lseg->pls_layout->plh_inode;
        struct pnfs_commit_array *array, *new;

        new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
                                      nfs_io_gfp_mask());
        if (new) {
                spin_lock(&inode->i_lock);
                array = pnfs_add_commit_array(fl_cinfo, new, lseg);
                spin_unlock(&inode->i_lock);
                if (array != new)
                        pnfs_free_commit_array(new);
        }
}

static void
ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
                struct inode *inode)
{
        spin_lock(&inode->i_lock);
        pnfs_generic_ds_cinfo_destroy(fl_cinfo);
        spin_unlock(&inode->i_lock);
}

static void
ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
        nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
                                                  id_node));
}

static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
                                  const struct nfs4_layoutreturn_args *args,
                                  const struct nfs4_flexfile_layoutreturn_args *ff_args)
{
        __be32 *start;

        start = xdr_reserve_space(xdr, 4);
        if (unlikely(!start))
                return -E2BIG;

        *start = cpu_to_be32(ff_args->num_errors);
        /* This assume we always return _ALL_ layouts */
        return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
}
static void
ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
                                const nfs4_stateid *stateid,
                                const struct nfs42_layoutstat_devinfo *devinfo)
{
        __be32 *p;

        p = xdr_reserve_space(xdr, 8 + 8);
        p = xdr_encode_hyper(p, devinfo->offset);
        p = xdr_encode_hyper(p, devinfo->length);
        encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
        p = xdr_reserve_space(xdr, 4*8);
        p = xdr_encode_hyper(p, devinfo->read_count);
        p = xdr_encode_hyper(p, devinfo->read_bytes);
        p = xdr_encode_hyper(p, devinfo->write_count);
        p = xdr_encode_hyper(p, devinfo->write_bytes);
        encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
}

static void
ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
                            const nfs4_stateid *stateid,
                            const struct nfs42_layoutstat_devinfo *devinfo)
{
        ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
        ff_layout_encode_ff_layoutupdate(xdr, devinfo,
                        devinfo->ld_private.data);
}

/* report nothing for now */
static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
                                           const struct nfs4_layoutreturn_args *args,
                                           struct nfs4_flexfile_layoutreturn_args *ff_args)
{
        __be32 *p;
        int i;

        p = xdr_reserve_space(xdr, 4);
        *p = cpu_to_be32(ff_args->num_dev);
        for (i = 0; i < ff_args->num_dev; i++)
                ff_layout_encode_ff_iostat(xdr,
                                &args->layout->plh_stateid,
                                &ff_args->devinfo[i]);
}

static void
ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
                unsigned int num_entries)
{
        unsigned int i;

        for (i = 0; i < num_entries; i++) {
                if (!devinfo[i].ld_private.ops)
                        continue;
                if (!devinfo[i].ld_private.ops->free)
                        continue;
                devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
        }
}

static struct nfs4_deviceid_node *
ff_layout_alloc_deviceid_node(struct nfs_server *server,
                              struct pnfs_device *pdev, gfp_t gfp_flags)
{
        struct nfs4_ff_layout_ds *dsaddr;

        dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
        if (!dsaddr)
                return NULL;
        return &dsaddr->id_node;
}
2204 ff_layout_encode_layoutreturn(struct xdr_stream
*xdr
,
2205 const void *voidargs
,
2206 const struct nfs4_xdr_opaque_data
*ff_opaque
)
2208 const struct nfs4_layoutreturn_args
*args
= voidargs
;
2209 struct nfs4_flexfile_layoutreturn_args
*ff_args
= ff_opaque
->data
;
2210 struct xdr_buf tmp_buf
= {
2213 .iov_base
= page_address(ff_args
->pages
[0]),
2216 .buflen
= PAGE_SIZE
,
2218 struct xdr_stream tmp_xdr
;
2221 dprintk("%s: Begin\n", __func__
);
2223 xdr_init_encode(&tmp_xdr
, &tmp_buf
, NULL
, NULL
);
2225 ff_layout_encode_ioerr(&tmp_xdr
, args
, ff_args
);
2226 ff_layout_encode_iostats_array(&tmp_xdr
, args
, ff_args
);
2228 start
= xdr_reserve_space(xdr
, 4);
2229 *start
= cpu_to_be32(tmp_buf
.len
);
2230 xdr_write_pages(xdr
, ff_args
->pages
, 0, tmp_buf
.len
);
2232 dprintk("%s: Return\n", __func__
);
2236 ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data
*args
)
2238 struct nfs4_flexfile_layoutreturn_args
*ff_args
;
2242 ff_args
= args
->data
;
2245 ff_layout_free_ds_ioerr(&ff_args
->errors
);
2246 ff_layout_free_iostats_array(ff_args
->devinfo
, ff_args
->num_dev
);
2248 put_page(ff_args
->pages
[0]);
2252 static const struct nfs4_xdr_opaque_ops layoutreturn_ops
= {
2253 .encode
= ff_layout_encode_layoutreturn
,
2254 .free
= ff_layout_free_layoutreturn
,
static int
ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
{
	struct nfs4_flexfile_layoutreturn_args *ff_args;
	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);

	ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
	if (!ff_args)
		goto out_nomem;
	ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
	if (!ff_args->pages[0])
		goto out_nomem_free;

	INIT_LIST_HEAD(&ff_args->errors);
	ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
			&args->range, &ff_args->errors,
			FF_LAYOUTRETURN_MAXERR);

	spin_lock(&args->inode->i_lock);
	ff_args->num_dev = ff_layout_mirror_prepare_stats(
		&ff_layout->generic_hdr, &ff_args->devinfo[0],
		ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN);
	spin_unlock(&args->inode->i_lock);

	args->ld_private->ops = &layoutreturn_ops;
	args->ld_private->data = ff_args;
	return 0;
out_nomem_free:
	kfree(ff_args);
out_nomem:
	return -ENOMEM;
}
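
/*
 * Report any queued DS I/O errors to the server via LAYOUTERROR, in
 * batches of at most NFS42_LAYOUTERROR_MAX entries per call.
 */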
#ifdef CONFIG_NFS_V4_2
void
ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_hdr *lo = lseg->pls_layout;
	struct nfs42_layout_error *errors;
	LIST_HEAD(head);

	if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
		return;
	ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
	if (list_empty(&head))
		return;

	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
			       nfs_io_gfp_mask());
	if (errors != NULL) {
		const struct nfs4_ff_layout_ds_err *pos;
		size_t n = 0;

		list_for_each_entry(pos, &head, list) {
			errors[n].offset = pos->offset;
			errors[n].length = pos->length;
			nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
			errors[n].errors[0].dev_id = pos->deviceid;
			errors[n].errors[0].status = pos->status;
			errors[n].errors[0].opnum = pos->opnum;
			n++;
			if (!list_is_last(&pos->list, &head) &&
			    n < NFS42_LAYOUTERROR_MAX)
				continue;
			if (nfs42_proc_layouterror(lseg, errors, n) < 0)
				break;
			n = 0;
		}
		kfree(errors);
	}
	ff_layout_free_ds_ioerr(&head);
}
#else
void
ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
{
}
#endif

static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;

	return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
}

static size_t
ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
			  const int buflen)
{
	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
	const struct in6_addr *addr = &sin6->sin6_addr;

	/*
	 * RFC 4291, Section 2.2.2
	 *
	 * Shorthanded ANY address
	 */
	if (ipv6_addr_any(addr))
		return snprintf(buf, buflen, "::");

	/*
	 * RFC 4291, Section 2.2.2
	 *
	 * Shorthanded loopback address
	 */
	if (ipv6_addr_loopback(addr))
		return snprintf(buf, buflen, "::1");

	/*
	 * RFC 4291, Section 2.2.3
	 *
	 * Special presentation address format for mapped v4
	 * addresses.
	 */
	if (ipv6_addr_v4mapped(addr))
		return snprintf(buf, buflen, "::ffff:%pI4",
				&addr->s6_addr32[3]);

	/*
	 * RFC 4291, Section 2.2.1
	 */
	return snprintf(buf, buflen, "%pI6c", addr);
}

/* Derived from rpc_sockaddr2uaddr */
static void
ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
{
	struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
	char portbuf[RPCBIND_MAXUADDRPLEN];
	char addrbuf[RPCBIND_MAXUADDRLEN];
	unsigned short port;
	int len, netid_len;
	__be32 *p;

	switch (sap->sa_family) {
	case AF_INET:
		if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
			return;
		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
		break;
	case AF_INET6:
		if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
			return;
		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
	len = strlcat(addrbuf, portbuf, sizeof(addrbuf));

	netid_len = strlen(da->da_netid);
	p = xdr_reserve_space(xdr, 4 + netid_len);
	xdr_encode_opaque(p, da->da_netid, netid_len);

	p = xdr_reserve_space(xdr, 4 + len);
	xdr_encode_opaque(p, addrbuf, len);
}
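
/*
 * nfstime4 on the wire is 12 bytes: a 64-bit seconds field followed by
 * a 32-bit nseconds field.
 */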
static void
ff_layout_encode_nfstime(struct xdr_stream *xdr,
			 ktime_t t)
{
	struct timespec64 ts;
	__be32 *p;

	p = xdr_reserve_space(xdr, 12);
	ts = ktime_to_timespec64(t);
	p = xdr_encode_hyper(p, ts.tv_sec);
	*p++ = cpu_to_be32(ts.tv_nsec);
}
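
/*
 * ff_io_latency4: five 64-bit counters (ops/bytes requested, ops/bytes
 * completed, bytes not delivered), followed by two nfstime4 durations.
 */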
static void
ff_layout_encode_io_latency(struct xdr_stream *xdr,
			    struct nfs4_ff_io_stat *stat)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 5 * 8);
	p = xdr_encode_hyper(p, stat->ops_requested);
	p = xdr_encode_hyper(p, stat->bytes_requested);
	p = xdr_encode_hyper(p, stat->ops_completed);
	p = xdr_encode_hyper(p, stat->bytes_completed);
	p = xdr_encode_hyper(p, stat->bytes_not_delivered);
	ff_layout_encode_nfstime(xdr, stat->total_busy_time);
	ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
}

static void
ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
				 const struct nfs42_layoutstat_devinfo *devinfo,
				 struct nfs4_ff_layout_mirror *mirror)
{
	struct nfs4_pnfs_ds_addr *da;
	struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
	struct nfs_fh *fh = &mirror->fh_versions[0];
	__be32 *p;

	da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
	dprintk("%s: DS %s: encoding address %s\n",
		__func__, ds->ds_remotestr, da->da_remotestr);
	/* netaddr4 */
	ff_layout_encode_netaddr(xdr, da);
	/* nfs_fh4 */
	p = xdr_reserve_space(xdr, 4 + fh->size);
	xdr_encode_opaque(p, fh->data, fh->size);
	/* ff_io_latency4 read */
	spin_lock(&mirror->lock);
	ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
	/* ff_io_latency4 write */
	ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
	spin_unlock(&mirror->lock);
	/* nfstime4 */
	ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
	/* bool */
	p = xdr_reserve_space(xdr, 4);
	*p = cpu_to_be32(false);
}
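
/*
 * The layoutupdate length is not known until the body has been
 * encoded, so a 4-byte length slot is reserved first and back-patched
 * from the XDR stream position afterwards.
 */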
static void
ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
			     const struct nfs4_xdr_opaque_data *opaque)
{
	struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
			struct nfs42_layoutstat_devinfo, ld_private);
	__be32 *start;

	/* layoutupdate length */
	start = xdr_reserve_space(xdr, 4);
	ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);

	*start = cpu_to_be32((xdr->p - start - 1) * 4);
}

static void
ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
{
	struct nfs4_ff_layout_mirror *mirror = opaque->data;

	ff_layout_put_mirror(mirror);
}

static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
	.encode	= ff_layout_encode_layoutstats,
	.free	= ff_layout_free_layoutstats,
};
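
/*
 * Snapshot per-mirror I/O statistics into the devinfo array.  Called
 * with the inode i_lock held; takes a reference on each mirror that is
 * dropped later via the layoutstat_ops ->free callback.
 */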
static int
ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
			       struct nfs42_layoutstat_devinfo *devinfo,
			       int dev_limit, enum nfs4_ff_op_type type)
{
	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *dev;
	int i = 0;

	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
		if (i >= dev_limit)
			break;
		if (IS_ERR_OR_NULL(mirror->mirror_ds))
			continue;
		if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL,
					&mirror->flags) &&
		    type != NFS4_FF_OP_LAYOUTRETURN)
			continue;
		/* mirror refcount put in cleanup_layoutstats */
		if (!refcount_inc_not_zero(&mirror->ref))
			continue;
		dev = &mirror->mirror_ds->id_node;
		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
		devinfo->offset = 0;
		devinfo->length = NFS4_MAX_UINT64;
		spin_lock(&mirror->lock);
		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
		spin_unlock(&mirror->lock);
		devinfo->layout_type = LAYOUT_FLEX_FILES;
		devinfo->ld_private.ops = &layoutstat_ops;
		devinfo->ld_private.data = mirror;

		devinfo++;
		i++;
	}
	return i;
}

static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
	struct pnfs_layout_hdr *lo;
	struct nfs4_flexfile_layout *ff_layout;
	const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;

	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
				      nfs_io_gfp_mask());
	if (!args->devinfo)
		return -ENOMEM;

	spin_lock(&args->inode->i_lock);
	lo = NFS_I(args->inode)->layout;
	if (lo && pnfs_layout_is_valid(lo)) {
		ff_layout = FF_LAYOUT_FROM_HDR(lo);
		args->num_dev = ff_layout_mirror_prepare_stats(
			&ff_layout->generic_hdr, &args->devinfo[0], dev_count,
			NFS4_FF_OP_LAYOUTSTATS);
	} else
		args->num_dev = 0;
	spin_unlock(&args->inode->i_lock);
	if (!args->num_dev) {
		kfree(args->devinfo);
		args->devinfo = NULL;
		return -ENOENT;
	}

	return 0;
}

static int
ff_layout_set_layoutdriver(struct nfs_server *server,
		const struct nfs_fh *dummy)
{
#if IS_ENABLED(CONFIG_NFS_V4_2)
	server->caps |= NFS_CAP_LAYOUTSTATS | NFS_CAP_REBOOT_LAYOUTRETURN;
#endif
	return 0;
}

static const struct pnfs_commit_ops ff_layout_commit_ops = {
	.setup_ds_info		= ff_layout_setup_ds_info,
	.release_ds_info	= ff_layout_release_ds_info,
	.mark_request_commit	= pnfs_layout_mark_request_commit,
	.clear_request_commit	= pnfs_generic_clear_request_commit,
	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
	.commit_pagelist	= ff_layout_commit_pagelist,
};

static struct pnfs_layoutdriver_type flexfilelayout_type = {
	.id			= LAYOUT_FLEX_FILES,
	.name			= "LAYOUT_FLEX_FILES",
	.owner			= THIS_MODULE,
	.flags			= PNFS_LAYOUTGET_ON_OPEN,
	.max_layoutget_response	= 4096, /* 1 page or so... */
	.set_layoutdriver	= ff_layout_set_layoutdriver,
	.alloc_layout_hdr	= ff_layout_alloc_layout_hdr,
	.free_layout_hdr	= ff_layout_free_layout_hdr,
	.alloc_lseg		= ff_layout_alloc_lseg,
	.free_lseg		= ff_layout_free_lseg,
	.add_lseg		= ff_layout_add_lseg,
	.pg_read_ops		= &ff_layout_pg_read_ops,
	.pg_write_ops		= &ff_layout_pg_write_ops,
	.get_ds_info		= ff_layout_get_ds_info,
	.free_deviceid_node	= ff_layout_free_deviceid_node,
	.read_pagelist		= ff_layout_read_pagelist,
	.write_pagelist		= ff_layout_write_pagelist,
	.alloc_deviceid_node	= ff_layout_alloc_deviceid_node,
	.prepare_layoutreturn	= ff_layout_prepare_layoutreturn,
	.sync			= pnfs_nfs_generic_sync,
	.prepare_layoutstats	= ff_layout_prepare_layoutstats,
	.cancel_io		= ff_layout_cancel_io,
};

static int __init nfs4flexfilelayout_init(void)
{
	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
	       __func__);
	return pnfs_register_layoutdriver(&flexfilelayout_type);
}

static void __exit nfs4flexfilelayout_exit(void)
{
	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
	       __func__);
	pnfs_unregister_layoutdriver(&flexfilelayout_type);
}

MODULE_ALIAS("nfs-layouttype4-4");

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");

module_init(nfs4flexfilelayout_init);
module_exit(nfs4flexfilelayout_exit);

module_param(io_maxretrans, ushort, 0644);
MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client "
		 "retries an I/O request before returning an error.");