2 * linux/fs/nfs/blocklayout/blocklayout.c
4 * Module for the NFSv4.1 pNFS block layout driver.
6 * Copyright (c) 2006 The Regents of the University of Michigan.
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
33 #include <linux/module.h>
34 #include <linux/init.h>
35 #include <linux/mount.h>
36 #include <linux/namei.h>
37 #include <linux/bio.h> /* struct bio */
38 #include <linux/prefetch.h>
39 #include <linux/pagevec.h>
42 #include "../nfs4session.h"
43 #include "../internal.h"
44 #include "blocklayout.h"
46 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
48 MODULE_LICENSE("GPL");
49 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
50 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 static bool is_hole(struct pnfs_block_extent
*be
)
54 switch (be
->be_state
) {
55 case PNFS_BLOCK_NONE_DATA
:
57 case PNFS_BLOCK_INVALID_DATA
:
58 return be
->be_tag
? false : true;
64 /* The data we are handed might be spread across several bios. We need
65 * to track when the last one is finished.
69 void (*pnfs_callback
) (void *data
);
73 static inline struct parallel_io
*alloc_parallel(void *data
)
75 struct parallel_io
*rv
;
77 rv
= kmalloc(sizeof(*rv
), GFP_NOFS
);
80 kref_init(&rv
->refcnt
);
85 static inline void get_parallel(struct parallel_io
*p
)
90 static void destroy_parallel(struct kref
*kref
)
92 struct parallel_io
*p
= container_of(kref
, struct parallel_io
, refcnt
);
94 dprintk("%s enter\n", __func__
);
95 p
->pnfs_callback(p
->data
);
99 static inline void put_parallel(struct parallel_io
*p
)
101 kref_put(&p
->refcnt
, destroy_parallel
);
105 bl_submit_bio(struct bio
*bio
)
108 get_parallel(bio
->bi_private
);
109 dprintk("%s submitting %s bio %u@%llu\n", __func__
,
110 bio_op(bio
) == READ
? "read" : "write",
111 bio
->bi_iter
.bi_size
,
112 (unsigned long long)bio
->bi_iter
.bi_sector
);
119 bl_alloc_init_bio(int npg
, struct block_device
*bdev
, sector_t disk_sector
,
120 bio_end_io_t end_io
, struct parallel_io
*par
)
124 npg
= min(npg
, BIO_MAX_PAGES
);
125 bio
= bio_alloc(GFP_NOIO
, npg
);
126 if (!bio
&& (current
->flags
& PF_MEMALLOC
)) {
127 while (!bio
&& (npg
/= 2))
128 bio
= bio_alloc(GFP_NOIO
, npg
);
132 bio
->bi_iter
.bi_sector
= disk_sector
;
133 bio_set_dev(bio
, bdev
);
134 bio
->bi_end_io
= end_io
;
135 bio
->bi_private
= par
;
140 static bool offset_in_map(u64 offset
, struct pnfs_block_dev_map
*map
)
142 return offset
>= map
->start
&& offset
< map
->start
+ map
->len
;
146 do_add_page_to_bio(struct bio
*bio
, int npg
, int rw
, sector_t isect
,
147 struct page
*page
, struct pnfs_block_dev_map
*map
,
148 struct pnfs_block_extent
*be
, bio_end_io_t end_io
,
149 struct parallel_io
*par
, unsigned int offset
, int *len
)
151 struct pnfs_block_dev
*dev
=
152 container_of(be
->be_device
, struct pnfs_block_dev
, node
);
155 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__
,
156 npg
, rw
, (unsigned long long)isect
, offset
, *len
);
158 /* translate to device offset */
159 isect
+= be
->be_v_offset
;
160 isect
-= be
->be_f_offset
;
162 /* translate to physical disk offset */
163 disk_addr
= (u64
)isect
<< SECTOR_SHIFT
;
164 if (!offset_in_map(disk_addr
, map
)) {
165 if (!dev
->map(dev
, disk_addr
, map
) || !offset_in_map(disk_addr
, map
))
166 return ERR_PTR(-EIO
);
167 bio
= bl_submit_bio(bio
);
169 disk_addr
+= map
->disk_offset
;
170 disk_addr
-= map
->start
;
172 /* limit length to what the device mapping allows */
173 end
= disk_addr
+ *len
;
174 if (end
>= map
->start
+ map
->len
)
175 *len
= map
->start
+ map
->len
- disk_addr
;
179 bio
= bl_alloc_init_bio(npg
, map
->bdev
,
180 disk_addr
>> SECTOR_SHIFT
, end_io
, par
);
182 return ERR_PTR(-ENOMEM
);
183 bio_set_op_attrs(bio
, rw
, 0);
185 if (bio_add_page(bio
, page
, *len
, offset
) < *len
) {
186 bio
= bl_submit_bio(bio
);
192 static void bl_mark_devices_unavailable(struct nfs_pgio_header
*header
, bool rw
)
194 struct pnfs_block_layout
*bl
= BLK_LSEG2EXT(header
->lseg
);
195 size_t bytes_left
= header
->args
.count
;
196 sector_t isect
, extent_length
= 0;
197 struct pnfs_block_extent be
;
199 isect
= header
->args
.offset
>> SECTOR_SHIFT
;
200 bytes_left
+= header
->args
.offset
- (isect
<< SECTOR_SHIFT
);
202 while (bytes_left
> 0) {
203 if (!ext_tree_lookup(bl
, isect
, &be
, rw
))
205 extent_length
= be
.be_length
- (isect
- be
.be_f_offset
);
206 nfs4_mark_deviceid_unavailable(be
.be_device
);
207 isect
+= extent_length
;
208 if (bytes_left
> extent_length
<< SECTOR_SHIFT
)
209 bytes_left
-= extent_length
<< SECTOR_SHIFT
;
215 static void bl_end_io_read(struct bio
*bio
)
217 struct parallel_io
*par
= bio
->bi_private
;
219 if (bio
->bi_status
) {
220 struct nfs_pgio_header
*header
= par
->data
;
222 if (!header
->pnfs_error
)
223 header
->pnfs_error
= -EIO
;
224 pnfs_set_lo_fail(header
->lseg
);
225 bl_mark_devices_unavailable(header
, false);
232 static void bl_read_cleanup(struct work_struct
*work
)
234 struct rpc_task
*task
;
235 struct nfs_pgio_header
*hdr
;
236 dprintk("%s enter\n", __func__
);
237 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
238 hdr
= container_of(task
, struct nfs_pgio_header
, task
);
239 pnfs_ld_read_done(hdr
);
243 bl_end_par_io_read(void *data
)
245 struct nfs_pgio_header
*hdr
= data
;
247 hdr
->task
.tk_status
= hdr
->pnfs_error
;
248 INIT_WORK(&hdr
->task
.u
.tk_work
, bl_read_cleanup
);
249 schedule_work(&hdr
->task
.u
.tk_work
);
252 static enum pnfs_try_status
253 bl_read_pagelist(struct nfs_pgio_header
*header
)
255 struct pnfs_block_layout
*bl
= BLK_LSEG2EXT(header
->lseg
);
256 struct pnfs_block_dev_map map
= { .start
= NFS4_MAX_UINT64
};
257 struct bio
*bio
= NULL
;
258 struct pnfs_block_extent be
;
259 sector_t isect
, extent_length
= 0;
260 struct parallel_io
*par
;
261 loff_t f_offset
= header
->args
.offset
;
262 size_t bytes_left
= header
->args
.count
;
263 unsigned int pg_offset
= header
->args
.pgbase
, pg_len
;
264 struct page
**pages
= header
->args
.pages
;
265 int pg_index
= header
->args
.pgbase
>> PAGE_SHIFT
;
266 const bool is_dio
= (header
->dreq
!= NULL
);
267 struct blk_plug plug
;
270 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__
,
271 header
->page_array
.npages
, f_offset
,
272 (unsigned int)header
->args
.count
);
274 par
= alloc_parallel(header
);
276 return PNFS_NOT_ATTEMPTED
;
277 par
->pnfs_callback
= bl_end_par_io_read
;
279 blk_start_plug(&plug
);
281 isect
= (sector_t
) (f_offset
>> SECTOR_SHIFT
);
282 /* Code assumes extents are page-aligned */
283 for (i
= pg_index
; i
< header
->page_array
.npages
; i
++) {
284 if (extent_length
<= 0) {
285 /* We've used up the previous extent */
286 bio
= bl_submit_bio(bio
);
288 /* Get the next one */
289 if (!ext_tree_lookup(bl
, isect
, &be
, false)) {
290 header
->pnfs_error
= -EIO
;
293 extent_length
= be
.be_length
- (isect
- be
.be_f_offset
);
297 if (pg_offset
+ bytes_left
> PAGE_SIZE
)
298 pg_len
= PAGE_SIZE
- pg_offset
;
302 BUG_ON(pg_offset
!= 0);
307 bio
= bl_submit_bio(bio
);
308 /* Fill hole w/ zeroes w/o accessing device */
309 dprintk("%s Zeroing page for hole\n", __func__
);
310 zero_user_segment(pages
[i
], pg_offset
, pg_len
);
313 map
.start
= NFS4_MAX_UINT64
;
315 bio
= do_add_page_to_bio(bio
,
316 header
->page_array
.npages
- i
,
318 isect
, pages
[i
], &map
, &be
,
322 header
->pnfs_error
= PTR_ERR(bio
);
327 isect
+= (pg_len
>> SECTOR_SHIFT
);
328 extent_length
-= (pg_len
>> SECTOR_SHIFT
);
330 bytes_left
-= pg_len
;
333 if ((isect
<< SECTOR_SHIFT
) >= header
->inode
->i_size
) {
335 header
->res
.count
= header
->inode
->i_size
- header
->args
.offset
;
337 header
->res
.count
= (isect
<< SECTOR_SHIFT
) - header
->args
.offset
;
341 blk_finish_plug(&plug
);
343 return PNFS_ATTEMPTED
;
346 static void bl_end_io_write(struct bio
*bio
)
348 struct parallel_io
*par
= bio
->bi_private
;
349 struct nfs_pgio_header
*header
= par
->data
;
351 if (bio
->bi_status
) {
352 if (!header
->pnfs_error
)
353 header
->pnfs_error
= -EIO
;
354 pnfs_set_lo_fail(header
->lseg
);
355 bl_mark_devices_unavailable(header
, true);
361 /* Function scheduled for call during bl_end_par_io_write,
362 * it marks sectors as written and extends the commitlist.
364 static void bl_write_cleanup(struct work_struct
*work
)
366 struct rpc_task
*task
= container_of(work
, struct rpc_task
, u
.tk_work
);
367 struct nfs_pgio_header
*hdr
=
368 container_of(task
, struct nfs_pgio_header
, task
);
370 dprintk("%s enter\n", __func__
);
372 if (likely(!hdr
->pnfs_error
)) {
373 struct pnfs_block_layout
*bl
= BLK_LSEG2EXT(hdr
->lseg
);
374 u64 start
= hdr
->args
.offset
& (loff_t
)PAGE_MASK
;
375 u64 end
= (hdr
->args
.offset
+ hdr
->args
.count
+
376 PAGE_SIZE
- 1) & (loff_t
)PAGE_MASK
;
377 u64 lwb
= hdr
->args
.offset
+ hdr
->args
.count
;
379 ext_tree_mark_written(bl
, start
>> SECTOR_SHIFT
,
380 (end
- start
) >> SECTOR_SHIFT
, lwb
);
383 pnfs_ld_write_done(hdr
);
386 /* Called when last of bios associated with a bl_write_pagelist call finishes */
387 static void bl_end_par_io_write(void *data
)
389 struct nfs_pgio_header
*hdr
= data
;
391 hdr
->task
.tk_status
= hdr
->pnfs_error
;
392 hdr
->verf
.committed
= NFS_FILE_SYNC
;
393 INIT_WORK(&hdr
->task
.u
.tk_work
, bl_write_cleanup
);
394 schedule_work(&hdr
->task
.u
.tk_work
);
397 static enum pnfs_try_status
398 bl_write_pagelist(struct nfs_pgio_header
*header
, int sync
)
400 struct pnfs_block_layout
*bl
= BLK_LSEG2EXT(header
->lseg
);
401 struct pnfs_block_dev_map map
= { .start
= NFS4_MAX_UINT64
};
402 struct bio
*bio
= NULL
;
403 struct pnfs_block_extent be
;
404 sector_t isect
, extent_length
= 0;
405 struct parallel_io
*par
= NULL
;
406 loff_t offset
= header
->args
.offset
;
407 size_t count
= header
->args
.count
;
408 struct page
**pages
= header
->args
.pages
;
409 int pg_index
= header
->args
.pgbase
>> PAGE_SHIFT
;
411 struct blk_plug plug
;
414 dprintk("%s enter, %zu@%lld\n", __func__
, count
, offset
);
416 /* At this point, header->page_aray is a (sequential) list of nfs_pages.
417 * We want to write each, and if there is an error set pnfs_error
418 * to have it redone using nfs.
420 par
= alloc_parallel(header
);
422 return PNFS_NOT_ATTEMPTED
;
423 par
->pnfs_callback
= bl_end_par_io_write
;
425 blk_start_plug(&plug
);
427 /* we always write out the whole page */
428 offset
= offset
& (loff_t
)PAGE_MASK
;
429 isect
= offset
>> SECTOR_SHIFT
;
431 for (i
= pg_index
; i
< header
->page_array
.npages
; i
++) {
432 if (extent_length
<= 0) {
433 /* We've used up the previous extent */
434 bio
= bl_submit_bio(bio
);
435 /* Get the next one */
436 if (!ext_tree_lookup(bl
, isect
, &be
, true)) {
437 header
->pnfs_error
= -EINVAL
;
441 extent_length
= be
.be_length
- (isect
- be
.be_f_offset
);
445 bio
= do_add_page_to_bio(bio
, header
->page_array
.npages
- i
,
446 WRITE
, isect
, pages
[i
], &map
, &be
,
447 bl_end_io_write
, par
,
450 header
->pnfs_error
= PTR_ERR(bio
);
457 isect
+= (pg_len
>> SECTOR_SHIFT
);
458 extent_length
-= (pg_len
>> SECTOR_SHIFT
);
461 header
->res
.count
= header
->args
.count
;
464 blk_finish_plug(&plug
);
466 return PNFS_ATTEMPTED
;
469 static void bl_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
471 struct pnfs_block_layout
*bl
= BLK_LO2EXT(lo
);
474 dprintk("%s enter\n", __func__
);
476 err
= ext_tree_remove(bl
, true, 0, LLONG_MAX
);
479 kfree_rcu(bl
, bl_layout
.plh_rcu
);
482 static struct pnfs_layout_hdr
*__bl_alloc_layout_hdr(struct inode
*inode
,
483 gfp_t gfp_flags
, bool is_scsi_layout
)
485 struct pnfs_block_layout
*bl
;
487 dprintk("%s enter\n", __func__
);
488 bl
= kzalloc(sizeof(*bl
), gfp_flags
);
492 bl
->bl_ext_rw
= RB_ROOT
;
493 bl
->bl_ext_ro
= RB_ROOT
;
494 spin_lock_init(&bl
->bl_ext_lock
);
496 bl
->bl_scsi_layout
= is_scsi_layout
;
497 return &bl
->bl_layout
;
500 static struct pnfs_layout_hdr
*bl_alloc_layout_hdr(struct inode
*inode
,
503 return __bl_alloc_layout_hdr(inode
, gfp_flags
, false);
506 static struct pnfs_layout_hdr
*sl_alloc_layout_hdr(struct inode
*inode
,
509 return __bl_alloc_layout_hdr(inode
, gfp_flags
, true);
/* Free a layout segment previously returned by bl_alloc_lseg(). */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);

	kfree(lseg);
}
518 /* Tracks info needed to ensure extents in layout obey constraints of spec */
519 struct layout_verification
{
520 u32 mode
; /* R or RW */
521 u64 start
; /* Expected start of next non-COW extent */
522 u64 inval
; /* Start of INVAL coverage */
523 u64 cowread
; /* End of COW read coverage */
526 /* Verify the extent meets the layout requirements of the pnfs-block draft,
529 static int verify_extent(struct pnfs_block_extent
*be
,
530 struct layout_verification
*lv
)
532 if (lv
->mode
== IOMODE_READ
) {
533 if (be
->be_state
== PNFS_BLOCK_READWRITE_DATA
||
534 be
->be_state
== PNFS_BLOCK_INVALID_DATA
)
536 if (be
->be_f_offset
!= lv
->start
)
538 lv
->start
+= be
->be_length
;
541 /* lv->mode == IOMODE_RW */
542 if (be
->be_state
== PNFS_BLOCK_READWRITE_DATA
) {
543 if (be
->be_f_offset
!= lv
->start
)
545 if (lv
->cowread
> lv
->start
)
547 lv
->start
+= be
->be_length
;
548 lv
->inval
= lv
->start
;
550 } else if (be
->be_state
== PNFS_BLOCK_INVALID_DATA
) {
551 if (be
->be_f_offset
!= lv
->start
)
553 lv
->start
+= be
->be_length
;
555 } else if (be
->be_state
== PNFS_BLOCK_READ_DATA
) {
556 if (be
->be_f_offset
> lv
->start
)
558 if (be
->be_f_offset
< lv
->inval
)
560 if (be
->be_f_offset
< lv
->cowread
)
562 /* It looks like you might want to min this with lv->start,
563 * but you really don't.
565 lv
->inval
= lv
->inval
+ be
->be_length
;
566 lv
->cowread
= be
->be_f_offset
+ be
->be_length
;
572 static int decode_sector_number(__be32
**rp
, sector_t
*sp
)
576 *rp
= xdr_decode_hyper(*rp
, &s
);
578 printk(KERN_WARNING
"NFS: %s: sector not aligned\n", __func__
);
581 *sp
= s
>> SECTOR_SHIFT
;
585 static struct nfs4_deviceid_node
*
586 bl_find_get_deviceid(struct nfs_server
*server
,
587 const struct nfs4_deviceid
*id
, const struct cred
*cred
,
590 struct nfs4_deviceid_node
*node
;
591 unsigned long start
, end
;
594 node
= nfs4_find_get_deviceid(server
, id
, cred
, gfp_mask
);
596 return ERR_PTR(-ENODEV
);
598 if (test_bit(NFS_DEVICEID_UNAVAILABLE
, &node
->flags
) == 0)
602 start
= end
- PNFS_DEVICE_RETRY_TIMEOUT
;
603 if (!time_in_range(node
->timestamp_unavailable
, start
, end
)) {
604 nfs4_delete_deviceid(node
->ld
, node
->nfs_client
, id
);
607 return ERR_PTR(-ENODEV
);
611 bl_alloc_extent(struct xdr_stream
*xdr
, struct pnfs_layout_hdr
*lo
,
612 struct layout_verification
*lv
, struct list_head
*extents
,
615 struct pnfs_block_extent
*be
;
616 struct nfs4_deviceid id
;
620 p
= xdr_inline_decode(xdr
, 28 + NFS4_DEVICEID4_SIZE
);
624 be
= kzalloc(sizeof(*be
), GFP_NOFS
);
628 memcpy(&id
, p
, NFS4_DEVICEID4_SIZE
);
629 p
+= XDR_QUADLEN(NFS4_DEVICEID4_SIZE
);
631 be
->be_device
= bl_find_get_deviceid(NFS_SERVER(lo
->plh_inode
), &id
,
632 lo
->plh_lc_cred
, gfp_mask
);
633 if (IS_ERR(be
->be_device
)) {
634 error
= PTR_ERR(be
->be_device
);
639 * The next three values are read in as bytes, but stored in the
640 * extent structure in 512-byte granularity.
643 if (decode_sector_number(&p
, &be
->be_f_offset
) < 0)
644 goto out_put_deviceid
;
645 if (decode_sector_number(&p
, &be
->be_length
) < 0)
646 goto out_put_deviceid
;
647 if (decode_sector_number(&p
, &be
->be_v_offset
) < 0)
648 goto out_put_deviceid
;
649 be
->be_state
= be32_to_cpup(p
++);
651 error
= verify_extent(be
, lv
);
653 dprintk("%s: extent verification failed\n", __func__
);
654 goto out_put_deviceid
;
657 list_add_tail(&be
->be_list
, extents
);
661 nfs4_put_deviceid_node(be
->be_device
);
667 static struct pnfs_layout_segment
*
668 bl_alloc_lseg(struct pnfs_layout_hdr
*lo
, struct nfs4_layoutget_res
*lgr
,
671 struct layout_verification lv
= {
672 .mode
= lgr
->range
.iomode
,
673 .start
= lgr
->range
.offset
>> SECTOR_SHIFT
,
674 .inval
= lgr
->range
.offset
>> SECTOR_SHIFT
,
675 .cowread
= lgr
->range
.offset
>> SECTOR_SHIFT
,
677 struct pnfs_block_layout
*bl
= BLK_LO2EXT(lo
);
678 struct pnfs_layout_segment
*lseg
;
680 struct xdr_stream xdr
;
681 struct page
*scratch
;
687 dprintk("---> %s\n", __func__
);
689 lseg
= kzalloc(sizeof(*lseg
), gfp_mask
);
691 return ERR_PTR(-ENOMEM
);
694 scratch
= alloc_page(gfp_mask
);
698 xdr_init_decode_pages(&xdr
, &buf
,
699 lgr
->layoutp
->pages
, lgr
->layoutp
->len
);
700 xdr_set_scratch_page(&xdr
, scratch
);
703 p
= xdr_inline_decode(&xdr
, 4);
705 goto out_free_scratch
;
707 count
= be32_to_cpup(p
++);
708 dprintk("%s: number of extents %d\n", __func__
, count
);
711 * Decode individual extents, putting them in temporary staging area
712 * until whole layout is decoded to make error recovery easier.
714 for (i
= 0; i
< count
; i
++) {
715 status
= bl_alloc_extent(&xdr
, lo
, &lv
, &extents
, gfp_mask
);
717 goto process_extents
;
720 if (lgr
->range
.offset
+ lgr
->range
.length
!=
721 lv
.start
<< SECTOR_SHIFT
) {
722 dprintk("%s Final length mismatch\n", __func__
);
724 goto process_extents
;
727 if (lv
.start
< lv
.cowread
) {
728 dprintk("%s Final uncovered COW extent\n", __func__
);
733 while (!list_empty(&extents
)) {
734 struct pnfs_block_extent
*be
=
735 list_first_entry(&extents
, struct pnfs_block_extent
,
737 list_del(&be
->be_list
);
740 status
= ext_tree_insert(bl
, be
);
743 nfs4_put_deviceid_node(be
->be_device
);
749 __free_page(scratch
);
751 dprintk("%s returns %d\n", __func__
, status
);
754 /* Our extent block devices are unavailable */
755 set_bit(NFS_LSEG_UNAVAILABLE
, &lseg
->pls_flags
);
761 return ERR_PTR(status
);
766 bl_return_range(struct pnfs_layout_hdr
*lo
,
767 struct pnfs_layout_range
*range
)
769 struct pnfs_block_layout
*bl
= BLK_LO2EXT(lo
);
770 sector_t offset
= range
->offset
>> SECTOR_SHIFT
, end
;
772 if (range
->offset
% 8) {
773 dprintk("%s: offset %lld not block size aligned\n",
774 __func__
, range
->offset
);
778 if (range
->length
!= NFS4_MAX_UINT64
) {
779 if (range
->length
% 8) {
780 dprintk("%s: length %lld not block size aligned\n",
781 __func__
, range
->length
);
785 end
= offset
+ (range
->length
>> SECTOR_SHIFT
);
787 end
= round_down(NFS4_MAX_UINT64
, PAGE_SIZE
);
790 ext_tree_remove(bl
, range
->iomode
& IOMODE_RW
, offset
, end
);
/* Layout driver hook: delegate LAYOUTCOMMIT preparation to the extent tree. */
static int
bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
{
	int ret = ext_tree_prepare_commit(arg);

	return ret;
}
800 bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data
*lcdata
)
802 ext_tree_mark_committed(&lcdata
->args
, lcdata
->res
.status
);
806 bl_set_layoutdriver(struct nfs_server
*server
, const struct nfs_fh
*fh
)
808 dprintk("%s enter\n", __func__
);
810 if (server
->pnfs_blksize
== 0) {
811 dprintk("%s Server did not return blksize\n", __func__
);
814 if (server
->pnfs_blksize
> PAGE_SIZE
) {
815 printk(KERN_ERR
"%s: pNFS blksize %d not supported.\n",
816 __func__
, server
->pnfs_blksize
);
824 is_aligned_req(struct nfs_pageio_descriptor
*pgio
,
825 struct nfs_page
*req
, unsigned int alignment
, bool is_write
)
828 * Always accept buffered writes, higher layers take care of the
831 if (pgio
->pg_dreq
== NULL
)
834 if (!IS_ALIGNED(req
->wb_offset
, alignment
))
837 if (IS_ALIGNED(req
->wb_bytes
, alignment
))
841 (req_offset(req
) + req
->wb_bytes
== i_size_read(pgio
->pg_inode
))) {
843 * If the write goes up to the inode size, just write
844 * the full page. Data past the inode size is
845 * guaranteed to be zeroed by the higher level client
846 * code, and this behaviour is mandated by RFC 5663
856 bl_pg_init_read(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
858 if (!is_aligned_req(pgio
, req
, SECTOR_SIZE
, false)) {
859 nfs_pageio_reset_read_mds(pgio
);
863 pnfs_generic_pg_init_read(pgio
, req
);
866 test_bit(NFS_LSEG_UNAVAILABLE
, &pgio
->pg_lseg
->pls_flags
)) {
867 pnfs_error_mark_layout_for_return(pgio
->pg_inode
, pgio
->pg_lseg
);
868 pnfs_set_lo_fail(pgio
->pg_lseg
);
869 nfs_pageio_reset_read_mds(pgio
);
874 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
875 * of bytes (maximum @req->wb_bytes) that can be coalesced.
878 bl_pg_test_read(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*prev
,
879 struct nfs_page
*req
)
881 if (!is_aligned_req(pgio
, req
, SECTOR_SIZE
, false))
883 return pnfs_generic_pg_test(pgio
, prev
, req
);
887 * Return the number of contiguous bytes for a given inode
888 * starting at page frame idx.
890 static u64
pnfs_num_cont_bytes(struct inode
*inode
, pgoff_t idx
)
892 struct address_space
*mapping
= inode
->i_mapping
;
895 /* Optimize common case that writes from 0 to end of file */
896 end
= DIV_ROUND_UP(i_size_read(inode
), PAGE_SIZE
);
897 if (end
!= inode
->i_mapping
->nrpages
) {
899 end
= page_cache_next_miss(mapping
, idx
+ 1, ULONG_MAX
);
904 return i_size_read(inode
) - (idx
<< PAGE_SHIFT
);
906 return (end
- idx
) << PAGE_SHIFT
;
910 bl_pg_init_write(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
914 if (!is_aligned_req(pgio
, req
, PAGE_SIZE
, true)) {
915 nfs_pageio_reset_write_mds(pgio
);
919 if (pgio
->pg_dreq
== NULL
)
920 wb_size
= pnfs_num_cont_bytes(pgio
->pg_inode
,
923 wb_size
= nfs_dreq_bytes_left(pgio
->pg_dreq
);
925 pnfs_generic_pg_init_write(pgio
, req
, wb_size
);
928 test_bit(NFS_LSEG_UNAVAILABLE
, &pgio
->pg_lseg
->pls_flags
)) {
930 pnfs_error_mark_layout_for_return(pgio
->pg_inode
, pgio
->pg_lseg
);
931 pnfs_set_lo_fail(pgio
->pg_lseg
);
932 nfs_pageio_reset_write_mds(pgio
);
937 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
938 * of bytes (maximum @req->wb_bytes) that can be coalesced.
941 bl_pg_test_write(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*prev
,
942 struct nfs_page
*req
)
944 if (!is_aligned_req(pgio
, req
, PAGE_SIZE
, true))
946 return pnfs_generic_pg_test(pgio
, prev
, req
);
949 static const struct nfs_pageio_ops bl_pg_read_ops
= {
950 .pg_init
= bl_pg_init_read
,
951 .pg_test
= bl_pg_test_read
,
952 .pg_doio
= pnfs_generic_pg_readpages
,
953 .pg_cleanup
= pnfs_generic_pg_cleanup
,
956 static const struct nfs_pageio_ops bl_pg_write_ops
= {
957 .pg_init
= bl_pg_init_write
,
958 .pg_test
= bl_pg_test_write
,
959 .pg_doio
= pnfs_generic_pg_writepages
,
960 .pg_cleanup
= pnfs_generic_pg_cleanup
,
963 static struct pnfs_layoutdriver_type blocklayout_type
= {
964 .id
= LAYOUT_BLOCK_VOLUME
,
965 .name
= "LAYOUT_BLOCK_VOLUME",
966 .owner
= THIS_MODULE
,
967 .flags
= PNFS_LAYOUTRET_ON_SETATTR
|
968 PNFS_LAYOUTRET_ON_ERROR
|
969 PNFS_READ_WHOLE_PAGE
,
970 .read_pagelist
= bl_read_pagelist
,
971 .write_pagelist
= bl_write_pagelist
,
972 .alloc_layout_hdr
= bl_alloc_layout_hdr
,
973 .free_layout_hdr
= bl_free_layout_hdr
,
974 .alloc_lseg
= bl_alloc_lseg
,
975 .free_lseg
= bl_free_lseg
,
976 .return_range
= bl_return_range
,
977 .prepare_layoutcommit
= bl_prepare_layoutcommit
,
978 .cleanup_layoutcommit
= bl_cleanup_layoutcommit
,
979 .set_layoutdriver
= bl_set_layoutdriver
,
980 .alloc_deviceid_node
= bl_alloc_deviceid_node
,
981 .free_deviceid_node
= bl_free_deviceid_node
,
982 .pg_read_ops
= &bl_pg_read_ops
,
983 .pg_write_ops
= &bl_pg_write_ops
,
984 .sync
= pnfs_generic_sync
,
987 static struct pnfs_layoutdriver_type scsilayout_type
= {
989 .name
= "LAYOUT_SCSI",
990 .owner
= THIS_MODULE
,
991 .flags
= PNFS_LAYOUTRET_ON_SETATTR
|
992 PNFS_LAYOUTRET_ON_ERROR
|
993 PNFS_READ_WHOLE_PAGE
,
994 .read_pagelist
= bl_read_pagelist
,
995 .write_pagelist
= bl_write_pagelist
,
996 .alloc_layout_hdr
= sl_alloc_layout_hdr
,
997 .free_layout_hdr
= bl_free_layout_hdr
,
998 .alloc_lseg
= bl_alloc_lseg
,
999 .free_lseg
= bl_free_lseg
,
1000 .return_range
= bl_return_range
,
1001 .prepare_layoutcommit
= bl_prepare_layoutcommit
,
1002 .cleanup_layoutcommit
= bl_cleanup_layoutcommit
,
1003 .set_layoutdriver
= bl_set_layoutdriver
,
1004 .alloc_deviceid_node
= bl_alloc_deviceid_node
,
1005 .free_deviceid_node
= bl_free_deviceid_node
,
1006 .pg_read_ops
= &bl_pg_read_ops
,
1007 .pg_write_ops
= &bl_pg_write_ops
,
1008 .sync
= pnfs_generic_sync
,
1012 static int __init
nfs4blocklayout_init(void)
1016 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__
);
1018 ret
= bl_init_pipefs();
1022 ret
= pnfs_register_layoutdriver(&blocklayout_type
);
1024 goto out_cleanup_pipe
;
1026 ret
= pnfs_register_layoutdriver(&scsilayout_type
);
1028 goto out_unregister_block
;
1031 out_unregister_block
:
1032 pnfs_unregister_layoutdriver(&blocklayout_type
);
1034 bl_cleanup_pipefs();
1039 static void __exit
nfs4blocklayout_exit(void)
1041 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1044 pnfs_unregister_layoutdriver(&scsilayout_type
);
1045 pnfs_unregister_layoutdriver(&blocklayout_type
);
1046 bl_cleanup_pipefs();
1049 MODULE_ALIAS("nfs-layouttype4-3");
1050 MODULE_ALIAS("nfs-layouttype4-5");
1052 module_init(nfs4blocklayout_init
);
1053 module_exit(nfs4blocklayout_exit
);