/*
 *  linux/fs/nfs/blocklayout/blocklayout.c
 *
 *  Module for the NFSv4.1 pNFS block layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *  All rights reserved.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization.  if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose.  the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h>		/* struct bio */
#include <linux/buffer_head.h>	/* various write calls */
#include <linux/prefetch.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY	NFSDBG_PNFS_LD

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");

struct dentry *bl_device_pipe;
wait_queue_head_t bl_wq;

static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk("	PagePrivate %d\n", PagePrivate(page));
	dprintk("	PageUptodate %d\n", PageUptodate(page));
	dprintk("	PageError %d\n", PageError(page));
	dprintk("	PageDirty %d\n", PageDirty(page));
	dprintk("	PageReferenced %d\n", PageReferenced(page));
	dprintk("	PageLocked %d\n", PageLocked(page));
	dprintk("	PageWriteback %d\n", PageWriteback(page));
	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}

/* Given the be associated with isect, determine if page data needs to be
 * initialized.
 */
static int is_hole(struct pnfs_block_extent *be, sector_t isect)
{
	if (be->be_state == PNFS_BLOCK_NONE_DATA)
		return 1;
	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
		return 0;
	else
		return !bl_is_sector_init(be->be_inval, isect);
}

/* Given the be associated with isect, determine if page data can be
 * written to.
 */
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{
	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		be->be_state == PNFS_BLOCK_INVALID_DATA);
}

/* The data we are handed might be spread across several bios.  We need
 * to track when the last one is finished.
 */
struct parallel_io {
	struct kref refcnt;
	void (*pnfs_callback) (void *data, int num_se);
	void *data;
	int bse_count;
};

static inline struct parallel_io *alloc_parallel(void *data)
{
	struct parallel_io *rv;

	rv = kmalloc(sizeof(*rv), GFP_NOFS);
	if (rv) {
		rv->data = data;
		kref_init(&rv->refcnt);
		rv->bse_count = 0;
	}
	return rv;
}

static inline void get_parallel(struct parallel_io *p)
{
	kref_get(&p->refcnt);
}

static void destroy_parallel(struct kref *kref)
{
	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

	dprintk("%s enter\n", __func__);
	p->pnfs_callback(p->data, p->bse_count);
	kfree(p);
}

static inline void put_parallel(struct parallel_io *p)
{
	kref_put(&p->refcnt, destroy_parallel);
}

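/*
 * Lifetime of a parallel_io: alloc_parallel() starts the kref at 1 on
 * behalf of the pagelist call, bl_submit_bio() takes an extra reference
 * for each bio it submits, and every bio completion drops one through
 * put_parallel().  The pagelist call drops its own reference once all
 * pages are queued, so pnfs_callback fires exactly once, after the last
 * bio has finished.
 */
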
static struct bio *
bl_submit_bio(int rw, struct bio *bio)
{
	if (bio) {
		get_parallel(bio->bi_private);
		dprintk("%s submitting %s bio %u@%llu\n", __func__,
			rw == READ ? "read" : "write",
			bio->bi_size, (unsigned long long)bio->bi_sector);
		submit_bio(rw, bio);
	}
	return NULL;
}

static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
				     struct pnfs_block_extent *be,
				     void (*end_io)(struct bio *, int err),
				     struct parallel_io *par)
{
	struct bio *bio;

	npg = min(npg, BIO_MAX_PAGES);
	bio = bio_alloc(GFP_NOIO, npg);
	if (!bio && (current->flags & PF_MEMALLOC)) {
		while (!bio && (npg /= 2))
			bio = bio_alloc(GFP_NOIO, npg);
	}

	if (bio) {
		bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
		bio->bi_bdev = be->be_mdev;
		bio->bi_end_io = end_io;
		bio->bi_private = par;
	}
	return bio;
}

static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
				      sector_t isect, struct page *page,
				      struct pnfs_block_extent *be,
				      void (*end_io)(struct bio *, int err),
				      struct parallel_io *par)
{
retry:
	if (!bio) {
		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
		if (!bio)
			return ERR_PTR(-ENOMEM);
	}
	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
		bio = bl_submit_bio(rw, bio);
		goto retry;
	}
	return bio;
}

/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate)
			SetPageUptodate(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!rdata->pnfs_error)
			rdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(rdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

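/*
 * bi_end_io handlers run in bio completion context, so the final pNFS
 * completion is deferred: bl_end_par_io_read() just queues
 * bl_read_cleanup(), which calls pnfs_ld_read_done() from a workqueue.
 */
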
static void bl_read_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_read_data *rdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	rdata = container_of(task, struct nfs_read_data, task);
	pnfs_ld_read_done(rdata);
}

static void
bl_end_par_io_read(void *data, int unused)
{
	struct nfs_read_data *rdata = data;

	rdata->task.tk_status = rdata->pnfs_error;
	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
	schedule_work(&rdata->task.u.tk_work);
}

static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
	int i, hole;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = rdata->args.offset;
	size_t count = rdata->args.count;
	struct page **pages = rdata->args.pages;
	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
		rdata->npages, f_offset, count);

	par = alloc_parallel(rdata);
	if (!par)
		goto use_mds;
	par->pnfs_callback = bl_end_par_io_read;
	/* At this point, we can no longer jump to use_mds */

	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < rdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bl_put_extent(cow_read);
			bio = bl_submit_bio(READ, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
					     isect, &cow_read);
			if (!be) {
				rdata->pnfs_error = -EIO;
				goto out;
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
			if (cow_read) {
				sector_t cow_length = cow_read->be_length -
					(isect - cow_read->be_f_offset);
				extent_length = min(extent_length, cow_length);
			}
		}
		hole = is_hole(be, isect);
		if (hole && !cow_read) {
			bio = bl_submit_bio(READ, bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
			print_page(pages[i]);
			SetPageUptodate(pages[i]);
		} else {
			struct pnfs_block_extent *be_read;

			be_read = (hole && cow_read) ? cow_read : be;
			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
						 isect, pages[i], be_read,
						 bl_end_io_read, par);
			if (IS_ERR(bio)) {
				rdata->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
		}
		isect += PAGE_CACHE_SECTORS;
		extent_length -= PAGE_CACHE_SECTORS;
	}
	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
		rdata->res.eof = 1;
		rdata->res.count = rdata->inode->i_size - f_offset;
	} else {
		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
	}
out:
	bl_put_extent(be);
	bl_put_extent(cow_read);
	bl_submit_bio(READ, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;

use_mds:
	dprintk("Giving up and using normal NFS\n");
	return PNFS_NOT_ATTEMPTED;
}

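/*
 * Writes into PNFS_BLOCK_INVALID_DATA extents must be reported to the
 * server via LAYOUTCOMMIT; mark_extents_written() walks the just-written
 * range and records it on the layout's commit list using the short
 * extents reserved earlier by bl_push_one_short_extent().
 */
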
static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t isect, end;
	struct pnfs_block_extent *be;
	struct pnfs_block_short_extent *se;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;
	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= SECTOR_SHIFT;
	while (isect < end) {
		sector_t len;
		be = bl_find_get_extent(bl, isect, NULL);
		BUG_ON(!be); /* FIXME */
		len = min(end, be->be_f_offset + be->be_length) - isect;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			se = bl_pop_one_short_extent(be->be_inval);
			BUG_ON(!se);
			bl_mark_for_commit(be, isect, len, se);
		}
		isect += len;
		bl_put_extent(be);
	}
}

static void bl_end_io_write_zero(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		/* This is the zeroing page we added */
		end_page_writeback(page);
		page_cache_release(page);
	} while (bvec >= bio->bi_io_vec);

	if (unlikely(!uptodate)) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

static void bl_end_io_write(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		pnfs_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Function scheduled for call during bl_end_par_io_write,
 * it marks sectors as written and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_write_data *wdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (likely(!wdata->pnfs_error)) {
		/* Marks for LAYOUTCOMMIT */
		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
				     wdata->args.offset, wdata->args.count);
	}
	pnfs_ld_write_done(wdata);
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data, int num_se)
{
	struct nfs_write_data *wdata = data;

	if (unlikely(wdata->pnfs_error)) {
		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
					num_se);
	}

	wdata->task.tk_status = wdata->pnfs_error;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* FIXME STUB - mark intersection of layout and page as bad, so is not
 * used again.
 */
static void mark_bad_read(void)
{
	return;
}

/*
 * map_block:  map a requested I/O block (isect) into an offset in the LVM
 * block_device
 */
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
	dprintk("%s enter be=%p\n", __func__, be);

	set_buffer_mapped(bh);
	bh->b_bdev = be->be_mdev;
	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);

	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
		bh->b_size);
	return;
}

/* Given an unmapped page, zero it or read in page for COW, page is locked
 * by caller.
 */
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
	struct buffer_head *bh = NULL;
	int ret = 0;
	sector_t isect;

	dprintk("%s enter, %p\n", __func__, page);
	BUG_ON(PageUptodate(page));
	if (!cow_read) {
		zero_user_segment(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		goto cleanup;
	}

	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto cleanup;
	}

	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
	map_block(bh, isect, cow_read);
	if (!bh_uptodate_or_lock(bh))
		ret = bh_submit_read(bh);
	if (ret)
		goto cleanup;
	SetPageUptodate(page);

cleanup:
	bl_put_extent(cow_read);
	if (bh)
		free_buffer_head(bh);
	if (ret) {
		/* Need to mark layout with bad read...should now
		 * just use nfs4 for reads and writes.
		 */
		mark_bad_read();
	}
	return ret;
}

/* Find or create a zeroing page marked being writeback.
 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
 * to indicate write out.
 */
static struct page *
bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
			 struct pnfs_block_extent *cow_read)
{
	struct page *page;
	int locked = 0;
	page = find_get_page(inode->i_mapping, index);
	if (page)
		goto check_page;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (unlikely(!page)) {
		dprintk("%s oom\n", __func__);
		return ERR_PTR(-ENOMEM);
	}
	locked = 1;

check_page:
	/* PageDirty: Other will write this out
	 * PageWriteback: Other is writing this out
	 * PageUptodate: It was read before
	 */
	if (PageDirty(page) || PageWriteback(page)) {
		print_page(page);
		if (locked)
			unlock_page(page);
		page_cache_release(page);
		return NULL;
	}

	if (!locked) {
		lock_page(page);
		locked = 1;
		goto check_page;
	}
	if (!PageUptodate(page)) {
		/* New page, readin or zero it */
		init_page_for_write(page, cow_read);
	}
	set_page_writeback(page);
	unlock_page(page);

	return page;
}

static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
	int i, ret, npg_zero, pg_index, last = 0;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, last_isect = 0, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	struct page *page;
	pgoff_t index;
	u64 temp;
	int npg_per_block =
	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error set pnfs_error
	 * to have it redone using nfs.
	 */
	par = alloc_parallel(wdata);
	if (!par)
		goto out_mds;
	par->pnfs_callback = bl_end_par_io_write;
	/* At this point, have to be more careful with error handling */

	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
	if (!be || !is_writable(be, isect)) {
		dprintk("%s no matching extents!\n", __func__);
		goto out_mds;
	}

	/* First page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		if (likely(!bl_push_one_short_extent(be->be_inval)))
			par->bse_count++;
		else
			goto out_mds;
		temp = offset >> PAGE_CACHE_SHIFT;
		npg_zero = do_div(temp, npg_per_block);
		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
		extent_length = be->be_length - (isect - be->be_f_offset);

fill_invalid_ext:
		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
		for (;npg_zero > 0; npg_zero--) {
			if (bl_is_sector_init(be->be_inval, isect)) {
				dprintk("isect %llu already init\n",
					(unsigned long long)isect);
				goto next_page;
			}
			/* page ref released in bl_end_io_write_zero */
			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
			dprintk("%s zero %dth page: index %lu isect %llu\n",
				__func__, npg_zero, index,
				(unsigned long long)isect);
			page = bl_find_get_zeroing_page(wdata->inode, index,
							cow_read);
			if (unlikely(IS_ERR(page))) {
				wdata->pnfs_error = PTR_ERR(page);
				goto out;
			} else if (page == NULL)
				goto next_page;

			ret = bl_mark_sectors_init(be->be_inval, isect,
						   PAGE_CACHE_SECTORS);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				end_page_writeback(page);
				page_cache_release(page);
				wdata->pnfs_error = ret;
				goto out;
			}
			if (likely(!bl_push_one_short_extent(be->be_inval)))
				par->bse_count++;
			else {
				end_page_writeback(page);
				page_cache_release(page);
				wdata->pnfs_error = -ENOMEM;
				goto out;
			}
			/* FIXME: This should be done in bi_end_io */
			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
					     page->index << PAGE_CACHE_SHIFT,
					     PAGE_CACHE_SIZE);

			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
						 isect, page, be,
						 bl_end_io_write_zero, par);
			if (IS_ERR(bio)) {
				wdata->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
next_page:
			isect += PAGE_CACHE_SECTORS;
			extent_length -= PAGE_CACHE_SECTORS;
		}
		if (last)
			goto write_done;
	}
	bio = bl_submit_bio(WRITE, bio);

	/* Middle pages */
	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	for (i = pg_index; i < wdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bio = bl_submit_bio(WRITE, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
					     isect, NULL);
			if (!be || !is_writable(be, isect)) {
				wdata->pnfs_error = -EINVAL;
				goto out;
			}
			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
				if (likely(!bl_push_one_short_extent(
								be->be_inval)))
					par->bse_count++;
				else {
					wdata->pnfs_error = -ENOMEM;
					goto out;
				}
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
		}
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			ret = bl_mark_sectors_init(be->be_inval, isect,
						   PAGE_CACHE_SECTORS);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				wdata->pnfs_error = ret;
				goto out;
			}
		}
		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
					 isect, pages[i], be,
					 bl_end_io_write, par);
		if (IS_ERR(bio)) {
			wdata->pnfs_error = PTR_ERR(bio);
			bio = NULL;
			goto out;
		}
		isect += PAGE_CACHE_SECTORS;
		last_isect = isect;
		extent_length -= PAGE_CACHE_SECTORS;
	}

	/* Last page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		bio = bl_submit_bio(WRITE, bio);
		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
		npg_zero = npg_per_block - do_div(temp, npg_per_block);
		if (npg_zero < npg_per_block) {
			last = 1;
			goto fill_invalid_ext;
		}
	}

write_done:
	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count) {
		wdata->res.count = count;
	}
out:
	bl_put_extent(be);
	bl_submit_bio(WRITE, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;
out_mds:
	bl_put_extent(be);
	kfree(par);
	return PNFS_NOT_ATTEMPTED;
}

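/*
 * Note that the write path above can fall back to the MDS
 * (PNFS_NOT_ATTEMPTED) only before any bio has been submitted; once I/O
 * is in flight, errors are reported through wdata->pnfs_error so the
 * generic NFS code can redo the write without the layout.
 */
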
/* FIXME - range ignored */
static void
release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
{
	int i;
	struct pnfs_block_extent *be;

	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		while (!list_empty(&bl->bl_extents[i])) {
			be = list_first_entry(&bl->bl_extents[i],
					      struct pnfs_block_extent,
					      be_node);
			list_del(&be->be_node);
			bl_put_extent(be);
		}
	}
	spin_unlock(&bl->bl_ext_lock);
}

static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
	struct pnfs_inval_tracking *pos, *temp;
	struct pnfs_block_short_extent *se, *stemp;

	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
		list_del(&pos->it_link);
		kfree(pos);
	}

	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
		list_del(&se->bse_node);
		kfree(se);
	}
	return;
}

static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	release_extents(bl, NULL);
	release_inval_marks(&bl->bl_inval);
	kfree(bl);
}

static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	INIT_LIST_HEAD(&bl->bl_extents[0]);
	INIT_LIST_HEAD(&bl->bl_extents[1]);
	INIT_LIST_HEAD(&bl->bl_commit);
	INIT_LIST_HEAD(&bl->bl_committing);
	bl->bl_count = 0;
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}

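/*
 * struct pnfs_block_layout embeds the generic pnfs_layout_hdr as
 * bl_layout, so bl_alloc_layout_hdr() can return &bl->bl_layout above and
 * BLK_LO2EXT() can recover the block layout from a generic header.
 */
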
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* We pretty much ignore lseg, and store all data layout wide, so we
 * can correctly merge.
 */
static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
						 struct nfs4_layoutget_res *lgr,
						 gfp_t gfp_flags)
{
	struct pnfs_layout_segment *lseg;
	int status;

	dprintk("%s enter\n", __func__);
	lseg = kzalloc(sizeof(*lseg), gfp_flags);
	if (!lseg)
		return ERR_PTR(-ENOMEM);
	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
	if (status) {
		/* We don't want to call the full-blown bl_free_lseg,
		 * since on error extents were not touched.
		 */
		kfree(lseg);
		return ERR_PTR(status);
	}
	return lseg;
}

static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}

static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;

	dprintk("%s enter\n", __func__);
	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args,
				      lcdata->res.status);
}

static void free_blk_mountid(struct block_mount_id *mid)
{
	if (mid) {
		struct pnfs_block_dev *dev, *tmp;

		/* No need to take bm_lock as we are last user freeing bm_devlist */
		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
			list_del(&dev->bm_node);
			bl_free_block_dev(dev);
		}
		kfree(mid);
	}
}

/* This is mostly copied from the filelayout's get_device_info function.
 * It seems much of this should be at the generic pnfs level.
 */
static struct pnfs_block_dev *
nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
			struct nfs4_deviceid *d_id)
{
	struct pnfs_device *dev;
	struct pnfs_block_dev *rv;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	int i, rc;

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
	max_pages = max_resp_sz >> PAGE_SHIFT;
	dprintk("%s max_resp_sz %u max_pages %d\n",
		__func__, max_resp_sz, max_pages);

	dev = kmalloc(sizeof(*dev), GFP_NOFS);
	if (!dev) {
		dprintk("%s kmalloc failed\n", __func__);
		return ERR_PTR(-ENOMEM);
	}

	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
	if (pages == NULL) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}
	for (i = 0; i < max_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS);
		if (!pages[i]) {
			rv = ERR_PTR(-ENOMEM);
			goto out_free;
		}
	}

	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
	dev->layout_type = LAYOUT_BLOCK_VOLUME;
	dev->pages = pages;
	dev->pgbase = 0;
	dev->pglen = PAGE_SIZE * max_pages;
	dev->mincount = 0;

	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
	rc = nfs4_proc_getdeviceinfo(server, dev);
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc) {
		rv = ERR_PTR(rc);
		goto out_free;
	}

	rv = nfs4_blk_decode_device(server, dev);
out_free:
	for (i = 0; i < max_pages; i++)
		if (pages[i])	/* array may be only partially filled on error */
			__free_page(pages[i]);
	kfree(pages);
	kfree(dev);
	return rv;
}

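/*
 * A GETDEVICEINFO reply for a block volume may be larger than a single
 * page, which is why the buffer above is sized from the session's
 * negotiated maximum response size and passed to the XDR layer as an
 * array of pages.
 */
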
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
	struct block_mount_id *b_mt_id = NULL;
	struct pnfs_devicelist *dlist = NULL;
	struct pnfs_block_dev *bdev;
	LIST_HEAD(block_disklist);
	int status, i;

	dprintk("%s enter\n", __func__);

	if (server->pnfs_blksize == 0) {
		dprintk("%s Server did not return blksize\n", __func__);
		return -EINVAL;
	}
	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
	if (!b_mt_id) {
		status = -ENOMEM;
		goto out_error;
	}
	/* Initialize nfs4 block layout mount id */
	spin_lock_init(&b_mt_id->bm_lock);
	INIT_LIST_HEAD(&b_mt_id->bm_devlist);

	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
	if (!dlist) {
		status = -ENOMEM;
		goto out_error;
	}
	dlist->eof = 0;
	while (!dlist->eof) {
		status = nfs4_proc_getdevicelist(server, fh, dlist);
		if (status)
			goto out_error;
		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
			__func__, dlist->num_devs, dlist->eof);
		for (i = 0; i < dlist->num_devs; i++) {
			bdev = nfs4_blk_get_deviceinfo(server, fh,
						       &dlist->dev_id[i]);
			if (IS_ERR(bdev)) {
				status = PTR_ERR(bdev);
				goto out_error;
			}
			spin_lock(&b_mt_id->bm_lock);
			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
			spin_unlock(&b_mt_id->bm_lock);
		}
	}
	dprintk("%s SUCCESS\n", __func__);
	server->pnfs_ld_data = b_mt_id;

out_return:
	kfree(dlist);
	return status;

out_error:
	free_blk_mountid(b_mt_id);
	goto out_return;
}

static int
bl_clear_layoutdriver(struct nfs_server *server)
{
	struct block_mount_id *b_mt_id = server->pnfs_ld_data;

	dprintk("%s enter\n", __func__);
	free_blk_mountid(b_mt_id);
	dprintk("%s RETURNS\n", __func__);
	return 0;
}

static const struct nfs_pageio_ops bl_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};

static struct pnfs_layoutdriver_type blocklayout_type = {
	.id				= LAYOUT_BLOCK_VOLUME,
	.name				= "LAYOUT_BLOCK_VOLUME",
	.read_pagelist			= bl_read_pagelist,
	.write_pagelist			= bl_write_pagelist,
	.alloc_layout_hdr		= bl_alloc_layout_hdr,
	.free_layout_hdr		= bl_free_layout_hdr,
	.alloc_lseg			= bl_alloc_lseg,
	.free_lseg			= bl_free_lseg,
	.encode_layoutcommit		= bl_encode_layoutcommit,
	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
	.set_layoutdriver		= bl_set_layoutdriver,
	.clear_layoutdriver		= bl_clear_layoutdriver,
	.pg_read_ops			= &bl_pg_read_ops,
	.pg_write_ops			= &bl_pg_write_ops,
};

static const struct rpc_pipe_ops bl_upcall_ops = {
	.upcall		= rpc_pipe_generic_upcall,
	.downcall	= bl_pipe_downcall,
	.destroy_msg	= bl_pipe_destroy_msg,
};

static int __init nfs4blocklayout_init(void)
{
	struct vfsmount *mnt;
	struct path path;
	int ret;

	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);

	ret = pnfs_register_layoutdriver(&blocklayout_type);
	if (ret)
		goto out;

	init_waitqueue_head(&bl_wq);

	mnt = rpc_get_mount();
	if (IS_ERR(mnt)) {
		ret = PTR_ERR(mnt);
		goto out_remove;
	}

	ret = vfs_path_lookup(mnt->mnt_root,
			      mnt,
			      NFS_PIPE_DIRNAME, 0, &path);
	if (ret)
		goto out_putrpc;

	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
				    &bl_upcall_ops, 0);
	path_put(&path);
	if (IS_ERR(bl_device_pipe)) {
		ret = PTR_ERR(bl_device_pipe);
		goto out_putrpc;
	}
out:
	return ret;

out_putrpc:
	rpc_put_mount();
out_remove:
	pnfs_unregister_layoutdriver(&blocklayout_type);
	return ret;
}

static void __exit nfs4blocklayout_exit(void)
{
	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
		__func__);

	pnfs_unregister_layoutdriver(&blocklayout_type);
	rpc_unlink(bl_device_pipe);
	rpc_put_mount();
}

MODULE_ALIAS("nfs-layouttype4-3");

module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);