2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
4 * Device operations for the pnfs nfs4 file layout driver.
6 * Copyright (c) 2006 The Regents of the University of Michigan.
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
32 #include <linux/module.h>
33 #include <linux/buffer_head.h> /* __bread */
35 #include <linux/genhd.h>
36 #include <linux/blkdev.h>
37 #include <linux/hash.h>
39 #include "blocklayout.h"
41 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
 * decode_sector_number - read one 64-bit XDR value and store it as a
 * sector number.
 *
 * @rp: in/out cursor into the XDR buffer; it is replaced by the pointer
 *      that xdr_decode_hyper() returns.
 * @sp: receives the decoded value shifted right by SECTOR_SHIFT.
 *
 * A KERN_WARNING (sector not aligned) is printed on one path, and the
 * callers in this file treat a negative return value as failure.
 * NOTE(review): the alignment test itself, the declaration of s and the
 * return statements are not visible in this copy of the file -- confirm
 * against the full source.
 */
43 static int decode_sector_number(__be32
**rp
, sector_t
*sp
)
/* Decode into the local s and move the caller's cursor along. */
47 *rp
= xdr_decode_hyper(*rp
, &s
);
/* Diagnostic for a value that is not sector aligned. */
49 printk(KERN_WARNING
"%s: sector not aligned\n", __func__
);
/* Hand back the value with the low SECTOR_SHIFT bits dropped. */
52 *sp
= s
>> SECTOR_SHIFT
;
56 /* Open a block_device by device number. */
/*
 * nfs4_blkdev_get - open the block device identified by @dev.
 *
 * @dev: device number passed straight to blkdev_get_by_dev().
 *
 * The device is opened with FMODE_READ and a NULL third argument. The
 * failure path logs PTR_ERR(bd) through dprintk().
 * NOTE(review): the test that selects the failure path and both return
 * statements are not visible in this copy of the file, so the exact value
 * returned on failure must be confirmed against the full source.
 */
57 struct block_device
*nfs4_blkdev_get(dev_t dev
)
59 struct block_device
*bd
;
61 dprintk("%s enter\n", __func__
);
/* Same mode (FMODE_READ) is later handed to blkdev_put() on release. */
62 bd
= blkdev_get_by_dev(dev
, FMODE_READ
, NULL
);
/* Failure diagnostics: bd is decoded as an error pointer here. */
67 dprintk("%s failed to open device : %ld\n",
68 __func__
, PTR_ERR(bd
));
73 /* Release the block device. */
/*
 * nfs4_blkdev_put - release a block device.
 *
 * @bdev: device to release; bdev->bd_dev is read for the debug message.
 *
 * Returns whatever blkdev_put(bdev, FMODE_READ) returns. FMODE_READ is
 * the same mode nfs4_blkdev_get() passes to blkdev_get_by_dev().
 * NOTE(review): the last argument of the dprintk() call is not visible in
 * this copy of the file.
 */
75 int nfs4_blkdev_put(struct block_device
*bdev
)
77 dprintk("%s for device %d:%d\n", __func__
, MAJOR(bdev
->bd_dev
),
79 return blkdev_put(bdev
, FMODE_READ
);
83 /* Shouldn't there be an rpc_generic_upcall() to do this for us? */
/*
 * bl_pipe_upcall - copy the unread part of a pipe message to user space.
 *
 * @filp:   file the read arrived on (not referenced in the visible code).
 * @msg:    message being delivered; msg->data, msg->len and msg->copied
 *          are read.
 * @dst:    user-space destination buffer.
 * @buflen: capacity of @dst in bytes.
 *
 * The copy starts msg->copied bytes into msg->data and covers at most
 * min(msg->len - msg->copied, buflen) bytes.
 * NOTE(review): the declaration of left, the handling of a short copy,
 * the update of msg->copied and the return value are not visible in this
 * copy of the file -- confirm against the full source.
 */
85 ssize_t
bl_pipe_upcall(struct file
*filp
, struct rpc_pipe_msg
*msg
,
86 char __user
*dst
, size_t buflen
)
/* First byte that has not yet been handed to user space. */
88 char *data
= (char *)msg
->data
+ msg
->copied
;
/* Never copy more than what remains or more than the reader can take. */
89 size_t mlen
= min(msg
->len
- msg
->copied
, buflen
);
/* left receives the value returned by copy_to_user(). */
92 left
= copy_to_user(dst
, data
, mlen
);
/*
 * Single file-scope reply buffer: bl_pipe_downcall() fills it with
 * copy_from_user() and nfs4_blk_decode_device() reads it through its
 * local reply pointer.
 * NOTE(review): no lock protecting this object is visible in this file;
 * confirm that concurrent device upcalls are serialised elsewhere.
 */
104 static struct bl_dev_msg bl_mount_reply
;
/*
 * bl_pipe_downcall - receive the daemon's reply written to the pipe.
 *
 * @filp: file the write arrived on (not referenced in the visible code).
 * @src:  user-space buffer holding the reply.
 *
 * The write length mlen is first compared with sizeof(struct bl_dev_msg);
 * the payload is then copied from @src into the file-scope
 * bl_mount_reply, and a non-zero result from copy_from_user() is tested.
 * NOTE(review): the declaration of the mlen parameter, the statements
 * executed when either test fires, and the success path are not visible
 * in this copy of the file -- confirm against the full source.
 */
106 ssize_t
bl_pipe_downcall(struct file
*filp
, const char __user
*src
,
/* Length test against the fixed reply size. */
109 if (mlen
!= sizeof (struct bl_dev_msg
))
/* Pull the reply into the shared buffer; non-zero is tested for. */
112 if (copy_from_user(&bl_mount_reply
, src
, mlen
) != 0)
/*
 * bl_pipe_destroy_msg - pipe message destructor hook.
 *
 * @msg: the pipe message being destroyed.
 *
 * NOTE(review): only the signature survives in this copy of the file; the
 * body is not visible, so its behaviour must be documented from the full
 * source.
 */
120 void bl_pipe_destroy_msg(struct rpc_pipe_msg
*msg
)
128 /* Decodes pnfs_block_deviceaddr4, which is XDR encoded in dev->dev_addr_buf. */
/*
 * nfs4_blk_decode_device - resolve a pNFS block device through the
 * user-space daemon and open the resulting block device.
 *
 * @server: NFS server the device belongs to (not referenced in the
 *          visible code).
 * @dev:    device description; dev->mincount, dev->pages and dev->dev_id
 *          are read.
 *
 * Steps visible in this function:
 *   1. Allocate an upcall buffer of sizeof(bl_msg) + dev->mincount bytes
 *      with GFP_NOFS, place a struct bl_msg_hdr (type BL_DEVICE_MOUNT,
 *      totallen dev->mincount) at its start and copy the payload in from
 *      dev->pages.
 *   2. Add the current task to bl_wq, queue the message with
 *      rpc_queue_upcall() on bl_device_pipe->d_inode and wait.
 *   3. Check reply->status against BL_DEVICE_REQUEST_PROC.
 *   4. Open the device MKDEV(reply->major, reply->minor) with
 *      nfs4_blkdev_get().
 *   5. Allocate the struct pnfs_block_dev result and copy dev->dev_id
 *      into its bm_mdevid field.
 *
 * Return: rv. The visible error paths set it to ERR_PTR(-ENOMEM) for the
 * two allocations and ERR_PTR(-EINVAL) for a bad reply status.
 * NOTE(review): several declarations (dataptr, offset, len, i), the
 * failure tests, the jumps to cleanup, the cleanup code itself and the
 * final return are not visible in this copy of the file -- confirm
 * against the full source.
 */
130 struct pnfs_block_dev
*
131 nfs4_blk_decode_device(struct nfs_server
*server
,
132 struct pnfs_device
*dev
)
134 struct pnfs_block_dev
*rv
= NULL
;
135 struct block_device
*bd
= NULL
;
136 struct rpc_pipe_msg msg
;
/* Header that is copied to the front of the upcall payload. */
137 struct bl_msg_hdr bl_msg
= {
138 .type
= BL_DEVICE_MOUNT
,
139 .totallen
= dev
->mincount
,
142 DECLARE_WAITQUEUE(wq
, current
);
/* The daemon's answer is read from the file-scope bl_mount_reply. */
143 struct bl_dev_msg
*reply
= &bl_mount_reply
;
146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__
);
147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__
, dev
->dev_id
.data
,
/* Start from an all-zero message, then attach a zeroed data buffer. */
150 memset(&msg
, 0, sizeof(msg
));
151 msg
.data
= kzalloc(sizeof(bl_msg
) + dev
->mincount
, GFP_NOFS
);
/* Value stored in rv for the allocation-failure path. */
153 rv
= ERR_PTR(-ENOMEM
);
157 memcpy(msg
.data
, &bl_msg
, sizeof(bl_msg
));
158 dataptr
= (uint8_t *) msg
.data
;
/* Payload begins right after the header. */
160 offset
= sizeof(bl_msg
);
/*
 * Copy the payload page by page: a full PAGE_CACHE_SIZE chunk per page,
 * or just the remaining len bytes when less than a page is left.
 * NOTE(review): the initial value and type of len are not visible here;
 * the loop relies on len dropping to zero or below to terminate, which
 * needs a signed type -- confirm.
 */
161 for (i
= 0; len
> 0; i
++) {
162 memcpy(&dataptr
[offset
], page_address(dev
->pages
[i
]),
163 len
< PAGE_CACHE_SIZE
? len
: PAGE_CACHE_SIZE
);
164 len
-= PAGE_CACHE_SIZE
;
165 offset
+= PAGE_CACHE_SIZE
;
167 msg
.len
= sizeof(bl_msg
) + dev
->mincount
;
169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__
);
/*
 * The task is put on bl_wq before the upcall is queued.
 * NOTE(review): presumably so that a wake-up triggered by the reply
 * cannot be missed -- confirm.
 */
170 add_wait_queue(&bl_wq
, &wq
);
171 if (rpc_queue_upcall(bl_device_pipe
->d_inode
, &msg
) < 0) {
/* Queuing failed: leave the wait queue again. */
172 remove_wait_queue(&bl_wq
, &wq
);
/*
 * Wait for the reply.
 * NOTE(review): the call that actually sleeps between these two state
 * changes is not visible in this copy of the file.
 */
176 set_current_state(TASK_UNINTERRUPTIBLE
);
178 __set_current_state(TASK_RUNNING
);
179 remove_wait_queue(&bl_wq
, &wq
);
/* Any status other than BL_DEVICE_REQUEST_PROC is reported as -EINVAL. */
181 if (reply
->status
!= BL_DEVICE_REQUEST_PROC
) {
182 dprintk("%s failed to open device: %d\n",
183 __func__
, reply
->status
);
184 rv
= ERR_PTR(-EINVAL
);
/* Open the device whose major and minor numbers came back in the reply. */
188 bd
= nfs4_blkdev_get(MKDEV(reply
->major
, reply
->minor
));
190 dprintk("%s failed to open device : %ld\n",
191 __func__
, PTR_ERR(bd
));
/* Allocate the zero-filled result object. */
195 rv
= kzalloc(sizeof(*rv
), GFP_NOFS
);
197 rv
= ERR_PTR(-ENOMEM
);
/* Remember which server device id this block device was opened for. */
202 memcpy(&rv
->bm_mdevid
, &dev
->dev_id
, sizeof(struct nfs4_deviceid
));
203 dprintk("%s Created device %s with bd_block_size %u\n",
205 bd
->bd_disk
->disk_name
,
213 /* Map deviceid returned by the server to constructed block_device */
/*
 * translate_devid - look up the block device recorded for a device id.
 *
 * @lo: layout header; used in the debug message.
 * @id: device id to search for; id->data is compared byte-wise.
 *
 * Walks mid->bm_devlist while holding mid->bm_lock and compares
 * NFS4_DEVICEID4_SIZE bytes of id->data with each entry's
 * bm_mdevid.data. rv starts out NULL and its final value is logged
 * after the lock is dropped.
 * NOTE(review): how mid is derived, what is stored in rv on a match and
 * the return statement are not visible in this copy of the file --
 * confirm against the full source.
 */
214 static struct block_device
*translate_devid(struct pnfs_layout_hdr
*lo
,
215 struct nfs4_deviceid
*id
)
217 struct block_device
*rv
= NULL
;
218 struct block_mount_id
*mid
;
219 struct pnfs_block_dev
*dev
;
221 dprintk("%s enter, lo=%p, id=%p\n", __func__
, lo
, id
);
/* The device list is only walked with bm_lock held. */
223 spin_lock(&mid
->bm_lock
);
224 list_for_each_entry(dev
, &mid
->bm_devlist
, bm_node
) {
/* A match is a byte-for-byte equal device id. */
225 if (memcmp(id
->data
, dev
->bm_mdevid
.data
,
226 NFS4_DEVICEID4_SIZE
) == 0) {
232 spin_unlock(&mid
->bm_lock
);
233 dprintk("%s returning %p\n", __func__
, rv
);
237 /* Tracks info needed to ensure extents in layout obey constraints of spec */
/*
 * Running state threaded through verify_extent() while the extents of one
 * layout are decoded. nfs4_blk_process_layoutget() initialises mode from
 * lgr->range.iomode and start, inval and cowread from
 * lgr->range.offset >> SECTOR_SHIFT, so the three positions are kept in
 * sectors rather than bytes.
 */
238 struct layout_verification
{
239 u32 mode
; /* R or RW */
240 u64 start
; /* Expected start of next non-COW extent */
241 u64 inval
; /* Start of INVAL coverage */
242 u64 cowread
; /* End of COW read coverage */
245 /* Verify the extent meets the layout requirements of the pnfs-block draft. */
/*
 * verify_extent - check one decoded extent against the running layout
 * state and advance that state.
 *
 * @be: extent to check; be_state, be_f_offset and be_length are read.
 * @lv: verification state; start, inval and cowread are updated.
 *
 * Tests and updates visible in this function, by case:
 *   lv->mode == IOMODE_READ:
 *     - be_state is tested for READWRITE_DATA or INVALID_DATA;
 *     - be_f_offset is tested against lv->start;
 *     - lv->start advances by be_length.
 *   otherwise (read-write layout), by be_state:
 *     READWRITE_DATA: be_f_offset is tested against lv->start and
 *       lv->cowread against lv->start; lv->start advances by be_length
 *       and lv->inval is set to the new lv->start.
 *     INVALID_DATA: be_f_offset is tested against lv->start; lv->start
 *       advances by be_length.
 *     READ_DATA: be_f_offset is tested against lv->start, lv->inval and
 *       lv->cowread; lv->inval advances by be_length and lv->cowread
 *       becomes be_f_offset + be_length.
 *
 * The caller in this file treats a non-zero return as a verification
 * failure.
 * NOTE(review): the statements executed when a test fires and all return
 * statements are not visible in this copy of the file -- confirm against
 * the full source.
 */
248 static int verify_extent(struct pnfs_block_extent
*be
,
249 struct layout_verification
*lv
)
/* Read-only layout. */
251 if (lv
->mode
== IOMODE_READ
) {
252 if (be
->be_state
== PNFS_BLOCK_READWRITE_DATA
||
253 be
->be_state
== PNFS_BLOCK_INVALID_DATA
)
/* The extent is expected to begin exactly at lv->start. */
255 if (be
->be_f_offset
!= lv
->start
)
257 lv
->start
+= be
->be_length
;
260 /* lv->mode == IOMODE_RW */
261 if (be
->be_state
== PNFS_BLOCK_READWRITE_DATA
) {
262 if (be
->be_f_offset
!= lv
->start
)
264 if (lv
->cowread
> lv
->start
)
266 lv
->start
+= be
->be_length
;
/* INVAL coverage restarts at the new expected start. */
267 lv
->inval
= lv
->start
;
269 } else if (be
->be_state
== PNFS_BLOCK_INVALID_DATA
) {
270 if (be
->be_f_offset
!= lv
->start
)
272 lv
->start
+= be
->be_length
;
/* Read extents are checked against all three tracked positions. */
274 } else if (be
->be_state
== PNFS_BLOCK_READ_DATA
) {
275 if (be
->be_f_offset
> lv
->start
)
277 if (be
->be_f_offset
< lv
->inval
)
279 if (be
->be_f_offset
< lv
->cowread
)
281 /* It looks like you might want to min this with lv->start,
282 * but you really don't.
284 lv
->inval
= lv
->inval
+ be
->be_length
;
285 lv
->cowread
= be
->be_f_offset
+ be
->be_length
;
291 /* XDR decode pnfs_block_layout4 structure */
293 nfs4_blk_process_layoutget(struct pnfs_layout_hdr
*lo
,
294 struct nfs4_layoutget_res
*lgr
, gfp_t gfp_flags
)
296 struct pnfs_block_layout
*bl
= BLK_LO2EXT(lo
);
297 int i
, status
= -EIO
;
299 struct pnfs_block_extent
*be
= NULL
, *save
;
300 struct xdr_stream stream
;
302 struct page
*scratch
;
304 struct layout_verification lv
= {
305 .mode
= lgr
->range
.iomode
,
306 .start
= lgr
->range
.offset
>> SECTOR_SHIFT
,
307 .inval
= lgr
->range
.offset
>> SECTOR_SHIFT
,
308 .cowread
= lgr
->range
.offset
>> SECTOR_SHIFT
,
312 dprintk("---> %s\n", __func__
);
314 scratch
= alloc_page(gfp_flags
);
318 xdr_init_decode_pages(&stream
, &buf
, lgr
->layoutp
->pages
, lgr
->layoutp
->len
);
319 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
321 p
= xdr_inline_decode(&stream
, 4);
325 count
= be32_to_cpup(p
++);
327 dprintk("%s enter, number of extents %i\n", __func__
, count
);
328 p
= xdr_inline_decode(&stream
, (28 + NFS4_DEVICEID4_SIZE
) * count
);
332 /* Decode individual extents, putting them in temporary
333 * staging area until whole layout is decoded to make error
336 for (i
= 0; i
< count
; i
++) {
337 be
= bl_alloc_extent();
342 memcpy(&be
->be_devid
, p
, NFS4_DEVICEID4_SIZE
);
343 p
+= XDR_QUADLEN(NFS4_DEVICEID4_SIZE
);
344 be
->be_mdev
= translate_devid(lo
, &be
->be_devid
);
348 /* The next three values are read in as bytes,
349 * but stored as 512-byte sector lengths
351 if (decode_sector_number(&p
, &be
->be_f_offset
) < 0)
353 if (decode_sector_number(&p
, &be
->be_length
) < 0)
355 if (decode_sector_number(&p
, &be
->be_v_offset
) < 0)
357 be
->be_state
= be32_to_cpup(p
++);
358 if (be
->be_state
== PNFS_BLOCK_INVALID_DATA
)
359 be
->be_inval
= &bl
->bl_inval
;
360 if (verify_extent(be
, &lv
)) {
361 dprintk("%s verify failed\n", __func__
);
364 list_add_tail(&be
->be_node
, &extents
);
366 if (lgr
->range
.offset
+ lgr
->range
.length
!=
367 lv
.start
<< SECTOR_SHIFT
) {
368 dprintk("%s Final length mismatch\n", __func__
);
372 if (lv
.start
< lv
.cowread
) {
373 dprintk("%s Final uncovered COW extent\n", __func__
);
377 /* Extents decoded properly, now try to merge them in to
378 * existing layout extents.
380 spin_lock(&bl
->bl_ext_lock
);
381 list_for_each_entry_safe(be
, save
, &extents
, be_node
) {
382 list_del(&be
->be_node
);
383 status
= bl_add_merge_extent(bl
, be
);
385 spin_unlock(&bl
->bl_ext_lock
);
386 /* This is a fairly catastrophic error, as the
387 * entire layout extent lists are now corrupted.
388 * We should have some way to distinguish this.
394 spin_unlock(&bl
->bl_ext_lock
);
397 __free_page(scratch
);
398 dprintk("%s returns %i\n", __func__
, status
);
403 while (!list_empty(&extents
)) {
404 be
= list_first_entry(&extents
, struct pnfs_block_extent
,
406 list_del(&be
->be_node
);