2 * pNFS Objects layout driver high level definitions
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <scsi/osd_initiator.h>
41 #include "objlayout.h"
43 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
45 * Create a objlayout layout structure for the given inode and return it.
47 struct pnfs_layout_hdr
*
48 objlayout_alloc_layout_hdr(struct inode
*inode
, gfp_t gfp_flags
)
50 struct objlayout
*objlay
;
52 objlay
= kzalloc(sizeof(struct objlayout
), gfp_flags
);
54 spin_lock_init(&objlay
->lock
);
55 INIT_LIST_HEAD(&objlay
->err_list
);
57 dprintk("%s: Return %p\n", __func__
, objlay
);
58 return &objlay
->pnfs_layout
;
62 * Free an objlayout layout structure
65 objlayout_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
67 struct objlayout
*objlay
= OBJLAYOUT(lo
);
69 dprintk("%s: objlay %p\n", __func__
, objlay
);
71 WARN_ON(!list_empty(&objlay
->err_list
));
76 * Unmarshall layout and store it in pnfslay.
78 struct pnfs_layout_segment
*
79 objlayout_alloc_lseg(struct pnfs_layout_hdr
*pnfslay
,
80 struct nfs4_layoutget_res
*lgr
,
84 struct xdr_stream stream
;
85 struct xdr_buf buf
= {
86 .pages
= lgr
->layoutp
->pages
,
87 .page_len
= lgr
->layoutp
->len
,
88 .buflen
= lgr
->layoutp
->len
,
89 .len
= lgr
->layoutp
->len
,
92 struct pnfs_layout_segment
*lseg
;
94 dprintk("%s: Begin pnfslay %p\n", __func__
, pnfslay
);
96 scratch
= alloc_page(gfp_flags
);
100 xdr_init_decode(&stream
, &buf
, NULL
);
101 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
103 status
= objio_alloc_lseg(&lseg
, pnfslay
, &lgr
->range
, &stream
, gfp_flags
);
104 if (unlikely(status
)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__
,
110 __free_page(scratch
);
112 dprintk("%s: Return %p\n", __func__
, lseg
);
116 __free_page(scratch
);
118 dprintk("%s: Err Return=>%d\n", __func__
, status
);
119 return ERR_PTR(status
);
/*
 * Free a layout segment
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	/* Tolerate a NULL segment; nothing to release */
	if (unlikely(!lseg))
		return;

	objio_free_lseg(lseg);
}
140 end_offset(u64 start
, u64 len
)
145 return end
>= start
? end
: NFS4_MAX_UINT64
;
148 /* last octet in a range */
150 last_byte_offset(u64 start
, u64 len
)
156 return end
> start
? end
- 1 : NFS4_MAX_UINT64
;
159 void _fix_verify_io_params(struct pnfs_layout_segment
*lseg
,
160 struct page
***p_pages
, unsigned *p_pgbase
,
161 u64 offset
, unsigned long count
)
165 BUG_ON(offset
< lseg
->pls_range
.offset
);
166 lseg_end_offset
= end_offset(lseg
->pls_range
.offset
,
167 lseg
->pls_range
.length
);
168 BUG_ON(offset
>= lseg_end_offset
);
169 WARN_ON(offset
+ count
> lseg_end_offset
);
171 if (*p_pgbase
> PAGE_SIZE
) {
172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__
, *p_pgbase
);
173 *p_pages
+= *p_pgbase
>> PAGE_SHIFT
;
174 *p_pgbase
&= ~PAGE_MASK
;
179 * I/O done common code
182 objlayout_iodone(struct objlayout_io_res
*oir
)
184 if (likely(oir
->status
>= 0)) {
185 objio_free_result(oir
);
187 struct objlayout
*objlay
= oir
->objlay
;
189 spin_lock(&objlay
->lock
);
190 objlay
->delta_space_valid
= OBJ_DSU_INVALID
;
191 list_add(&objlay
->err_list
, &oir
->err_list
);
192 spin_unlock(&objlay
->lock
);
197 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
199 * The @index component IO failed (error returned from target). Register
200 * the error for later reporting at layout-return.
203 objlayout_io_set_result(struct objlayout_io_res
*oir
, unsigned index
,
204 struct pnfs_osd_objid
*pooid
, int osd_error
,
205 u64 offset
, u64 length
, bool is_write
)
207 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[index
];
209 BUG_ON(index
>= oir
->num_comps
);
211 ioerr
->oer_component
= *pooid
;
212 ioerr
->oer_comp_offset
= offset
;
213 ioerr
->oer_comp_length
= length
;
214 ioerr
->oer_iswrite
= is_write
;
215 ioerr
->oer_errno
= osd_error
;
217 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
218 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
219 __func__
, index
, ioerr
->oer_errno
,
221 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
222 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
223 ioerr
->oer_component
.oid_partition_id
,
224 ioerr
->oer_component
.oid_object_id
,
225 ioerr
->oer_comp_offset
,
226 ioerr
->oer_comp_length
);
228 /* User need not call if no error is reported */
229 ioerr
->oer_errno
= 0;
233 /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
234 * This is because the osd completion is called with ints-off from
237 static void _rpc_read_complete(struct work_struct
*work
)
239 struct rpc_task
*task
;
240 struct nfs_read_data
*rdata
;
242 dprintk("%s enter\n", __func__
);
243 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
244 rdata
= container_of(task
, struct nfs_read_data
, task
);
246 pnfs_ld_read_done(rdata
);
250 objlayout_read_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
252 struct nfs_read_data
*rdata
= oir
->rpcdata
;
254 oir
->status
= rdata
->task
.tk_status
= status
;
256 rdata
->res
.count
= status
;
257 objlayout_iodone(oir
);
258 /* must not use oir after this point */
260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__
,
261 status
, rdata
->res
.eof
, sync
);
264 pnfs_ld_read_done(rdata
);
266 INIT_WORK(&rdata
->task
.u
.tk_work
, _rpc_read_complete
);
267 schedule_work(&rdata
->task
.u
.tk_work
);
272 * Perform sync or async reads.
275 objlayout_read_pagelist(struct nfs_read_data
*rdata
)
277 loff_t offset
= rdata
->args
.offset
;
278 size_t count
= rdata
->args
.count
;
282 eof
= i_size_read(rdata
->inode
);
283 if (unlikely(offset
+ count
> eof
)) {
286 rdata
->res
.count
= 0;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
291 count
= eof
- offset
;
294 rdata
->res
.eof
= (offset
+ count
) >= eof
;
295 _fix_verify_io_params(rdata
->lseg
, &rdata
->args
.pages
,
297 rdata
->args
.offset
, rdata
->args
.count
);
299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__
, rdata
->inode
->i_ino
, offset
, count
, rdata
->res
.eof
);
302 err
= objio_read_pagelist(rdata
);
305 rdata
->pnfs_error
= err
;
306 dprintk("%s: Returned Error %d\n", __func__
, err
);
307 return PNFS_NOT_ATTEMPTED
;
309 return PNFS_ATTEMPTED
;
312 /* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
313 * This is because the osd completion is called with ints-off from
316 static void _rpc_write_complete(struct work_struct
*work
)
318 struct rpc_task
*task
;
319 struct nfs_write_data
*wdata
;
321 dprintk("%s enter\n", __func__
);
322 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
323 wdata
= container_of(task
, struct nfs_write_data
, task
);
325 pnfs_ld_write_done(wdata
);
329 objlayout_write_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
331 struct nfs_write_data
*wdata
= oir
->rpcdata
;
333 oir
->status
= wdata
->task
.tk_status
= status
;
335 wdata
->res
.count
= status
;
336 wdata
->verf
.committed
= oir
->committed
;
338 objlayout_iodone(oir
);
339 /* must not use oir after this point */
341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__
,
342 status
, wdata
->verf
.committed
, sync
);
345 pnfs_ld_write_done(wdata
);
347 INIT_WORK(&wdata
->task
.u
.tk_work
, _rpc_write_complete
);
348 schedule_work(&wdata
->task
.u
.tk_work
);
353 * Perform sync or async writes.
356 objlayout_write_pagelist(struct nfs_write_data
*wdata
,
361 _fix_verify_io_params(wdata
->lseg
, &wdata
->args
.pages
,
363 wdata
->args
.offset
, wdata
->args
.count
);
365 err
= objio_write_pagelist(wdata
, how
);
367 wdata
->pnfs_error
= err
;
368 dprintk("%s: Returned Error %d\n", __func__
, err
);
369 return PNFS_NOT_ATTEMPTED
;
371 return PNFS_ATTEMPTED
;
375 objlayout_encode_layoutcommit(struct pnfs_layout_hdr
*pnfslay
,
376 struct xdr_stream
*xdr
,
377 const struct nfs4_layoutcommit_args
*args
)
379 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
380 struct pnfs_osd_layoutupdate lou
;
383 dprintk("%s: Begin\n", __func__
);
385 spin_lock(&objlay
->lock
);
386 lou
.dsu_valid
= (objlay
->delta_space_valid
== OBJ_DSU_VALID
);
387 lou
.dsu_delta
= objlay
->delta_space_used
;
388 objlay
->delta_space_used
= 0;
389 objlay
->delta_space_valid
= OBJ_DSU_INIT
;
390 lou
.olu_ioerr_flag
= !list_empty(&objlay
->err_list
);
391 spin_unlock(&objlay
->lock
);
393 start
= xdr_reserve_space(xdr
, 4);
395 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr
, &lou
));
397 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
399 dprintk("%s: Return delta_space_used %lld err %d\n", __func__
,
400 lou
.dsu_delta
, lou
.olu_ioerr_flag
);
404 err_prio(u32 oer_errno
)
410 case PNFS_OSD_ERR_RESOURCE
:
411 return OSD_ERR_PRI_RESOURCE
;
412 case PNFS_OSD_ERR_BAD_CRED
:
413 return OSD_ERR_PRI_BAD_CRED
;
414 case PNFS_OSD_ERR_NO_ACCESS
:
415 return OSD_ERR_PRI_NO_ACCESS
;
416 case PNFS_OSD_ERR_UNREACHABLE
:
417 return OSD_ERR_PRI_UNREACHABLE
;
418 case PNFS_OSD_ERR_NOT_FOUND
:
419 return OSD_ERR_PRI_NOT_FOUND
;
420 case PNFS_OSD_ERR_NO_SPACE
:
421 return OSD_ERR_PRI_NO_SPACE
;
425 case PNFS_OSD_ERR_EIO
:
426 return OSD_ERR_PRI_EIO
;
431 merge_ioerr(struct pnfs_osd_ioerr
*dest_err
,
432 const struct pnfs_osd_ioerr
*src_err
)
434 u64 dest_end
, src_end
;
436 if (!dest_err
->oer_errno
) {
437 *dest_err
= *src_err
;
438 /* accumulated device must be blank */
439 memset(&dest_err
->oer_component
.oid_device_id
, 0,
440 sizeof(dest_err
->oer_component
.oid_device_id
));
445 if (dest_err
->oer_component
.oid_partition_id
!=
446 src_err
->oer_component
.oid_partition_id
)
447 dest_err
->oer_component
.oid_partition_id
= 0;
449 if (dest_err
->oer_component
.oid_object_id
!=
450 src_err
->oer_component
.oid_object_id
)
451 dest_err
->oer_component
.oid_object_id
= 0;
453 if (dest_err
->oer_comp_offset
> src_err
->oer_comp_offset
)
454 dest_err
->oer_comp_offset
= src_err
->oer_comp_offset
;
456 dest_end
= end_offset(dest_err
->oer_comp_offset
,
457 dest_err
->oer_comp_length
);
458 src_end
= end_offset(src_err
->oer_comp_offset
,
459 src_err
->oer_comp_length
);
460 if (dest_end
< src_end
)
463 dest_err
->oer_comp_length
= dest_end
- dest_err
->oer_comp_offset
;
465 if ((src_err
->oer_iswrite
== dest_err
->oer_iswrite
) &&
466 (err_prio(src_err
->oer_errno
) > err_prio(dest_err
->oer_errno
))) {
467 dest_err
->oer_errno
= src_err
->oer_errno
;
468 } else if (src_err
->oer_iswrite
) {
469 dest_err
->oer_iswrite
= true;
470 dest_err
->oer_errno
= src_err
->oer_errno
;
475 encode_accumulated_error(struct objlayout
*objlay
, __be32
*p
)
477 struct objlayout_io_res
*oir
, *tmp
;
478 struct pnfs_osd_ioerr accumulated_err
= {.oer_errno
= 0};
480 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
483 for (i
= 0; i
< oir
->num_comps
; i
++) {
484 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
486 if (!ioerr
->oer_errno
)
489 printk(KERN_ERR
"%s: err[%d]: errno=%d is_write=%d "
490 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
491 "offset=0x%llx length=0x%llx\n",
492 __func__
, i
, ioerr
->oer_errno
,
494 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
495 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
496 ioerr
->oer_component
.oid_partition_id
,
497 ioerr
->oer_component
.oid_object_id
,
498 ioerr
->oer_comp_offset
,
499 ioerr
->oer_comp_length
);
501 merge_ioerr(&accumulated_err
, ioerr
);
503 list_del(&oir
->err_list
);
504 objio_free_result(oir
);
507 pnfs_osd_xdr_encode_ioerr(p
, &accumulated_err
);
511 objlayout_encode_layoutreturn(struct pnfs_layout_hdr
*pnfslay
,
512 struct xdr_stream
*xdr
,
513 const struct nfs4_layoutreturn_args
*args
)
515 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
516 struct objlayout_io_res
*oir
, *tmp
;
519 dprintk("%s: Begin\n", __func__
);
520 start
= xdr_reserve_space(xdr
, 4);
523 spin_lock(&objlay
->lock
);
525 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
526 __be32
*last_xdr
= NULL
, *p
;
530 for (i
= 0; i
< oir
->num_comps
; i
++) {
531 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
533 if (!ioerr
->oer_errno
)
536 dprintk("%s: err[%d]: errno=%d is_write=%d "
537 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
538 "offset=0x%llx length=0x%llx\n",
539 __func__
, i
, ioerr
->oer_errno
,
541 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
542 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
543 ioerr
->oer_component
.oid_partition_id
,
544 ioerr
->oer_component
.oid_object_id
,
545 ioerr
->oer_comp_offset
,
546 ioerr
->oer_comp_length
);
548 p
= pnfs_osd_xdr_ioerr_reserve_space(xdr
);
551 break; /* accumulated_error */
555 pnfs_osd_xdr_encode_ioerr(p
, &oir
->ioerrs
[i
]);
558 /* TODO: use xdr_write_pages */
560 /* no space for even one error descriptor */
563 /* we've encountered a situation with lots and lots of
564 * errors and no space to encode them all. Use the last
565 * available slot to report the union of all the
568 encode_accumulated_error(objlay
, last_xdr
);
571 list_del(&oir
->err_list
);
572 objio_free_result(oir
);
575 spin_unlock(&objlay
->lock
);
577 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
578 dprintk("%s: Return\n", __func__
);
583 * Get Device Info API for io engines
585 struct objlayout_deviceinfo
{
587 struct pnfs_osd_deviceaddr da
; /* This must be last */
590 /* Initialize and call nfs_getdeviceinfo, then decode and return a
591 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
594 int objlayout_get_deviceinfo(struct pnfs_layout_hdr
*pnfslay
,
595 struct nfs4_deviceid
*d_id
, struct pnfs_osd_deviceaddr
**deviceaddr
,
598 struct objlayout_deviceinfo
*odi
;
599 struct pnfs_device pd
;
600 struct super_block
*sb
;
601 struct page
*page
, **pages
;
605 page
= alloc_page(gfp_flags
);
612 memcpy(&pd
.dev_id
, d_id
, sizeof(*d_id
));
613 pd
.layout_type
= LAYOUT_OSD2_OBJECTS
;
616 pd
.pglen
= PAGE_SIZE
;
619 sb
= pnfslay
->plh_inode
->i_sb
;
620 err
= nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay
->plh_inode
), &pd
);
621 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__
, err
);
625 p
= page_address(page
);
626 odi
= kzalloc(sizeof(*odi
), gfp_flags
);
631 pnfs_osd_xdr_decode_deviceaddr(&odi
->da
, p
);
633 *deviceaddr
= &odi
->da
;
641 void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr
*deviceaddr
)
643 struct objlayout_deviceinfo
*odi
= container_of(deviceaddr
,
644 struct objlayout_deviceinfo
,
647 __free_page(odi
->page
);