2 * pNFS Objects layout driver high level definitions
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <scsi/osd_initiator.h>
41 #include "objlayout.h"
43 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
45 * Create a objlayout layout structure for the given inode and return it.
47 struct pnfs_layout_hdr
*
48 objlayout_alloc_layout_hdr(struct inode
*inode
, gfp_t gfp_flags
)
50 struct objlayout
*objlay
;
52 objlay
= kzalloc(sizeof(struct objlayout
), gfp_flags
);
54 spin_lock_init(&objlay
->lock
);
55 INIT_LIST_HEAD(&objlay
->err_list
);
57 dprintk("%s: Return %p\n", __func__
, objlay
);
58 return &objlay
->pnfs_layout
;
62 * Free an objlayout layout structure
65 objlayout_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
67 struct objlayout
*objlay
= OBJLAYOUT(lo
);
69 dprintk("%s: objlay %p\n", __func__
, objlay
);
71 WARN_ON(!list_empty(&objlay
->err_list
));
76 * Unmarshall layout and store it in pnfslay.
/*
 * Decode the layout carried in @lgr into a new layout segment for @pnfslay.
 *
 * An xdr_buf is pointed at lgr->layoutp (its pages and len), an xdr_stream
 * is initialised over it with one page from alloc_page() as scratch buffer,
 * and objio_alloc_lseg() is asked to build the segment for lgr->range.  The
 * scratch page is released with __free_page(); the error path returns
 * ERR_PTR(status).
 *
 * NOTE(review): the embedded line numbers skip (81-83, 90-91, 97-99,
 * 106-109, 113-115, ...), so the gfp_flags parameter, the declarations of
 * status and scratch, any check of the alloc_page() result and the success
 * return are not visible here -- presumably lseg is returned on success;
 * confirm against the full file.
 */
78 struct pnfs_layout_segment
*
79 objlayout_alloc_lseg(struct pnfs_layout_hdr
*pnfslay
,
80 struct nfs4_layoutget_res
*lgr
,
84 struct xdr_stream stream
;
85 struct xdr_buf buf
= {
86 .pages
= lgr
->layoutp
->pages
,
87 .page_len
= lgr
->layoutp
->len
,
88 .buflen
= lgr
->layoutp
->len
,
89 .len
= lgr
->layoutp
->len
,
92 struct pnfs_layout_segment
*lseg
;
94 dprintk("%s: Begin pnfslay %p\n", __func__
, pnfslay
);
/* one page of scratch space for the xdr decoder */
96 scratch
= alloc_page(gfp_flags
);
100 xdr_init_decode(&stream
, &buf
, NULL
);
101 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
103 status
= objio_alloc_lseg(&lseg
, pnfslay
, &lgr
->range
, &stream
, gfp_flags
);
104 if (unlikely(status
)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__
,
110 __free_page(scratch
);
112 dprintk("%s: Return %p\n", __func__
, lseg
);
116 __free_page(scratch
);
118 dprintk("%s: Err Return=>%d\n", __func__
, status
);
119 return ERR_PTR(status
);
123 * Free a layout segment
/*
 * Log @lseg and hand it to objio_free_lseg() for release.
 *
 * NOTE(review): embedded lines 129-132 are not visible here; there may be a
 * guard between the dprintk and the free -- confirm against the full file.
 */
126 objlayout_free_lseg(struct pnfs_layout_segment
*lseg
)
128 dprintk("%s: freeing layout segment %p\n", __func__
, lseg
);
133 objio_free_lseg(lseg
);
/*
 * Return the end of the range described by @start and @len.
 *
 * The result is end when end >= start, otherwise NFS4_MAX_UINT64.
 *
 * NOTE(review): the statement that computes end is not visible here;
 * presumably it is start + len and the comparison guards against
 * wrap-around -- confirm against the full file.
 */
140 end_offset(u64 start
, u64 len
)
145 return end
>= start
? end
: NFS4_MAX_UINT64
;
148 /* last octet in a range */
/*
 * Return the offset of the last byte of the range (@start, @len).
 *
 * The result is end - 1 when end > start, otherwise NFS4_MAX_UINT64.
 *
 * NOTE(review): the statement that computes end (and anything else on
 * embedded lines 151-155) is not visible here; presumably end is
 * start + len -- confirm against the full file.
 */
150 last_byte_offset(u64 start
, u64 len
)
156 return end
> start
? end
- 1 : NFS4_MAX_UINT64
;
159 void _fix_verify_io_params(struct pnfs_layout_segment
*lseg
,
160 struct page
***p_pages
, unsigned *p_pgbase
,
161 u64 offset
, unsigned long count
)
165 BUG_ON(offset
< lseg
->pls_range
.offset
);
166 lseg_end_offset
= end_offset(lseg
->pls_range
.offset
,
167 lseg
->pls_range
.length
);
168 BUG_ON(offset
>= lseg_end_offset
);
169 WARN_ON(offset
+ count
> lseg_end_offset
);
171 if (*p_pgbase
> PAGE_SIZE
) {
172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__
, *p_pgbase
);
173 *p_pages
+= *p_pgbase
>> PAGE_SHIFT
;
174 *p_pgbase
&= ~PAGE_MASK
;
179 * I/O done common code
182 objlayout_iodone(struct objlayout_io_res
*oir
)
184 if (likely(oir
->status
>= 0)) {
185 objio_free_result(oir
);
187 struct objlayout
*objlay
= oir
->objlay
;
189 spin_lock(&objlay
->lock
);
190 objlay
->delta_space_valid
= OBJ_DSU_INVALID
;
191 list_add(&objlay
->err_list
, &oir
->err_list
);
192 spin_unlock(&objlay
->lock
);
197 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
199 * The @index component IO failed (error returned from target). Register
200 * the error for later reporting at layout-return.
203 objlayout_io_set_result(struct objlayout_io_res
*oir
, unsigned index
,
204 struct pnfs_osd_objid
*pooid
, int osd_error
,
205 u64 offset
, u64 length
, bool is_write
)
207 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[index
];
209 BUG_ON(index
>= oir
->num_comps
);
211 ioerr
->oer_component
= *pooid
;
212 ioerr
->oer_comp_offset
= offset
;
213 ioerr
->oer_comp_length
= length
;
214 ioerr
->oer_iswrite
= is_write
;
215 ioerr
->oer_errno
= osd_error
;
217 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
218 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
219 __func__
, index
, ioerr
->oer_errno
,
221 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
222 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
223 ioerr
->oer_component
.oid_partition_id
,
224 ioerr
->oer_component
.oid_object_id
,
225 ioerr
->oer_comp_offset
,
226 ioerr
->oer_comp_length
);
228 /* User need not call if no error is reported */
229 ioerr
->oer_errno
= 0;
233 /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
234 * This is because the osd completion is called with ints-off from
237 static void _rpc_read_complete(struct work_struct
*work
)
239 struct rpc_task
*task
;
240 struct nfs_read_data
*rdata
;
242 dprintk("%s enter\n", __func__
);
243 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
244 rdata
= container_of(task
, struct nfs_read_data
, task
);
246 pnfs_ld_read_done(rdata
);
/*
 * Completion entry point for a read: record @status and notify the pNFS core.
 *
 * @oir:    I/O result; its rpcdata member is the nfs_read_data of the request
 * @status: stored in oir->status and in rdata->task.tk_status
 * @sync:   only logged in the lines visible here
 *
 * The visible lines also store @status in rdata->res.count and in
 * rdata->pnfs_error, call objlayout_iodone(oir) (oir must not be touched
 * afterwards), and then reach both pnfs_ld_read_done(rdata) and a
 * schedule_work() of _rpc_read_complete on rdata->task.u.tk_work.
 *
 * NOTE(review): embedded lines 255, 257, 265 and 267 are not visible here;
 * presumably they hold the conditions that choose between the count and
 * error stores and between the direct and the deferred completion --
 * confirm against the full file.
 */
250 objlayout_read_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
252 struct nfs_read_data
*rdata
= oir
->rpcdata
;
254 oir
->status
= rdata
->task
.tk_status
= status
;
256 rdata
->res
.count
= status
;
258 rdata
->pnfs_error
= status
;
259 objlayout_iodone(oir
);
260 /* must not use oir after this point */
262 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__
,
263 status
, rdata
->res
.eof
, sync
);
266 pnfs_ld_read_done(rdata
);
268 INIT_WORK(&rdata
->task
.u
.tk_work
, _rpc_read_complete
);
269 schedule_work(&rdata
->task
.u
.tk_work
);
274 * Perform sync or async reads.
/*
 * Start a read described by @rdata through the objio engine.
 *
 * The request range (rdata->args.offset, rdata->args.count) is compared
 * with the inode size from i_size_read().  When it runs past that size the
 * visible lines zero rdata->res.count and shorten the local count to
 * eof - offset.  rdata->res.eof records whether offset + count reaches the
 * inode size, the page parameters are passed through
 * _fix_verify_io_params(), and the I/O is issued with objio_read_pagelist().
 *
 * The error path stores err in rdata->pnfs_error and returns
 * PNFS_NOT_ATTEMPTED; otherwise PNFS_ATTEMPTED is returned.
 *
 * NOTE(review): embedded lines 281-283, 286-287, 289, 291-292, 298 and
 * 305-306 are not visible here (declarations, the inner condition around
 * the res.count store, one call argument and the test on err) -- confirm
 * against the full file.
 */
277 objlayout_read_pagelist(struct nfs_read_data
*rdata
)
279 loff_t offset
= rdata
->args
.offset
;
280 size_t count
= rdata
->args
.count
;
284 eof
= i_size_read(rdata
->inode
);
285 if (unlikely(offset
+ count
> eof
)) {
288 rdata
->res
.count
= 0;
290 /*FIXME: do we need to call pnfs_ld_read_done() */
293 count
= eof
- offset
;
296 rdata
->res
.eof
= (offset
+ count
) >= eof
;
297 _fix_verify_io_params(rdata
->lseg
, &rdata
->args
.pages
,
299 rdata
->args
.offset
, rdata
->args
.count
);
301 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
302 __func__
, rdata
->inode
->i_ino
, offset
, count
, rdata
->res
.eof
);
304 err
= objio_read_pagelist(rdata
);
307 rdata
->pnfs_error
= err
;
308 dprintk("%s: Returned Error %d\n", __func__
, err
);
309 return PNFS_NOT_ATTEMPTED
;
311 return PNFS_ATTEMPTED
;
314 /* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
315 * This is because the osd completion is called with ints-off from
318 static void _rpc_write_complete(struct work_struct
*work
)
320 struct rpc_task
*task
;
321 struct nfs_write_data
*wdata
;
323 dprintk("%s enter\n", __func__
);
324 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
325 wdata
= container_of(task
, struct nfs_write_data
, task
);
327 pnfs_ld_write_done(wdata
);
/*
 * Completion entry point for a write: record @status and notify the pNFS
 * core.
 *
 * @oir:    I/O result; its rpcdata member is the nfs_write_data of the
 *          request and its committed member is copied to
 *          wdata->verf.committed
 * @status: stored in oir->status and in wdata->task.tk_status
 * @sync:   only logged in the lines visible here
 *
 * The visible lines also store @status in wdata->res.count and in
 * wdata->pnfs_error, call objlayout_iodone(oir) (oir must not be touched
 * afterwards), and then reach both pnfs_ld_write_done(wdata) and a
 * schedule_work() of _rpc_write_complete on wdata->task.u.tk_work.
 *
 * NOTE(review): embedded lines 336, 339, 348 and 350 are not visible here;
 * presumably they hold the conditions that choose between the success and
 * error stores and between the direct and the deferred completion --
 * confirm against the full file.
 */
331 objlayout_write_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
333 struct nfs_write_data
*wdata
= oir
->rpcdata
;
335 oir
->status
= wdata
->task
.tk_status
= status
;
337 wdata
->res
.count
= status
;
338 wdata
->verf
.committed
= oir
->committed
;
340 wdata
->pnfs_error
= status
;
342 objlayout_iodone(oir
);
343 /* must not use oir after this point */
345 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__
,
346 status
, wdata
->verf
.committed
, sync
);
349 pnfs_ld_write_done(wdata
);
351 INIT_WORK(&wdata
->task
.u
.tk_work
, _rpc_write_complete
);
352 schedule_work(&wdata
->task
.u
.tk_work
);
357 * Perform sync or async writes.
/*
 * Start a write described by @wdata through the objio engine.
 *
 * The page parameters are passed through _fix_verify_io_params() with the
 * request's lseg, offset and count, then the I/O is issued with
 * objio_write_pagelist(wdata, how).
 *
 * The error path stores err in wdata->pnfs_error and returns
 * PNFS_NOT_ATTEMPTED; otherwise PNFS_ATTEMPTED is returned.
 *
 * NOTE(review): embedded lines 361-364, 366 and 370 are not visible here
 * (the how parameter, the declaration of err, one call argument and the
 * test on err) -- confirm against the full file.
 */
360 objlayout_write_pagelist(struct nfs_write_data
*wdata
,
365 _fix_verify_io_params(wdata
->lseg
, &wdata
->args
.pages
,
367 wdata
->args
.offset
, wdata
->args
.count
);
369 err
= objio_write_pagelist(wdata
, how
);
371 wdata
->pnfs_error
= err
;
372 dprintk("%s: Returned Error %d\n", __func__
, err
);
373 return PNFS_NOT_ATTEMPTED
;
375 return PNFS_ATTEMPTED
;
/*
 * Encode the layoutupdate body of a LAYOUTCOMMIT into @xdr.
 *
 * @pnfslay: layout header embedded in the objlayout being committed
 * @xdr:     stream the update is appended to
 * @args:    not referenced in the lines visible here
 *
 * Under objlay->lock the space accounting is snapshotted into lou
 * (dsu_valid is set when delta_space_valid == OBJ_DSU_VALID, dsu_delta is
 * delta_space_used) and then reset (delta_space_used = 0,
 * delta_space_valid = OBJ_DSU_INIT); olu_ioerr_flag records whether
 * err_list is non-empty.  Four bytes are reserved in @xdr, lou is encoded
 * after them, and the reserved word is then filled with
 * (xdr->p - start - 1) * 4 in big-endian form.
 *
 * NOTE(review): the encode call sits inside BUG_ON(), and no check of the
 * xdr_reserve_space() result is visible before *start is written; the
 * declaration of start (embedded lines 385-386) is not visible either --
 * confirm against the full file.
 */
379 objlayout_encode_layoutcommit(struct pnfs_layout_hdr
*pnfslay
,
380 struct xdr_stream
*xdr
,
381 const struct nfs4_layoutcommit_args
*args
)
383 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
384 struct pnfs_osd_layoutupdate lou
;
387 dprintk("%s: Begin\n", __func__
);
389 spin_lock(&objlay
->lock
);
390 lou
.dsu_valid
= (objlay
->delta_space_valid
== OBJ_DSU_VALID
);
391 lou
.dsu_delta
= objlay
->delta_space_used
;
392 objlay
->delta_space_used
= 0;
393 objlay
->delta_space_valid
= OBJ_DSU_INIT
;
394 lou
.olu_ioerr_flag
= !list_empty(&objlay
->err_list
);
395 spin_unlock(&objlay
->lock
);
/* placeholder word, filled in once the size of the update is known */
397 start
= xdr_reserve_space(xdr
, 4);
399 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr
, &lou
));
401 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
403 dprintk("%s: Return delta_space_used %lld err %d\n", __func__
,
404 lou
.dsu_delta
, lou
.olu_ioerr_flag
);
/*
 * Map a PNFS_OSD_ERR_* code to its OSD_ERR_PRI_* priority value.
 *
 * merge_ioerr() compares these values and keeps the errno whose priority
 * is greater.
 *
 * NOTE(review): embedded lines 409-413, 426-428 and 431 onwards are not
 * visible here, so the switch header, any case for 0 and any default label
 * cannot be described -- confirm against the full file.
 */
408 err_prio(u32 oer_errno
)
414 case PNFS_OSD_ERR_RESOURCE
:
415 return OSD_ERR_PRI_RESOURCE
;
416 case PNFS_OSD_ERR_BAD_CRED
:
417 return OSD_ERR_PRI_BAD_CRED
;
418 case PNFS_OSD_ERR_NO_ACCESS
:
419 return OSD_ERR_PRI_NO_ACCESS
;
420 case PNFS_OSD_ERR_UNREACHABLE
:
421 return OSD_ERR_PRI_UNREACHABLE
;
422 case PNFS_OSD_ERR_NOT_FOUND
:
423 return OSD_ERR_PRI_NOT_FOUND
;
424 case PNFS_OSD_ERR_NO_SPACE
:
425 return OSD_ERR_PRI_NO_SPACE
;
429 case PNFS_OSD_ERR_EIO
:
430 return OSD_ERR_PRI_EIO
;
435 merge_ioerr(struct pnfs_osd_ioerr
*dest_err
,
436 const struct pnfs_osd_ioerr
*src_err
)
438 u64 dest_end
, src_end
;
440 if (!dest_err
->oer_errno
) {
441 *dest_err
= *src_err
;
442 /* accumulated device must be blank */
443 memset(&dest_err
->oer_component
.oid_device_id
, 0,
444 sizeof(dest_err
->oer_component
.oid_device_id
));
449 if (dest_err
->oer_component
.oid_partition_id
!=
450 src_err
->oer_component
.oid_partition_id
)
451 dest_err
->oer_component
.oid_partition_id
= 0;
453 if (dest_err
->oer_component
.oid_object_id
!=
454 src_err
->oer_component
.oid_object_id
)
455 dest_err
->oer_component
.oid_object_id
= 0;
457 if (dest_err
->oer_comp_offset
> src_err
->oer_comp_offset
)
458 dest_err
->oer_comp_offset
= src_err
->oer_comp_offset
;
460 dest_end
= end_offset(dest_err
->oer_comp_offset
,
461 dest_err
->oer_comp_length
);
462 src_end
= end_offset(src_err
->oer_comp_offset
,
463 src_err
->oer_comp_length
);
464 if (dest_end
< src_end
)
467 dest_err
->oer_comp_length
= dest_end
- dest_err
->oer_comp_offset
;
469 if ((src_err
->oer_iswrite
== dest_err
->oer_iswrite
) &&
470 (err_prio(src_err
->oer_errno
) > err_prio(dest_err
->oer_errno
))) {
471 dest_err
->oer_errno
= src_err
->oer_errno
;
472 } else if (src_err
->oer_iswrite
) {
473 dest_err
->oer_iswrite
= true;
474 dest_err
->oer_errno
= src_err
->oer_errno
;
479 encode_accumulated_error(struct objlayout
*objlay
, __be32
*p
)
481 struct objlayout_io_res
*oir
, *tmp
;
482 struct pnfs_osd_ioerr accumulated_err
= {.oer_errno
= 0};
484 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
487 for (i
= 0; i
< oir
->num_comps
; i
++) {
488 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
490 if (!ioerr
->oer_errno
)
493 printk(KERN_ERR
"%s: err[%d]: errno=%d is_write=%d "
494 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
495 "offset=0x%llx length=0x%llx\n",
496 __func__
, i
, ioerr
->oer_errno
,
498 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
499 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
500 ioerr
->oer_component
.oid_partition_id
,
501 ioerr
->oer_component
.oid_object_id
,
502 ioerr
->oer_comp_offset
,
503 ioerr
->oer_comp_length
);
505 merge_ioerr(&accumulated_err
, ioerr
);
507 list_del(&oir
->err_list
);
508 objio_free_result(oir
);
511 pnfs_osd_xdr_encode_ioerr(p
, &accumulated_err
);
/*
 * Encode the I/O errors queued on the layout into a LAYOUTRETURN body.
 *
 * @pnfslay: layout header embedded in the objlayout being returned
 * @xdr:     stream the error descriptors are appended to
 * @args:    not referenced in the lines visible here
 *
 * Four bytes are reserved in @xdr first.  Under objlay->lock every result
 * on err_list is walked component by component: for each component whose
 * oer_errno is checked, space is requested with
 * pnfs_osd_xdr_ioerr_reserve_space() and the descriptor is written with
 * pnfs_osd_xdr_encode_ioerr().  The visible lines also call
 * encode_accumulated_error(objlay, last_xdr), and unlink and free each
 * result with list_del() / objio_free_result().  After the lock is dropped
 * the reserved word is filled with (xdr->p - start - 1) * 4 in big-endian
 * form.
 *
 * NOTE(review): many embedded lines are not visible here (521-522, 525-526,
 * 531-533, 538-539, 544, 553-554, 556-558, 563, 565-566, 570-571, 573-574,
 * ...), so the declarations of start and i, the statement taken when
 * oer_errno is zero, the test of p, the assignment of last_xdr and the
 * condition guarding encode_accumulated_error() cannot be described.  The
 * dprintk format has ten conversions while nine arguments are visible.
 * last_xdr is declared inside the list loop, so it starts as NULL for every
 * queued result.  Confirm all of this against the full file.
 */
515 objlayout_encode_layoutreturn(struct pnfs_layout_hdr
*pnfslay
,
516 struct xdr_stream
*xdr
,
517 const struct nfs4_layoutreturn_args
*args
)
519 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
520 struct objlayout_io_res
*oir
, *tmp
;
523 dprintk("%s: Begin\n", __func__
);
524 start
= xdr_reserve_space(xdr
, 4);
527 spin_lock(&objlay
->lock
);
529 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
530 __be32
*last_xdr
= NULL
, *p
;
534 for (i
= 0; i
< oir
->num_comps
; i
++) {
535 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
537 if (!ioerr
->oer_errno
)
540 dprintk("%s: err[%d]: errno=%d is_write=%d "
541 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
542 "offset=0x%llx length=0x%llx\n",
543 __func__
, i
, ioerr
->oer_errno
,
545 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
546 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
547 ioerr
->oer_component
.oid_partition_id
,
548 ioerr
->oer_component
.oid_object_id
,
549 ioerr
->oer_comp_offset
,
550 ioerr
->oer_comp_length
);
552 p
= pnfs_osd_xdr_ioerr_reserve_space(xdr
);
555 break; /* accumulated_error */
559 pnfs_osd_xdr_encode_ioerr(p
, &oir
->ioerrs
[i
]);
562 /* TODO: use xdr_write_pages */
564 /* no space for even one error descriptor */
567 /* we've encountered a situation with lots and lots of
568 * errors and no space to encode them all. Use the last
569 * available slot to report the union of all the
572 encode_accumulated_error(objlay
, last_xdr
);
575 list_del(&oir
->err_list
);
576 objio_free_result(oir
);
579 spin_unlock(&objlay
->lock
);
581 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
582 dprintk("%s: Return\n", __func__
);
587 * Get Device Info API for io engines
/*
 * Container allocated by objlayout_get_deviceinfo().  Callers receive a
 * pointer to the da member, and objlayout_put_deviceinfo() recovers the
 * container from that pointer with container_of().
 *
 * NOTE(review): embedded line 590 is not visible here; the put path reads
 * odi->page, so a page member is presumably declared there -- confirm
 * against the full file.
 */
589 struct objlayout_deviceinfo
{
591 struct pnfs_osd_deviceaddr da
; /* This must be last */
594 /* Initialize and call nfs_getdeviceinfo, then decode and return a
595 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
/*
 * Fetch and decode the device address for @d_id.
 *
 * @pnfslay:    layout whose inode selects the server (NFS_SERVER()) queried
 * @d_id:       device id, copied into pd.dev_id
 * @deviceaddr: out, set to the da member of a newly allocated
 *              objlayout_deviceinfo
 *
 * One page is allocated to receive the reply, a pnfs_device request is
 * filled in (layout_type LAYOUT_OSD2_OBJECTS, pglen PAGE_SIZE) and sent
 * with nfs4_proc_getdeviceinfo().  The page contents are then decoded with
 * pnfs_osd_xdr_decode_deviceaddr() into a kzalloc()'d objlayout_deviceinfo.
 *
 * NOTE(review): many embedded lines are not visible here (600-601, 606-608,
 * 610-615, 618-619, 621-622, 626-628, 631-634, 636, 638 onwards), so the
 * gfp_flags parameter, the declarations of p and err, the remaining pd
 * fields, every error check and the return values cannot be described --
 * confirm against the full file.
 */
598 int objlayout_get_deviceinfo(struct pnfs_layout_hdr
*pnfslay
,
599 struct nfs4_deviceid
*d_id
, struct pnfs_osd_deviceaddr
**deviceaddr
,
602 struct objlayout_deviceinfo
*odi
;
603 struct pnfs_device pd
;
604 struct super_block
*sb
;
605 struct page
*page
, **pages
;
609 page
= alloc_page(gfp_flags
);
616 memcpy(&pd
.dev_id
, d_id
, sizeof(*d_id
));
617 pd
.layout_type
= LAYOUT_OSD2_OBJECTS
;
620 pd
.pglen
= PAGE_SIZE
;
623 sb
= pnfslay
->plh_inode
->i_sb
;
624 err
= nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay
->plh_inode
), &pd
);
625 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__
, err
);
629 p
= page_address(page
);
630 odi
= kzalloc(sizeof(*odi
), gfp_flags
);
635 pnfs_osd_xdr_decode_deviceaddr(&odi
->da
, p
);
637 *deviceaddr
= &odi
->da
;
645 void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr
*deviceaddr
)
647 struct objlayout_deviceinfo
*odi
= container_of(deviceaddr
,
648 struct objlayout_deviceinfo
,
651 __free_page(odi
->page
);