2 * pNFS Objects layout driver high level definitions
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <scsi/osd_initiator.h>
41 #include "objlayout.h"
43 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
45 * Create a objlayout layout structure for the given inode and return it.
47 struct pnfs_layout_hdr
*
48 objlayout_alloc_layout_hdr(struct inode
*inode
, gfp_t gfp_flags
)
50 struct objlayout
*objlay
;
52 objlay
= kzalloc(sizeof(struct objlayout
), gfp_flags
);
54 spin_lock_init(&objlay
->lock
);
55 INIT_LIST_HEAD(&objlay
->err_list
);
57 dprintk("%s: Return %p\n", __func__
, objlay
);
58 return &objlay
->pnfs_layout
;
62 * Free an objlayout layout structure
65 objlayout_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
67 struct objlayout
*objlay
= OBJLAYOUT(lo
);
69 dprintk("%s: objlay %p\n", __func__
, objlay
);
71 WARN_ON(!list_empty(&objlay
->err_list
));
76 * Unmarshall layout and store it in pnfslay.
78 struct pnfs_layout_segment
*
79 objlayout_alloc_lseg(struct pnfs_layout_hdr
*pnfslay
,
80 struct nfs4_layoutget_res
*lgr
,
84 struct xdr_stream stream
;
85 struct xdr_buf buf
= {
86 .pages
= lgr
->layoutp
->pages
,
87 .page_len
= lgr
->layoutp
->len
,
88 .buflen
= lgr
->layoutp
->len
,
89 .len
= lgr
->layoutp
->len
,
92 struct pnfs_layout_segment
*lseg
;
94 dprintk("%s: Begin pnfslay %p\n", __func__
, pnfslay
);
96 scratch
= alloc_page(gfp_flags
);
100 xdr_init_decode(&stream
, &buf
, NULL
);
101 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
103 status
= objio_alloc_lseg(&lseg
, pnfslay
, &lgr
->range
, &stream
, gfp_flags
);
104 if (unlikely(status
)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__
,
110 __free_page(scratch
);
112 dprintk("%s: Return %p\n", __func__
, lseg
);
116 __free_page(scratch
);
118 dprintk("%s: Err Return=>%d\n", __func__
, status
);
119 return ERR_PTR(status
);
/*
 * Free a layout segment
 *
 * @lseg: segment to release; a NULL pointer is only logged
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (unlikely(!lseg))
		return;

	objio_free_lseg(lseg);
}
140 end_offset(u64 start
, u64 len
)
145 return end
>= start
? end
: NFS4_MAX_UINT64
;
148 /* last octet in a range */
150 last_byte_offset(u64 start
, u64 len
)
156 return end
> start
? end
- 1 : NFS4_MAX_UINT64
;
159 static struct objlayout_io_state
*
160 objlayout_alloc_io_state(struct pnfs_layout_hdr
*pnfs_layout_type
,
165 struct pnfs_layout_segment
*lseg
,
169 struct objlayout_io_state
*state
;
172 dprintk("%s: allocating io_state\n", __func__
);
173 if (objio_alloc_io_state(lseg
, &state
, gfp_flags
))
176 BUG_ON(offset
< lseg
->pls_range
.offset
);
177 lseg_end_offset
= end_offset(lseg
->pls_range
.offset
,
178 lseg
->pls_range
.length
);
179 BUG_ON(offset
>= lseg_end_offset
);
180 if (offset
+ count
> lseg_end_offset
) {
181 count
= lseg
->pls_range
.length
-
182 (offset
- lseg
->pls_range
.offset
);
183 dprintk("%s: truncated count %Zd\n", __func__
, count
);
186 if (pgbase
> PAGE_SIZE
) {
187 pages
+= pgbase
>> PAGE_SHIFT
;
188 pgbase
&= ~PAGE_MASK
;
191 INIT_LIST_HEAD(&state
->err_list
);
193 state
->rpcdata
= rpcdata
;
194 state
->pages
= pages
;
195 state
->pgbase
= pgbase
;
196 state
->nr_pages
= (pgbase
+ count
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
;
197 state
->offset
= offset
;
198 state
->count
= count
;
/*
 * Release an I/O state through objio_free_io_state().
 * A NULL @state is logged and otherwise ignored.
 *
 * NOTE(review): the linkage line is not part of the visible text; the
 * function is kept file-local here - confirm no header declares it.
 */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);

	if (likely(state))
		objio_free_io_state(state);
}
215 * I/O done common code
218 objlayout_iodone(struct objlayout_io_state
*state
)
220 dprintk("%s: state %p status\n", __func__
, state
);
222 if (likely(state
->status
>= 0)) {
223 objlayout_free_io_state(state
);
225 struct objlayout
*objlay
= OBJLAYOUT(state
->lseg
->pls_layout
);
227 spin_lock(&objlay
->lock
);
228 objlay
->delta_space_valid
= OBJ_DSU_INVALID
;
229 list_add(&objlay
->err_list
, &state
->err_list
);
230 spin_unlock(&objlay
->lock
);
235 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
237 * The @index component IO failed (error returned from target). Register
238 * the error for later reporting at layout-return.
241 objlayout_io_set_result(struct objlayout_io_state
*state
, unsigned index
,
242 struct pnfs_osd_objid
*pooid
, int osd_error
,
243 u64 offset
, u64 length
, bool is_write
)
245 struct pnfs_osd_ioerr
*ioerr
= &state
->ioerrs
[index
];
247 BUG_ON(index
>= state
->num_comps
);
249 ioerr
->oer_component
= *pooid
;
250 ioerr
->oer_comp_offset
= offset
;
251 ioerr
->oer_comp_length
= length
;
252 ioerr
->oer_iswrite
= is_write
;
253 ioerr
->oer_errno
= osd_error
;
255 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
256 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
257 __func__
, index
, ioerr
->oer_errno
,
259 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
260 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
261 ioerr
->oer_component
.oid_partition_id
,
262 ioerr
->oer_component
.oid_object_id
,
263 ioerr
->oer_comp_offset
,
264 ioerr
->oer_comp_length
);
266 /* User need not call if no error is reported */
267 ioerr
->oer_errno
= 0;
271 /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
272 * This is because the osd completion is called with ints-off from
275 static void _rpc_read_complete(struct work_struct
*work
)
277 struct rpc_task
*task
;
278 struct nfs_read_data
*rdata
;
280 dprintk("%s enter\n", __func__
);
281 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
282 rdata
= container_of(task
, struct nfs_read_data
, task
);
284 pnfs_ld_read_done(rdata
);
288 objlayout_read_done(struct objlayout_io_state
*state
, ssize_t status
, bool sync
)
290 int eof
= state
->eof
;
291 struct nfs_read_data
*rdata
;
293 state
->status
= status
;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__
, status
, eof
);
295 rdata
= state
->rpcdata
;
296 rdata
->task
.tk_status
= status
;
298 rdata
->res
.count
= status
;
299 rdata
->res
.eof
= eof
;
301 objlayout_iodone(state
);
302 /* must not use state after this point */
305 pnfs_ld_read_done(rdata
);
307 INIT_WORK(&rdata
->task
.u
.tk_work
, _rpc_read_complete
);
308 schedule_work(&rdata
->task
.u
.tk_work
);
/* Perform sync or async reads. */
/*
 * objlayout_read_pagelist - start the read described by @rdata.
 *
 * Takes the file offset and byte count from rdata->args, compares the
 * request with the inode size, allocates an I/O state and passes it to
 * objio_read_pagelist().  The resulting status is stored in
 * rdata->pnfs_error and PNFS_ATTEMPTED is returned.
 *
 * NOTE(review): the return type, the braces, the declarations of eof and
 * status, part of the objlayout_alloc_io_state() argument list and the
 * error paths are not present in this text; the notes below describe only
 * the statements that are visible.
 */
316 objlayout_read_pagelist(struct nfs_read_data
*rdata
)
318 loff_t offset
= rdata
->args
.offset
;
319 size_t count
= rdata
->args
.count
;
320 struct objlayout_io_state
*state
;
/* trace the request */
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__
, rdata
->inode
, offset
, (int)count
);
/* eof holds the current size of the inode */
327 eof
= i_size_read(rdata
->inode
);
/* the request reaches past the inode size */
328 if (unlikely(offset
+ count
> eof
)) {
/* NOTE(review): the condition guarding this zero-byte result is not visible here */
331 rdata
->res
.count
= 0;
/* shorten the request so that it ends at the inode size */
335 count
= eof
- offset
;
/* NOTE(review): the remaining arguments of this call are not visible here */
338 state
= objlayout_alloc_io_state(NFS_I(rdata
->inode
)->layout
,
339 rdata
->args
.pages
, rdata
->args
.pgbase
,
/* allocation failed; NOTE(review): the handling is not visible here */
343 if (unlikely(!state
)) {
/* record whether the range kept in state reaches the inode size */
348 state
->eof
= state
->offset
+ state
->count
>= eof
;
350 status
= objio_read_pagelist(state
);
352 dprintk("%s: Return status %Zd\n", __func__
, status
);
/* the outcome is reported through rdata, not through the return value */
353 rdata
->pnfs_error
= status
;
354 return PNFS_ATTEMPTED
;
357 /* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
358 * This is because the osd completion is called with ints-off from
361 static void _rpc_write_complete(struct work_struct
*work
)
363 struct rpc_task
*task
;
364 struct nfs_write_data
*wdata
;
366 dprintk("%s enter\n", __func__
);
367 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
368 wdata
= container_of(task
, struct nfs_write_data
, task
);
370 pnfs_ld_write_done(wdata
);
374 objlayout_write_done(struct objlayout_io_state
*state
, ssize_t status
,
377 struct nfs_write_data
*wdata
;
379 dprintk("%s: Begin\n", __func__
);
380 wdata
= state
->rpcdata
;
381 state
->status
= status
;
382 wdata
->task
.tk_status
= status
;
384 wdata
->res
.count
= status
;
385 wdata
->verf
.committed
= state
->committed
;
386 dprintk("%s: Return status %d committed %d\n",
387 __func__
, wdata
->task
.tk_status
,
388 wdata
->verf
.committed
);
390 dprintk("%s: Return status %d\n",
391 __func__
, wdata
->task
.tk_status
);
392 objlayout_iodone(state
);
393 /* must not use state after this point */
396 pnfs_ld_write_done(wdata
);
398 INIT_WORK(&wdata
->task
.u
.tk_work
, _rpc_write_complete
);
399 schedule_work(&wdata
->task
.u
.tk_work
);
/* Perform sync or async writes. */
/*
 * objlayout_write_pagelist - start the write described by @wdata.
 *
 * Allocates an I/O state for the request, records whether FLUSH_SYNC is
 * set in "how" and passes the state to objio_write_pagelist() together
 * with the FLUSH_STABLE bit of "how".  The resulting status is stored in
 * wdata->pnfs_error and PNFS_ATTEMPTED is returned.
 *
 * NOTE(review): the return type, the braces, the rest of the parameter
 * list (the body uses "how"), the declaration of status, the arguments of
 * the objlayout_alloc_io_state() call and the allocation-failure path are
 * not present in this text.
 */
407 objlayout_write_pagelist(struct nfs_write_data
*wdata
,
410 struct objlayout_io_state
*state
;
/* trace the request */
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__
, wdata
->inode
, wdata
->args
.offset
, wdata
->args
.count
);
/* NOTE(review): the remaining arguments of this call are not visible here */
416 state
= objlayout_alloc_io_state(NFS_I(wdata
->inode
)->layout
,
/* allocation failed; NOTE(review): the handling is not visible here */
423 if (unlikely(!state
)) {
/* non-zero when the caller asked for FLUSH_SYNC */
428 state
->sync
= how
& FLUSH_SYNC
;
430 status
= objio_write_pagelist(state
, how
& FLUSH_STABLE
);
432 dprintk("%s: Return status %Zd\n", __func__
, status
);
/* the outcome is reported through wdata, not through the return value */
433 wdata
->pnfs_error
= status
;
434 return PNFS_ATTEMPTED
;
438 objlayout_encode_layoutcommit(struct pnfs_layout_hdr
*pnfslay
,
439 struct xdr_stream
*xdr
,
440 const struct nfs4_layoutcommit_args
*args
)
442 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
443 struct pnfs_osd_layoutupdate lou
;
446 dprintk("%s: Begin\n", __func__
);
448 spin_lock(&objlay
->lock
);
449 lou
.dsu_valid
= (objlay
->delta_space_valid
== OBJ_DSU_VALID
);
450 lou
.dsu_delta
= objlay
->delta_space_used
;
451 objlay
->delta_space_used
= 0;
452 objlay
->delta_space_valid
= OBJ_DSU_INIT
;
453 lou
.olu_ioerr_flag
= !list_empty(&objlay
->err_list
);
454 spin_unlock(&objlay
->lock
);
456 start
= xdr_reserve_space(xdr
, 4);
458 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr
, &lou
));
460 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
462 dprintk("%s: Return delta_space_used %lld err %d\n", __func__
,
463 lou
.dsu_delta
, lou
.olu_ioerr_flag
);
/*
 * err_prio - map a PNFS_OSD_ERR_* code to an OSD_ERR_PRI_* value.
 *
 * merge_ioerr() compares the values returned here for two errors and
 * keeps the errno whose value is larger.
 *
 * NOTE(review): the return type, the opening of the switch statement and
 * the lines between the NO_SPACE and EIO cases are not present in this
 * text, so the handling of other codes cannot be described from here.
 */
467 err_prio(u32 oer_errno
)
473 case PNFS_OSD_ERR_RESOURCE
:
474 return OSD_ERR_PRI_RESOURCE
;
475 case PNFS_OSD_ERR_BAD_CRED
:
476 return OSD_ERR_PRI_BAD_CRED
;
477 case PNFS_OSD_ERR_NO_ACCESS
:
478 return OSD_ERR_PRI_NO_ACCESS
;
479 case PNFS_OSD_ERR_UNREACHABLE
:
480 return OSD_ERR_PRI_UNREACHABLE
;
481 case PNFS_OSD_ERR_NOT_FOUND
:
482 return OSD_ERR_PRI_NOT_FOUND
;
483 case PNFS_OSD_ERR_NO_SPACE
:
484 return OSD_ERR_PRI_NO_SPACE
;
/* NOTE(review): lines are missing here; other labels may share the EIO result */
488 case PNFS_OSD_ERR_EIO
:
489 return OSD_ERR_PRI_EIO
;
494 merge_ioerr(struct pnfs_osd_ioerr
*dest_err
,
495 const struct pnfs_osd_ioerr
*src_err
)
497 u64 dest_end
, src_end
;
499 if (!dest_err
->oer_errno
) {
500 *dest_err
= *src_err
;
501 /* accumulated device must be blank */
502 memset(&dest_err
->oer_component
.oid_device_id
, 0,
503 sizeof(dest_err
->oer_component
.oid_device_id
));
508 if (dest_err
->oer_component
.oid_partition_id
!=
509 src_err
->oer_component
.oid_partition_id
)
510 dest_err
->oer_component
.oid_partition_id
= 0;
512 if (dest_err
->oer_component
.oid_object_id
!=
513 src_err
->oer_component
.oid_object_id
)
514 dest_err
->oer_component
.oid_object_id
= 0;
516 if (dest_err
->oer_comp_offset
> src_err
->oer_comp_offset
)
517 dest_err
->oer_comp_offset
= src_err
->oer_comp_offset
;
519 dest_end
= end_offset(dest_err
->oer_comp_offset
,
520 dest_err
->oer_comp_length
);
521 src_end
= end_offset(src_err
->oer_comp_offset
,
522 src_err
->oer_comp_length
);
523 if (dest_end
< src_end
)
526 dest_err
->oer_comp_length
= dest_end
- dest_err
->oer_comp_offset
;
528 if ((src_err
->oer_iswrite
== dest_err
->oer_iswrite
) &&
529 (err_prio(src_err
->oer_errno
) > err_prio(dest_err
->oer_errno
))) {
530 dest_err
->oer_errno
= src_err
->oer_errno
;
531 } else if (src_err
->oer_iswrite
) {
532 dest_err
->oer_iswrite
= true;
533 dest_err
->oer_errno
= src_err
->oer_errno
;
538 encode_accumulated_error(struct objlayout
*objlay
, __be32
*p
)
540 struct objlayout_io_state
*state
, *tmp
;
541 struct pnfs_osd_ioerr accumulated_err
= {.oer_errno
= 0};
543 list_for_each_entry_safe(state
, tmp
, &objlay
->err_list
, err_list
) {
546 for (i
= 0; i
< state
->num_comps
; i
++) {
547 struct pnfs_osd_ioerr
*ioerr
= &state
->ioerrs
[i
];
549 if (!ioerr
->oer_errno
)
552 printk(KERN_ERR
"%s: err[%d]: errno=%d is_write=%d "
553 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
554 "offset=0x%llx length=0x%llx\n",
555 __func__
, i
, ioerr
->oer_errno
,
557 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
558 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
559 ioerr
->oer_component
.oid_partition_id
,
560 ioerr
->oer_component
.oid_object_id
,
561 ioerr
->oer_comp_offset
,
562 ioerr
->oer_comp_length
);
564 merge_ioerr(&accumulated_err
, ioerr
);
566 list_del(&state
->err_list
);
567 objlayout_free_io_state(state
);
570 pnfs_osd_xdr_encode_ioerr(p
, &accumulated_err
);
574 objlayout_encode_layoutreturn(struct pnfs_layout_hdr
*pnfslay
,
575 struct xdr_stream
*xdr
,
576 const struct nfs4_layoutreturn_args
*args
)
578 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
579 struct objlayout_io_state
*state
, *tmp
;
582 dprintk("%s: Begin\n", __func__
);
583 start
= xdr_reserve_space(xdr
, 4);
586 spin_lock(&objlay
->lock
);
588 list_for_each_entry_safe(state
, tmp
, &objlay
->err_list
, err_list
) {
589 __be32
*last_xdr
= NULL
, *p
;
593 for (i
= 0; i
< state
->num_comps
; i
++) {
594 struct pnfs_osd_ioerr
*ioerr
= &state
->ioerrs
[i
];
596 if (!ioerr
->oer_errno
)
599 dprintk("%s: err[%d]: errno=%d is_write=%d "
600 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
601 "offset=0x%llx length=0x%llx\n",
602 __func__
, i
, ioerr
->oer_errno
,
604 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
605 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
606 ioerr
->oer_component
.oid_partition_id
,
607 ioerr
->oer_component
.oid_object_id
,
608 ioerr
->oer_comp_offset
,
609 ioerr
->oer_comp_length
);
611 p
= pnfs_osd_xdr_ioerr_reserve_space(xdr
);
614 break; /* accumulated_error */
618 pnfs_osd_xdr_encode_ioerr(p
, &state
->ioerrs
[i
]);
621 /* TODO: use xdr_write_pages */
623 /* no space for even one error descriptor */
626 /* we've encountered a situation with lots and lots of
627 * errors and no space to encode them all. Use the last
628 * available slot to report the union of all the
631 encode_accumulated_error(objlay
, last_xdr
);
634 list_del(&state
->err_list
);
635 objlayout_free_io_state(state
);
638 spin_unlock(&objlay
->lock
);
640 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
641 dprintk("%s: Return\n", __func__
);
646 * Get Device Info API for io engines
648 struct objlayout_deviceinfo
{
650 struct pnfs_osd_deviceaddr da
; /* This must be last */
/* Initialize and call nfs_getdeviceinfo, then decode and return a
 * "struct pnfs_osd_deviceaddr *". Eventually objlayout_put_deviceinfo()
 * should be called to release it.
 */
/*
 * objlayout_get_deviceinfo - fetch and decode the address of one device.
 *
 * Allocates a page, fills in a struct pnfs_device request for @d_id with
 * layout type LAYOUT_OSD2_OBJECTS, issues nfs4_proc_getdeviceinfo() for the
 * server of pnfslay->plh_inode, decodes the reply into a newly allocated
 * struct objlayout_deviceinfo and stores &odi->da in *@deviceaddr.
 *
 * NOTE(review): the end of the parameter list (the body uses gfp_flags),
 * the declarations of p and err, every failure check, the remaining pd
 * fields, the assignment that records the page in odi, and the return
 * statements are not present in this text.
 */
657 int objlayout_get_deviceinfo(struct pnfs_layout_hdr
*pnfslay
,
658 struct nfs4_deviceid
*d_id
, struct pnfs_osd_deviceaddr
**deviceaddr
,
661 struct objlayout_deviceinfo
*odi
;
662 struct pnfs_device pd
;
663 struct super_block
*sb
;
664 struct page
*page
, **pages
;
/* page whose address is decoded after the call below */
668 page
= alloc_page(gfp_flags
);
/* describe the request: which device, which layout type, buffer length */
675 memcpy(&pd
.dev_id
, d_id
, sizeof(*d_id
));
676 pd
.layout_type
= LAYOUT_OSD2_OBJECTS
;
679 pd
.pglen
= PAGE_SIZE
;
/* NOTE(review): sb is assigned but not used by any visible line */
682 sb
= pnfslay
->plh_inode
->i_sb
;
683 err
= nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay
->plh_inode
), &pd
);
684 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__
, err
);
/* decode the page contents into a new container */
688 p
= page_address(page
);
689 odi
= kzalloc(sizeof(*odi
), gfp_flags
);
694 pnfs_osd_xdr_decode_deviceaddr(&odi
->da
, p
);
/* hand out the embedded address; objlayout_put_deviceinfo() maps it back */
696 *deviceaddr
= &odi
->da
;
704 void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr
*deviceaddr
)
706 struct objlayout_deviceinfo
*odi
= container_of(deviceaddr
,
707 struct objlayout_deviceinfo
,
710 __free_page(odi
->page
);