2 * pNFS Objects layout implementation over open-osd initiator library
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <linux/module.h>
41 #include <scsi/osd_ore.h>
43 #include "objlayout.h"
45 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
/*
 * Per-device entry of this layout driver. It embeds a
 * struct nfs4_deviceid_node (id_node) so that container_of() can map a
 * deviceid node back to its entry.
 * NOTE(review): other code in this file reads ->od and ->od.od on this
 * type, but that member's declaration is not visible in this listing
 * (the leading numbers jump from 48 to 53) - confirm against the full
 * source.
 */
47 struct objio_dev_ent
{
48 struct nfs4_deviceid_node id_node
;
/*
 * objio_free_deviceid_node - release the OSD device behind a deviceid node.
 * @d: deviceid node that is the id_node member of a struct objio_dev_ent.
 *
 * Maps @d back to its objio_dev_ent with container_of(), logs the device
 * pointer de->od.od and passes it to osduld_put_device(). This function is
 * installed as .free_deviceid_node in objlayout_type.
 * NOTE(review): the return type line and the lines after the
 * osduld_put_device() call are not visible in this listing; whether the
 * entry itself is freed here cannot be told from what is shown - confirm.
 */
53 objio_free_deviceid_node(struct nfs4_deviceid_node
*d
)
55 struct objio_dev_ent
*de
= container_of(d
, struct objio_dev_ent
, id_node
);
57 dprintk("%s: free od=%p\n", __func__
, de
->od
.od
);
58 osduld_put_device(de
->od
.od
);
/*
 * _dev_list_find - look up the objio_dev_ent for a device id.
 * @nfss: NFS server; its pnfs_curr_ld and nfs_client are passed to the
 *        lookup.
 * @d_id: device id to search for.
 *
 * Calls nfs4_find_get_deviceid() and converts the returned node to its
 * enclosing objio_dev_ent with container_of().
 * NOTE(review): the handling of a failed lookup and the return statement
 * are on lines that are not visible in this listing - confirm against the
 * full source.
 */
62 static struct objio_dev_ent
*_dev_list_find(const struct nfs_server
*nfss
,
63 const struct nfs4_deviceid
*d_id
)
65 struct nfs4_deviceid_node
*d
;
66 struct objio_dev_ent
*de
;
68 d
= nfs4_find_get_deviceid(nfss
->pnfs_curr_ld
, nfss
->nfs_client
, d_id
);
72 de
= container_of(d
, struct objio_dev_ent
, id_node
);
/*
 * _dev_list_add - create an entry for @od and insert it as a deviceid node.
 * @nfss: NFS server (its use is on lines not visible in this listing).
 * @d_id: device id for the new entry (its use is likewise not visible).
 * @od:   OSD device the new entry is created for.
 *
 * Visible steps: a zeroed objio_dev_ent is allocated with kzalloc(), its
 * id_node is initialised with nfs4_init_deviceid_node() and then inserted
 * with nfs4_insert_deviceid_node(). The node returned by the insert is
 * mapped back to an entry (n). On the "Race with other" path the entry
 * built here is released through objio_free_deviceid_node().
 * NOTE(review): gfp_flags is used below but its declaration, the
 * conditions guarding the -ENOMEM and race paths, the remaining arguments
 * of nfs4_init_deviceid_node() and the return statements are not visible
 * in this listing - confirm against the full source.
 */
76 static struct objio_dev_ent
*
77 _dev_list_add(const struct nfs_server
*nfss
,
78 const struct nfs4_deviceid
*d_id
, struct osd_dev
*od
,
81 struct nfs4_deviceid_node
*d
;
82 struct objio_dev_ent
*de
= kzalloc(sizeof(*de
), gfp_flags
);
83 struct objio_dev_ent
*n
;
86 dprintk("%s: -ENOMEM od=%p\n", __func__
, od
);
90 dprintk("%s: Adding od=%p\n", __func__
, od
);
91 nfs4_init_deviceid_node(&de
->id_node
,
/* The insert returns a node; n is the entry that node belongs to. */
97 d
= nfs4_insert_deviceid_node(&de
->id_node
);
98 n
= container_of(d
, struct objio_dev_ent
, id_node
);
100 dprintk("%s: Race with other n->od=%p\n", __func__
, n
->od
.od
);
101 objio_free_deviceid_node(&de
->id_node
);
/*
 * Driver-private layout segment. It wraps the generic
 * struct pnfs_layout_segment (lseg) together with a struct ore_layout
 * (layout) and a struct ore_components (oc). OBJIO_LSEG() recovers this
 * structure from a pointer to the embedded lseg.
 * NOTE(review): some original lines of this definition are not visible in
 * this listing (the leading numbers skip 110 and 113-114).
 */
108 struct objio_segment
{
109 struct pnfs_layout_segment lseg
;
111 struct ore_layout layout
;
112 struct ore_components oc
;
/*
 * OBJIO_LSEG - map a generic layout segment to its enclosing objio_segment.
 * @lseg: pointer to the lseg member embedded in a struct objio_segment.
 *
 * Returns the containing struct objio_segment, computed with
 * container_of().
 */
115 static inline struct objio_segment
*
116 OBJIO_LSEG(struct pnfs_layout_segment
*lseg
)
118 return container_of(lseg
, struct objio_segment
, lseg
);
/*
 * NOTE(review): the opening line of this structure is not visible in this
 * listing. Elsewhere in the file container_of(oir, struct objio_state, oir)
 * and objios->ios are used, so these are presumably members of
 * struct objio_state - confirm against the full source. A sync member is
 * also accessed elsewhere but is not visible here.
 */
123 struct objlayout_io_res oir
;
126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
127 struct ore_io_state
*ios
;
/*
 * objio_devices_lookup - resolve device @d_id into slot @c of the segment.
 * @pnfslay:   layout header; its plh_inode selects the NFS server.
 * @objio_seg: segment whose oc.ods[@c] is filled in.
 * @c:         index into objio_seg->oc.ods[].
 * @d_id:      device id to resolve.
 *
 * Visible sequence: _dev_list_find() is tried and the found entry's od is
 * stored in oc.ods[@c]. Further down the device address is fetched with
 * objlayout_get_deviceinfo(), a struct osd_dev_info is filled from its
 * system id and OSD name, osduld_info_lookup() is called with it, the
 * result is recorded with _dev_list_add() and oc.ods[@c] is set from the
 * new entry. The device address is handed back with
 * objlayout_put_deviceinfo().
 * NOTE(review): gfp_flags, err and od are used without visible
 * declarations, and the branches/returns that separate the steps above are
 * on lines missing from this listing - confirm the control flow against
 * the full source.
 */
130 /* Send and wait for a get_device_info of devices in the layout,
131 then look them up with the osd_initiator library */
132 static int objio_devices_lookup(struct pnfs_layout_hdr
*pnfslay
,
133 struct objio_segment
*objio_seg
, unsigned c
, struct nfs4_deviceid
*d_id
,
136 struct pnfs_osd_deviceaddr
*deviceaddr
;
137 struct objio_dev_ent
*ode
;
139 struct osd_dev_info odi
;
142 ode
= _dev_list_find(NFS_SERVER(pnfslay
->plh_inode
), d_id
);
144 objio_seg
->oc
.ods
[c
] = &ode
->od
; /* must use container_of */
148 err
= objlayout_get_deviceinfo(pnfslay
, d_id
, &deviceaddr
, gfp_flags
);
150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
151 __func__
, _DEVID_LO(d_id
), _DEVID_HI(d_id
), err
);
/*
 * A system id longer than the odi.systemid buffer is only logged in the
 * lines visible here; a non-empty one that fits is copied with memcpy().
 */
155 odi
.systemid_len
= deviceaddr
->oda_systemid
.len
;
156 if (odi
.systemid_len
> sizeof(odi
.systemid
)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__
, sizeof(odi
.systemid
));
161 } else if (odi
.systemid_len
)
162 memcpy(odi
.systemid
, deviceaddr
->oda_systemid
.data
,
/* osdname is not copied: odi.osdname points into deviceaddr's data. */
164 odi
.osdname_len
= deviceaddr
->oda_osdname
.len
;
165 odi
.osdname
= (u8
*)deviceaddr
->oda_osdname
.data
;
167 if (!odi
.osdname_len
&& !odi
.systemid_len
) {
168 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
174 od
= osduld_info_lookup(&odi
);
175 if (unlikely(IS_ERR(od
))) {
177 dprintk("%s: osduld_info_lookup => %d\n", __func__
, err
);
181 ode
= _dev_list_add(NFS_SERVER(pnfslay
->plh_inode
), d_id
, od
,
183 objio_seg
->oc
.ods
[c
] = &ode
->od
; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id
), _DEVID_HI(d_id
));
187 objlayout_put_deviceinfo(deviceaddr
);
/*
 * copy_single_comp - fill ORE component @c from a decoded object credential.
 * @oc:       component table; oc->comps[@c] is written.
 * @c:        index of the component to fill.
 * @src_comp: decoded object credential to copy from.
 *
 * Copies the partition id and object id, then copies sizeof(ocomp->cred)
 * bytes of the capability. WARN_ON() fires when a capability key is
 * present (oc_cap_key.cred_len > 0) or when the source capability length
 * exceeds the destination buffer.
 * NOTE(review): the memcpy() length is the destination size, not
 * oc_cap.cred_len - confirm the source buffer is always at least that big.
 */
191 static void copy_single_comp(struct ore_components
*oc
, unsigned c
,
192 struct pnfs_osd_object_cred
*src_comp
)
194 struct ore_comp
*ocomp
= &oc
->comps
[c
];
196 WARN_ON(src_comp
->oc_cap_key
.cred_len
> 0); /* libosd is NO_SEC only */
197 WARN_ON(src_comp
->oc_cap
.cred_len
> sizeof(ocomp
->cred
));
199 ocomp
->obj
.partition
= src_comp
->oc_object_id
.oid_partition_id
;
200 ocomp
->obj
.id
= src_comp
->oc_object_id
.oid_object_id
;
202 memcpy(ocomp
->cred
, src_comp
->oc_cap
.cred
, sizeof(ocomp
->cred
));
/*
 * __alloc_objio_seg - allocate an objio_segment together with its device
 * and component arrays in a single kzalloc().
 * @numdevs:   number of ore_dev pointers and ore_comp entries to reserve.
 * @gfp_flags: allocation flags passed to kzalloc().
 * @pseg:      receives a pointer to the new segment.
 *
 * A function-local structure holds the segment followed by ods[@numdevs]
 * and comps[@numdevs]; after allocation oc.numdevs is set, oc.single_comp
 * is set to EC_MULTPLE_COMPS, and oc.comps / oc.ods are pointed at those
 * arrays. A failed allocation is logged with dprintk().
 * NOTE(review): the declaration of aolseg, the end of the local structure
 * and the return statements are not visible in this listing - confirm.
 */
205 int __alloc_objio_seg(unsigned numdevs
, gfp_t gfp_flags
,
206 struct objio_segment
**pseg
)
208 struct __alloc_objio_segment
{
209 struct objio_segment olseg
;
210 struct ore_dev
*ods
[numdevs
];
211 struct ore_comp comps
[numdevs
];
214 aolseg
= kzalloc(sizeof(*aolseg
), gfp_flags
);
215 if (unlikely(!aolseg
)) {
216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__
,
217 numdevs
, sizeof(*aolseg
));
221 aolseg
->olseg
.oc
.numdevs
= numdevs
;
222 aolseg
->olseg
.oc
.single_comp
= EC_MULTPLE_COMPS
;
223 aolseg
->olseg
.oc
.comps
= aolseg
->comps
;
224 aolseg
->olseg
.oc
.ods
= aolseg
->ods
;
226 *pseg
= &aolseg
->olseg
;
/*
 * objio_alloc_lseg - build an objio_segment from an XDR-encoded layout.
 * @outp:    receives a pointer to the new segment's embedded lseg.
 * @pnfslay: layout header, forwarded to objio_devices_lookup().
 * @range:   layout range (its use is not visible in this listing).
 * @xdr:     stream the layout map and components are decoded from.
 *
 * Visible steps: the layout map is decoded with
 * pnfs_osd_xdr_decode_layout_map(), a segment sized for
 * layout.olo_num_comps is obtained from __alloc_objio_seg(), the striping
 * parameters are copied into objio_seg->layout (mirrors_p1 is
 * odm_mirror_cnt + 1), ore_verify_layout() is called, oc.first_dev is set
 * from olo_comps_index, and each decoded component is copied with
 * copy_single_comp() and resolved with objio_devices_lookup().
 * NOTE(review): gfp_flags, err and cur_comp are used without visible
 * declarations, and the error checks, the loop's increment and the return
 * statements are on lines missing from this listing - confirm.
 */
230 int objio_alloc_lseg(struct pnfs_layout_segment
**outp
,
231 struct pnfs_layout_hdr
*pnfslay
,
232 struct pnfs_layout_range
*range
,
233 struct xdr_stream
*xdr
,
236 struct objio_segment
*objio_seg
;
237 struct pnfs_osd_xdr_decode_layout_iter iter
;
238 struct pnfs_osd_layout layout
;
239 struct pnfs_osd_object_cred src_comp
;
243 err
= pnfs_osd_xdr_decode_layout_map(&layout
, &iter
, xdr
);
247 err
= __alloc_objio_seg(layout
.olo_num_comps
, gfp_flags
, &objio_seg
);
251 objio_seg
->layout
.stripe_unit
= layout
.olo_map
.odm_stripe_unit
;
252 objio_seg
->layout
.group_width
= layout
.olo_map
.odm_group_width
;
253 objio_seg
->layout
.group_depth
= layout
.olo_map
.odm_group_depth
;
254 objio_seg
->layout
.mirrors_p1
= layout
.olo_map
.odm_mirror_cnt
+ 1;
255 objio_seg
->layout
.raid_algorithm
= layout
.olo_map
.odm_raid_algorithm
;
257 err
= ore_verify_layout(layout
.olo_map
.odm_num_comps
,
262 objio_seg
->oc
.first_dev
= layout
.olo_comps_index
;
264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp
, &iter
, xdr
, &err
)) {
265 copy_single_comp(&objio_seg
->oc
, cur_comp
, &src_comp
);
266 err
= objio_devices_lookup(pnfslay
, objio_seg
, cur_comp
,
267 &src_comp
.oc_object_id
.oid_device_id
,
273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
277 *outp
= &objio_seg
->lseg
;
282 dprintk("%s: Error: return %d\n", __func__
, err
);
/*
 * objio_free_lseg - drop the device references held by a layout segment.
 * @lseg: generic segment embedded in a struct objio_segment.
 *
 * Walks oc.ods[0 .. oc.numdevs - 1]; each ore_dev pointer is mapped back
 * to its objio_dev_ent with container_of() and its id_node is released
 * with nfs4_put_deviceid_node().
 * NOTE(review): the declaration of i, any handling of empty ods[] slots
 * and the release of the segment memory are not visible in this listing -
 * confirm against the full source.
 */
287 void objio_free_lseg(struct pnfs_layout_segment
*lseg
)
290 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
292 for (i
= 0; i
< objio_seg
->oc
.numdevs
; i
++) {
293 struct ore_dev
*od
= objio_seg
->oc
.ods
[i
];
294 struct objio_dev_ent
*ode
;
298 ode
= container_of(od
, typeof(*ode
), od
);
299 nfs4_put_deviceid_node(&ode
->id_node
);
/*
 * objio_alloc_io_state - allocate and prepare the per-I/O state.
 * @pnfs_layout_type: layout header, passed to objlayout_init_ioerrs().
 * @is_reading:       forwarded to ore_get_rw_state().
 * @lseg:             layout segment the I/O runs against.
 * @pages:            page array (its use is not visible in this listing).
 * @pgbase:           stored in ios->pgbase.
 * @offset:           forwarded to ore_get_rw_state().
 * @count:            forwarded to ore_get_rw_state().
 * @rpcdata:          opaque pointer passed to objlayout_init_ioerrs().
 * @gfp_flags:        allocation flags passed to kzalloc().
 * @outp:             receives a pointer to the new struct objio_state.
 *
 * A function-local structure holds the objio_state followed by one
 * pnfs_osd_ioerr per device (oc.numdevs). After kzalloc() the error array
 * is registered with objlayout_init_ioerrs(), an ore_io_state is obtained
 * from ore_get_rw_state(), and sync is initialised to 0.
 * NOTE(review): the return type, the declarations of aos and ret, the
 * allocation-failure and ore_get_rw_state() failure handling, and the
 * return statements are not visible in this listing - confirm.
 */
305 objio_alloc_io_state(struct pnfs_layout_hdr
*pnfs_layout_type
, bool is_reading
,
306 struct pnfs_layout_segment
*lseg
, struct page
**pages
, unsigned pgbase
,
307 loff_t offset
, size_t count
, void *rpcdata
, gfp_t gfp_flags
,
308 struct objio_state
**outp
)
310 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
311 struct ore_io_state
*ios
;
313 struct __alloc_objio_state
{
314 struct objio_state objios
;
315 struct pnfs_osd_ioerr ioerrs
[objio_seg
->oc
.numdevs
];
318 aos
= kzalloc(sizeof(*aos
), gfp_flags
);
322 objlayout_init_ioerrs(&aos
->objios
.oir
, objio_seg
->oc
.numdevs
,
323 aos
->ioerrs
, rpcdata
, pnfs_layout_type
);
325 ret
= ore_get_rw_state(&objio_seg
->layout
, &objio_seg
->oc
, is_reading
,
326 offset
, count
, &ios
);
333 ios
->pgbase
= pgbase
;
/*
 * Sanity check: ios->nr_pages must not exceed the number of pages spanned
 * by pgbase + count, rounded up to whole pages.
 */
335 BUG_ON(ios
->nr_pages
> (pgbase
+ count
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
);
337 aos
->objios
.sync
= 0;
338 aos
->objios
.ios
= ios
;
339 *outp
= &aos
->objios
;
/*
 * objio_free_result - release the ORE I/O state behind an I/O result.
 * @oir: result structure that is the oir member of a struct objio_state.
 *
 * Maps @oir back to its objio_state with container_of() and passes
 * objios->ios to ore_put_io_state().
 * NOTE(review): lines after the ore_put_io_state() call are not visible in
 * this listing; whether the objio_state itself is freed here cannot be
 * told from what is shown - confirm.
 */
343 void objio_free_result(struct objlayout_io_res
*oir
)
345 struct objio_state
*objios
= container_of(oir
, struct objio_state
, oir
);
347 ore_put_io_state(objios
->ios
);
/*
 * osd_pri_2_pnfs_err - translate an OSD error priority to a pNFS-OSD errno.
 * @oep: OSD error priority to translate.
 *
 * Mappings visible in this listing:
 *   OSD_ERR_PRI_NO_ERROR    -> 0
 *   OSD_ERR_PRI_RESOURCE    -> PNFS_OSD_ERR_RESOURCE
 *   OSD_ERR_PRI_BAD_CRED    -> PNFS_OSD_ERR_BAD_CRED
 *   OSD_ERR_PRI_NO_ACCESS   -> PNFS_OSD_ERR_NO_ACCESS
 *   OSD_ERR_PRI_UNREACHABLE -> PNFS_OSD_ERR_UNREACHABLE
 *   OSD_ERR_PRI_NOT_FOUND   -> PNFS_OSD_ERR_NOT_FOUND
 *   OSD_ERR_PRI_NO_SPACE    -> PNFS_OSD_ERR_NO_SPACE
 *   OSD_ERR_PRI_EIO         -> PNFS_OSD_ERR_EIO
 * NOTE(review): the switch statement itself, the result for
 * OSD_ERR_PRI_CLEAR_PAGES, and any further cases or default branch are on
 * lines missing from this listing - confirm against the full source.
 */
351 enum pnfs_osd_errno
osd_pri_2_pnfs_err(enum osd_err_priority oep
)
354 case OSD_ERR_PRI_NO_ERROR
:
355 return (enum pnfs_osd_errno
)0;
357 case OSD_ERR_PRI_CLEAR_PAGES
:
361 case OSD_ERR_PRI_RESOURCE
:
362 return PNFS_OSD_ERR_RESOURCE
;
363 case OSD_ERR_PRI_BAD_CRED
:
364 return PNFS_OSD_ERR_BAD_CRED
;
365 case OSD_ERR_PRI_NO_ACCESS
:
366 return PNFS_OSD_ERR_NO_ACCESS
;
367 case OSD_ERR_PRI_UNREACHABLE
:
368 return PNFS_OSD_ERR_UNREACHABLE
;
369 case OSD_ERR_PRI_NOT_FOUND
:
370 return PNFS_OSD_ERR_NOT_FOUND
;
371 case OSD_ERR_PRI_NO_SPACE
:
372 return PNFS_OSD_ERR_NO_SPACE
;
376 case OSD_ERR_PRI_EIO
:
377 return PNFS_OSD_ERR_EIO
;
/*
 * __on_dev_error - record a per-device I/O error in the objlayout result.
 * @ios:        ORE I/O state; ios->private holds the struct objio_state.
 * @od:         failing device, the od member of a struct objio_dev_ent.
 * @dev_index:  device index, used directly as the component index.
 * @oep:        OSD error priority, translated by osd_pri_2_pnfs_err().
 * @dev_offset: forwarded to objlayout_io_set_result().
 * @dev_len:    forwarded to objlayout_io_set_result().
 *
 * Builds a pnfs_osd_objid from the entry's device id and the component's
 * partition/object id, then calls objlayout_io_set_result(); the last
 * argument is !ios->reading. This function is passed to ore_check_io() by
 * the read and write completion handlers.
 * NOTE(review): the FIXME comment inside the body has lost its closing
 * line in this listing - confirm against the full source.
 */
381 static void __on_dev_error(struct ore_io_state
*ios
,
382 struct ore_dev
*od
, unsigned dev_index
, enum osd_err_priority oep
,
383 u64 dev_offset
, u64 dev_len
)
385 struct objio_state
*objios
= ios
->private;
386 struct pnfs_osd_objid pooid
;
387 struct objio_dev_ent
*ode
= container_of(od
, typeof(*ode
), od
);
388 /* FIXME: what to do with more-then-one-group layouts. We need to
389 * translate from ore_io_state index to oc->comps index
391 unsigned comp
= dev_index
;
393 pooid
.oid_device_id
= ode
->id_node
.deviceid
;
394 pooid
.oid_partition_id
= ios
->oc
->comps
[comp
].obj
.partition
;
395 pooid
.oid_object_id
= ios
->oc
->comps
[comp
].obj
.id
;
397 objlayout_io_set_result(&objios
->oir
, comp
,
398 &pooid
, osd_pri_2_pnfs_err(oep
),
399 dev_offset
, dev_len
, !ios
->reading
);
/*
 * _read_done - completion handler installed as ios->done for reads.
 * @ios:     completed ORE I/O state.
 * @private: the struct objio_state of this I/O.
 *
 * Runs ore_check_io() with __on_dev_error as the per-device error
 * callback, then reports the outcome through objlayout_read_done()
 * together with objios->sync. In the visible lines status is taken from
 * ios->length.
 * NOTE(review): the declaration of status and the lines choosing between
 * ios->length and an error value are not visible in this listing.
 */
405 static void _read_done(struct ore_io_state
*ios
, void *private)
407 struct objio_state
*objios
= private;
409 int ret
= ore_check_io(ios
, &__on_dev_error
);
411 /* FIXME: _io_free(ios) can we deallocate the libosd resources; */
414 status
= ios
->length
;
418 objlayout_read_done(&objios
->oir
, status
, objios
->sync
);
/*
 * objio_read_pagelist - start an ORE read for an NFS read request.
 * @rdata: NFS read descriptor; its inode's layout, lseg and args
 *         (pages, pgbase, offset, count) describe the I/O.
 *
 * Allocates the I/O state with objio_alloc_io_state() (is_reading = true,
 * GFP_KERNEL), installs _read_done as the completion handler and returns
 * the result of ore_read().
 * NOTE(review): the declaration of ret and the handling of an
 * objio_alloc_io_state() failure are not visible in this listing.
 */
421 int objio_read_pagelist(struct nfs_read_data
*rdata
)
423 struct objio_state
*objios
;
426 ret
= objio_alloc_io_state(NFS_I(rdata
->inode
)->layout
, true,
427 rdata
->lseg
, rdata
->args
.pages
, rdata
->args
.pgbase
,
428 rdata
->args
.offset
, rdata
->args
.count
, rdata
,
429 GFP_KERNEL
, &objios
);
433 objios
->ios
->done
= _read_done
;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
435 rdata
->args
.offset
, rdata
->args
.count
);
436 return ore_read(objios
->ios
);
/*
 * _write_done - completion handler installed as ios->done for writes.
 * @ios:     completed ORE I/O state.
 * @private: the struct objio_state of this I/O.
 *
 * Runs ore_check_io() with __on_dev_error as the per-device error
 * callback, then reports the outcome through objlayout_write_done()
 * together with objios->sync. In the visible lines oir.committed is set to
 * NFS_FILE_SYNC and status is taken from ios->length.
 * NOTE(review): the declaration of status and the lines choosing between
 * the success and error paths are not visible in this listing.
 */
442 static void _write_done(struct ore_io_state
*ios
, void *private)
444 struct objio_state
*objios
= private;
446 int ret
= ore_check_io(ios
, &__on_dev_error
);
448 /* FIXME: _io_free(ios) can we deallocate the libosd resources; */
451 /* FIXME: should be based on the OSD's persistence model
452 * See OSD2r05 Section 4.13 Data persistence model */
453 objios
->oir
.committed
= NFS_FILE_SYNC
;
454 status
= ios
->length
;
459 objlayout_write_done(&objios
->oir
, status
, objios
->sync
);
/*
 * __r4w_get_page - .get_page callback of _r4w_op.
 * @priv:     the struct objio_state of the write; its oir.rpcdata is read
 *            as a struct nfs_write_data.
 * @offset:   byte offset; the page index is offset / PAGE_SIZE.
 * @uptodate: set to PageUptodate(page) in the visible lines.
 *
 * Looks the page up in the inode's mapping with find_get_page(); the
 * listing also shows a find_or_create_page() call on the same mapping, a
 * logged failure when no page is obtained, and a PageDirty() /
 * PageWriteback() test.
 * NOTE(review): the conditions around find_or_create_page(), what the
 * dirty/writeback branch does, and the return statements are on lines
 * missing from this listing - confirm against the full source.
 */
462 static struct page
*__r4w_get_page(void *priv
, u64 offset
, bool *uptodate
)
464 struct objio_state
*objios
= priv
;
465 struct nfs_write_data
*wdata
= objios
->oir
.rpcdata
;
466 pgoff_t index
= offset
/ PAGE_SIZE
;
467 struct page
*page
= find_get_page(wdata
->inode
->i_mapping
, index
);
470 page
= find_or_create_page(wdata
->inode
->i_mapping
,
472 if (unlikely(!page
)) {
473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
479 if (PageDirty(page
) || PageWriteback(page
))
482 *uptodate
= PageUptodate(page
);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__
, index
, *uptodate
);
/*
 * __r4w_put_page - .put_page callback of _r4w_op.
 * @priv: callback context (not used in the visible lines).
 * @page: page to release.
 *
 * Logs the page index and drops the page with page_cache_release().
 */
487 static void __r4w_put_page(void *priv
, struct page
*page
)
489 dprintk("%s: index=0x%lx\n", __func__
, page
->index
);
490 page_cache_release(page
);
/*
 * Page get/put callbacks, installed on ios->r4w by objio_write_pagelist().
 */
494 static const struct _ore_r4w_op _r4w_op
= {
495 .get_page
= &__r4w_get_page
,
496 .put_page
= &__r4w_put_page
,
/*
 * objio_write_pagelist - start an ORE write for an NFS write request.
 * @wdata: NFS write descriptor; its inode's layout, lseg and args
 *         (pages, pgbase, offset, count) describe the I/O.
 * @how:   flush flags; sync is set when FLUSH_SYNC is present.
 *
 * Allocates the I/O state with objio_alloc_io_state() (is_reading = false,
 * GFP_NOFS), installs _r4w_op and _write_done on the ORE state and calls
 * ore_write(). The listing also shows a direct call to _write_done().
 * NOTE(review): the declaration of ret, the last argument of
 * objio_alloc_io_state(), the failure handling, the condition guarding the
 * direct _write_done() call, and the return statements are on lines
 * missing from this listing - confirm against the full source.
 */
499 int objio_write_pagelist(struct nfs_write_data
*wdata
, int how
)
501 struct objio_state
*objios
;
504 ret
= objio_alloc_io_state(NFS_I(wdata
->inode
)->layout
, false,
505 wdata
->lseg
, wdata
->args
.pages
, wdata
->args
.pgbase
,
506 wdata
->args
.offset
, wdata
->args
.count
, wdata
, GFP_NOFS
,
511 objios
->sync
= 0 != (how
& FLUSH_SYNC
);
512 objios
->ios
->r4w
= &_r4w_op
;
515 objios
->ios
->done
= _write_done
;
517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
518 wdata
->args
.offset
, wdata
->args
.count
);
519 ret
= ore_write(objios
->ios
);
524 _write_done(objios
->ios
, objios
);
/*
 * objio_pg_test - .pg_test callback of the read and write pageio ops.
 * @pgio: page I/O descriptor being built.
 * @prev: previous request, forwarded to pnfs_generic_pg_test().
 * @req:  request being considered.
 *
 * After the generic pnfs_generic_pg_test() check, returns whether
 * pg_count + req->wb_bytes still fits within the segment's
 * layout.max_io_length.
 * NOTE(review): the statement executed when pnfs_generic_pg_test() fails
 * is not visible in this listing.
 */
529 static bool objio_pg_test(struct nfs_pageio_descriptor
*pgio
,
530 struct nfs_page
*prev
, struct nfs_page
*req
)
532 if (!pnfs_generic_pg_test(pgio
, prev
, req
))
535 return pgio
->pg_count
+ req
->wb_bytes
<=
536 OBJIO_LSEG(pgio
->pg_lseg
)->layout
.max_io_length
;
/*
 * Page I/O operations for reads: generic pNFS init and doio helpers with
 * objio_pg_test as the test hook. Referenced by objlayout_type.pg_read_ops.
 */
539 static const struct nfs_pageio_ops objio_pg_read_ops
= {
540 .pg_init
= pnfs_generic_pg_init_read
,
541 .pg_test
= objio_pg_test
,
542 .pg_doio
= pnfs_generic_pg_readpages
,
/*
 * Page I/O operations for writes: generic pNFS init and doio helpers with
 * objio_pg_test as the test hook. Referenced by
 * objlayout_type.pg_write_ops.
 */
545 static const struct nfs_pageio_ops objio_pg_write_ops
= {
546 .pg_init
= pnfs_generic_pg_init_write
,
547 .pg_test
= objio_pg_test
,
548 .pg_doio
= pnfs_generic_pg_writepages
,
/*
 * Layout driver descriptor for LAYOUT_OSD2_OBJECTS. It is the object
 * passed to pnfs_register_layoutdriver() and
 * pnfs_unregister_layoutdriver() further down in this file. Most hooks
 * point at objlayout_* functions that are not defined in this listing; the
 * pageio ops tables and free_deviceid_node are defined in this file.
 */
551 static struct pnfs_layoutdriver_type objlayout_type
= {
552 .id
= LAYOUT_OSD2_OBJECTS
,
553 .name
= "LAYOUT_OSD2_OBJECTS",
554 .flags
= PNFS_LAYOUTRET_ON_SETATTR
|
555 PNFS_LAYOUTRET_ON_ERROR
,
557 .alloc_layout_hdr
= objlayout_alloc_layout_hdr
,
558 .free_layout_hdr
= objlayout_free_layout_hdr
,
560 .alloc_lseg
= objlayout_alloc_lseg
,
561 .free_lseg
= objlayout_free_lseg
,
563 .read_pagelist
= objlayout_read_pagelist
,
564 .write_pagelist
= objlayout_write_pagelist
,
565 .pg_read_ops
= &objio_pg_read_ops
,
566 .pg_write_ops
= &objio_pg_write_ops
,
568 .free_deviceid_node
= objio_free_deviceid_node
,
570 .encode_layoutcommit
= objlayout_encode_layoutcommit
,
571 .encode_layoutreturn
= objlayout_encode_layoutreturn
,
/* Module metadata: description, author and license strings. */
574 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
575 MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
576 MODULE_LICENSE("GPL");
/*
 * NOTE(review): the function header is not visible in this listing; since
 * module_init(objlayout_init) appears below, this is presumably the body
 * of objlayout_init - confirm. The visible lines register objlayout_type
 * with pnfs_register_layoutdriver() and carry one failure message and one
 * success message; the branch between them and the return statement are
 * not visible.
 */
581 int ret
= pnfs_register_layoutdriver(&objlayout_type
);
585 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
588 printk(KERN_INFO
"%s: Registered OSD pNFS Layout Driver\n",
/*
 * NOTE(review): the function header is not visible in this listing; since
 * module_exit(objlayout_exit) appears below, this is presumably the body
 * of objlayout_exit - confirm. The visible lines unregister objlayout_type
 * and log that the driver was unregistered.
 */
596 pnfs_unregister_layoutdriver(&objlayout_type
);
597 printk(KERN_INFO
"%s: Unregistered OSD pNFS Layout Driver\n",
/*
 * Module alias plus the module entry and exit hooks (objlayout_init and
 * objlayout_exit).
 */
601 MODULE_ALIAS("nfs-layouttype4-2");
603 module_init(objlayout_init
);
604 module_exit(objlayout_exit
);