2 * pNFS Objects layout implementation over open-osd initiator library
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <linux/module.h>
41 #include <scsi/osd_ore.h>
43 #include "objlayout.h"
44 #include "../internal.h"
46 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
48 struct objio_dev_ent
{
49 struct nfs4_deviceid_node id_node
;
54 objio_free_deviceid_node(struct nfs4_deviceid_node
*d
)
56 struct objio_dev_ent
*de
= container_of(d
, struct objio_dev_ent
, id_node
);
58 dprintk("%s: free od=%p\n", __func__
, de
->od
.od
);
59 osduld_put_device(de
->od
.od
);
63 static struct objio_dev_ent
*_dev_list_find(const struct nfs_server
*nfss
,
64 const struct nfs4_deviceid
*d_id
)
66 struct nfs4_deviceid_node
*d
;
67 struct objio_dev_ent
*de
;
69 d
= nfs4_find_get_deviceid(nfss
->pnfs_curr_ld
, nfss
->nfs_client
, d_id
);
73 de
= container_of(d
, struct objio_dev_ent
, id_node
);
77 static struct objio_dev_ent
*
78 _dev_list_add(const struct nfs_server
*nfss
,
79 const struct nfs4_deviceid
*d_id
, struct osd_dev
*od
,
82 struct nfs4_deviceid_node
*d
;
83 struct objio_dev_ent
*de
= kzalloc(sizeof(*de
), gfp_flags
);
84 struct objio_dev_ent
*n
;
87 dprintk("%s: -ENOMEM od=%p\n", __func__
, od
);
91 dprintk("%s: Adding od=%p\n", __func__
, od
);
92 nfs4_init_deviceid_node(&de
->id_node
,
98 d
= nfs4_insert_deviceid_node(&de
->id_node
);
99 n
= container_of(d
, struct objio_dev_ent
, id_node
);
101 dprintk("%s: Race with other n->od=%p\n", __func__
, n
->od
.od
);
102 objio_free_deviceid_node(&de
->id_node
);
109 struct objio_segment
{
110 struct pnfs_layout_segment lseg
;
112 struct ore_layout layout
;
113 struct ore_components oc
;
116 static inline struct objio_segment
*
117 OBJIO_LSEG(struct pnfs_layout_segment
*lseg
)
119 return container_of(lseg
, struct objio_segment
, lseg
);
124 struct objlayout_io_res oir
;
127 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
128 struct ore_io_state
*ios
;
131 /* Send and wait for a get_device_info of devices in the layout,
132 then look them up with the osd_initiator library */
133 static int objio_devices_lookup(struct pnfs_layout_hdr
*pnfslay
,
134 struct objio_segment
*objio_seg
, unsigned c
, struct nfs4_deviceid
*d_id
,
137 struct pnfs_osd_deviceaddr
*deviceaddr
;
138 struct objio_dev_ent
*ode
;
140 struct osd_dev_info odi
;
141 bool retry_flag
= true;
144 ode
= _dev_list_find(NFS_SERVER(pnfslay
->plh_inode
), d_id
);
146 objio_seg
->oc
.ods
[c
] = &ode
->od
; /* must use container_of */
150 err
= objlayout_get_deviceinfo(pnfslay
, d_id
, &deviceaddr
, gfp_flags
);
152 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
153 __func__
, _DEVID_LO(d_id
), _DEVID_HI(d_id
), err
);
157 odi
.systemid_len
= deviceaddr
->oda_systemid
.len
;
158 if (odi
.systemid_len
> sizeof(odi
.systemid
)) {
159 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
160 __func__
, sizeof(odi
.systemid
));
163 } else if (odi
.systemid_len
)
164 memcpy(odi
.systemid
, deviceaddr
->oda_systemid
.data
,
166 odi
.osdname_len
= deviceaddr
->oda_osdname
.len
;
167 odi
.osdname
= (u8
*)deviceaddr
->oda_osdname
.data
;
169 if (!odi
.osdname_len
&& !odi
.systemid_len
) {
170 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
177 od
= osduld_info_lookup(&odi
);
178 if (unlikely(IS_ERR(od
))) {
180 dprintk("%s: osduld_info_lookup => %d\n", __func__
, err
);
181 if (err
== -ENODEV
&& retry_flag
) {
182 err
= objlayout_autologin(deviceaddr
);
191 ode
= _dev_list_add(NFS_SERVER(pnfslay
->plh_inode
), d_id
, od
,
193 objio_seg
->oc
.ods
[c
] = &ode
->od
; /* must use container_of */
194 dprintk("Adding new dev_id(%llx:%llx)\n",
195 _DEVID_LO(d_id
), _DEVID_HI(d_id
));
197 objlayout_put_deviceinfo(deviceaddr
);
201 static void copy_single_comp(struct ore_components
*oc
, unsigned c
,
202 struct pnfs_osd_object_cred
*src_comp
)
204 struct ore_comp
*ocomp
= &oc
->comps
[c
];
206 WARN_ON(src_comp
->oc_cap_key
.cred_len
> 0); /* libosd is NO_SEC only */
207 WARN_ON(src_comp
->oc_cap
.cred_len
> sizeof(ocomp
->cred
));
209 ocomp
->obj
.partition
= src_comp
->oc_object_id
.oid_partition_id
;
210 ocomp
->obj
.id
= src_comp
->oc_object_id
.oid_object_id
;
212 memcpy(ocomp
->cred
, src_comp
->oc_cap
.cred
, sizeof(ocomp
->cred
));
215 static int __alloc_objio_seg(unsigned numdevs
, gfp_t gfp_flags
,
216 struct objio_segment
**pseg
)
218 /* This is the in memory structure of the objio_segment
220 * struct __alloc_objio_segment {
221 * struct objio_segment olseg;
222 * struct ore_dev *ods[numdevs];
223 * struct ore_comp comps[numdevs];
225 * NOTE: The code as above compiles and runs perfectly. It is elegant,
226 * type safe and compact. At some past time Linus decided he does not
227 * like variable length arrays. For the sake of this principle we uglify
230 struct objio_segment
*lseg
;
231 size_t lseg_size
= sizeof(*lseg
) +
232 numdevs
* sizeof(lseg
->oc
.ods
[0]) +
233 numdevs
* sizeof(*lseg
->oc
.comps
);
235 lseg
= kzalloc(lseg_size
, gfp_flags
);
236 if (unlikely(!lseg
)) {
237 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__
,
242 lseg
->oc
.numdevs
= numdevs
;
243 lseg
->oc
.single_comp
= EC_MULTPLE_COMPS
;
244 lseg
->oc
.ods
= (void *)(lseg
+ 1);
245 lseg
->oc
.comps
= (void *)(lseg
->oc
.ods
+ numdevs
);
251 int objio_alloc_lseg(struct pnfs_layout_segment
**outp
,
252 struct pnfs_layout_hdr
*pnfslay
,
253 struct pnfs_layout_range
*range
,
254 struct xdr_stream
*xdr
,
257 struct objio_segment
*objio_seg
;
258 struct pnfs_osd_xdr_decode_layout_iter iter
;
259 struct pnfs_osd_layout layout
;
260 struct pnfs_osd_object_cred src_comp
;
264 err
= pnfs_osd_xdr_decode_layout_map(&layout
, &iter
, xdr
);
268 err
= __alloc_objio_seg(layout
.olo_num_comps
, gfp_flags
, &objio_seg
);
272 objio_seg
->layout
.stripe_unit
= layout
.olo_map
.odm_stripe_unit
;
273 objio_seg
->layout
.group_width
= layout
.olo_map
.odm_group_width
;
274 objio_seg
->layout
.group_depth
= layout
.olo_map
.odm_group_depth
;
275 objio_seg
->layout
.mirrors_p1
= layout
.olo_map
.odm_mirror_cnt
+ 1;
276 objio_seg
->layout
.raid_algorithm
= layout
.olo_map
.odm_raid_algorithm
;
278 err
= ore_verify_layout(layout
.olo_map
.odm_num_comps
,
283 objio_seg
->oc
.first_dev
= layout
.olo_comps_index
;
285 while (pnfs_osd_xdr_decode_layout_comp(&src_comp
, &iter
, xdr
, &err
)) {
286 copy_single_comp(&objio_seg
->oc
, cur_comp
, &src_comp
);
287 err
= objio_devices_lookup(pnfslay
, objio_seg
, cur_comp
,
288 &src_comp
.oc_object_id
.oid_device_id
,
294 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
298 *outp
= &objio_seg
->lseg
;
303 dprintk("%s: Error: return %d\n", __func__
, err
);
308 void objio_free_lseg(struct pnfs_layout_segment
*lseg
)
311 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
313 for (i
= 0; i
< objio_seg
->oc
.numdevs
; i
++) {
314 struct ore_dev
*od
= objio_seg
->oc
.ods
[i
];
315 struct objio_dev_ent
*ode
;
319 ode
= container_of(od
, typeof(*ode
), od
);
320 nfs4_put_deviceid_node(&ode
->id_node
);
326 objio_alloc_io_state(struct pnfs_layout_hdr
*pnfs_layout_type
, bool is_reading
,
327 struct pnfs_layout_segment
*lseg
, struct page
**pages
, unsigned pgbase
,
328 loff_t offset
, size_t count
, void *rpcdata
, gfp_t gfp_flags
,
329 struct objio_state
**outp
)
331 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
332 struct ore_io_state
*ios
;
334 struct __alloc_objio_state
{
335 struct objio_state objios
;
336 struct pnfs_osd_ioerr ioerrs
[objio_seg
->oc
.numdevs
];
339 aos
= kzalloc(sizeof(*aos
), gfp_flags
);
343 objlayout_init_ioerrs(&aos
->objios
.oir
, objio_seg
->oc
.numdevs
,
344 aos
->ioerrs
, rpcdata
, pnfs_layout_type
);
346 ret
= ore_get_rw_state(&objio_seg
->layout
, &objio_seg
->oc
, is_reading
,
347 offset
, count
, &ios
);
354 ios
->pgbase
= pgbase
;
356 BUG_ON(ios
->nr_pages
> (pgbase
+ count
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
);
358 aos
->objios
.sync
= 0;
359 aos
->objios
.ios
= ios
;
360 *outp
= &aos
->objios
;
364 void objio_free_result(struct objlayout_io_res
*oir
)
366 struct objio_state
*objios
= container_of(oir
, struct objio_state
, oir
);
368 ore_put_io_state(objios
->ios
);
372 static enum pnfs_osd_errno
osd_pri_2_pnfs_err(enum osd_err_priority oep
)
375 case OSD_ERR_PRI_NO_ERROR
:
376 return (enum pnfs_osd_errno
)0;
378 case OSD_ERR_PRI_CLEAR_PAGES
:
382 case OSD_ERR_PRI_RESOURCE
:
383 return PNFS_OSD_ERR_RESOURCE
;
384 case OSD_ERR_PRI_BAD_CRED
:
385 return PNFS_OSD_ERR_BAD_CRED
;
386 case OSD_ERR_PRI_NO_ACCESS
:
387 return PNFS_OSD_ERR_NO_ACCESS
;
388 case OSD_ERR_PRI_UNREACHABLE
:
389 return PNFS_OSD_ERR_UNREACHABLE
;
390 case OSD_ERR_PRI_NOT_FOUND
:
391 return PNFS_OSD_ERR_NOT_FOUND
;
392 case OSD_ERR_PRI_NO_SPACE
:
393 return PNFS_OSD_ERR_NO_SPACE
;
397 case OSD_ERR_PRI_EIO
:
398 return PNFS_OSD_ERR_EIO
;
402 static void __on_dev_error(struct ore_io_state
*ios
,
403 struct ore_dev
*od
, unsigned dev_index
, enum osd_err_priority oep
,
404 u64 dev_offset
, u64 dev_len
)
406 struct objio_state
*objios
= ios
->private;
407 struct pnfs_osd_objid pooid
;
408 struct objio_dev_ent
*ode
= container_of(od
, typeof(*ode
), od
);
409 /* FIXME: what to do with more-than-one-group layouts. We need to
410 * translate from ore_io_state index to oc->comps index
412 unsigned comp
= dev_index
;
414 pooid
.oid_device_id
= ode
->id_node
.deviceid
;
415 pooid
.oid_partition_id
= ios
->oc
->comps
[comp
].obj
.partition
;
416 pooid
.oid_object_id
= ios
->oc
->comps
[comp
].obj
.id
;
418 objlayout_io_set_result(&objios
->oir
, comp
,
419 &pooid
, osd_pri_2_pnfs_err(oep
),
420 dev_offset
, dev_len
, !ios
->reading
);
426 static void _read_done(struct ore_io_state
*ios
, void *private)
428 struct objio_state
*objios
= private;
430 int ret
= ore_check_io(ios
, &__on_dev_error
);
432 /* FIXME: _io_free(ios) can we deallocate the libosd resources; */
435 status
= ios
->length
;
439 objlayout_read_done(&objios
->oir
, status
, objios
->sync
);
442 int objio_read_pagelist(struct nfs_read_data
*rdata
)
444 struct nfs_pgio_header
*hdr
= rdata
->header
;
445 struct objio_state
*objios
;
448 ret
= objio_alloc_io_state(NFS_I(hdr
->inode
)->layout
, true,
449 hdr
->lseg
, rdata
->args
.pages
, rdata
->args
.pgbase
,
450 rdata
->args
.offset
, rdata
->args
.count
, rdata
,
451 GFP_KERNEL
, &objios
);
455 objios
->ios
->done
= _read_done
;
456 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
457 rdata
->args
.offset
, rdata
->args
.count
);
458 ret
= ore_read(objios
->ios
);
460 objio_free_result(&objios
->oir
);
467 static void _write_done(struct ore_io_state
*ios
, void *private)
469 struct objio_state
*objios
= private;
471 int ret
= ore_check_io(ios
, &__on_dev_error
);
473 /* FIXME: _io_free(ios) can we deallocate the libosd resources; */
476 /* FIXME: should be based on the OSD's persistence model
477 * See OSD2r05 Section 4.13 Data persistence model */
478 objios
->oir
.committed
= NFS_FILE_SYNC
;
479 status
= ios
->length
;
484 objlayout_write_done(&objios
->oir
, status
, objios
->sync
);
487 static struct page
*__r4w_get_page(void *priv
, u64 offset
, bool *uptodate
)
489 struct objio_state
*objios
= priv
;
490 struct nfs_write_data
*wdata
= objios
->oir
.rpcdata
;
491 struct address_space
*mapping
= wdata
->header
->inode
->i_mapping
;
492 pgoff_t index
= offset
/ PAGE_SIZE
;
494 loff_t i_size
= i_size_read(wdata
->header
->inode
);
496 if (offset
>= i_size
) {
498 dprintk("%s: g_zero_page index=0x%lx\n", __func__
, index
);
502 page
= find_get_page(mapping
, index
);
504 page
= find_or_create_page(mapping
, index
, GFP_NOFS
);
505 if (unlikely(!page
)) {
506 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
512 if (PageDirty(page
) || PageWriteback(page
))
515 *uptodate
= PageUptodate(page
);
516 dprintk("%s: index=0x%lx uptodate=%d\n", __func__
, index
, *uptodate
);
520 static void __r4w_put_page(void *priv
, struct page
*page
)
522 dprintk("%s: index=0x%lx\n", __func__
,
523 (page
== ZERO_PAGE(0)) ? -1UL : page
->index
);
524 if (ZERO_PAGE(0) != page
)
525 page_cache_release(page
);
529 static const struct _ore_r4w_op _r4w_op
= {
530 .get_page
= &__r4w_get_page
,
531 .put_page
= &__r4w_put_page
,
534 int objio_write_pagelist(struct nfs_write_data
*wdata
, int how
)
536 struct nfs_pgio_header
*hdr
= wdata
->header
;
537 struct objio_state
*objios
;
540 ret
= objio_alloc_io_state(NFS_I(hdr
->inode
)->layout
, false,
541 hdr
->lseg
, wdata
->args
.pages
, wdata
->args
.pgbase
,
542 wdata
->args
.offset
, wdata
->args
.count
, wdata
, GFP_NOFS
,
547 objios
->sync
= 0 != (how
& FLUSH_SYNC
);
548 objios
->ios
->r4w
= &_r4w_op
;
551 objios
->ios
->done
= _write_done
;
553 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
554 wdata
->args
.offset
, wdata
->args
.count
);
555 ret
= ore_write(objios
->ios
);
557 objio_free_result(&objios
->oir
);
562 _write_done(objios
->ios
, objios
);
567 static bool objio_pg_test(struct nfs_pageio_descriptor
*pgio
,
568 struct nfs_page
*prev
, struct nfs_page
*req
)
570 if (!pnfs_generic_pg_test(pgio
, prev
, req
))
573 return pgio
->pg_count
+ req
->wb_bytes
<=
574 (unsigned long)pgio
->pg_layout_private
;
577 static void objio_init_read(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
579 pnfs_generic_pg_init_read(pgio
, req
);
580 if (unlikely(pgio
->pg_lseg
== NULL
))
581 return; /* Not pNFS */
583 pgio
->pg_layout_private
= (void *)
584 OBJIO_LSEG(pgio
->pg_lseg
)->layout
.max_io_length
;
587 static bool aligned_on_raid_stripe(u64 offset
, struct ore_layout
*layout
,
588 unsigned long *stripe_end
)
591 unsigned stripe_size
;
593 if (layout
->raid_algorithm
== PNFS_OSD_RAID_0
)
596 stripe_size
= layout
->stripe_unit
*
597 (layout
->group_width
- layout
->parity
);
599 div_u64_rem(offset
, stripe_size
, &stripe_off
);
603 *stripe_end
= stripe_size
- stripe_off
;
607 static void objio_init_write(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
609 unsigned long stripe_end
= 0;
612 if (pgio
->pg_dreq
== NULL
)
613 wb_size
= i_size_read(pgio
->pg_inode
) - req_offset(req
);
615 wb_size
= nfs_dreq_bytes_left(pgio
->pg_dreq
);
617 pnfs_generic_pg_init_write(pgio
, req
, wb_size
);
618 if (unlikely(pgio
->pg_lseg
== NULL
))
619 return; /* Not pNFS */
621 if (req
->wb_offset
||
622 !aligned_on_raid_stripe(req
->wb_index
* PAGE_SIZE
,
623 &OBJIO_LSEG(pgio
->pg_lseg
)->layout
,
625 pgio
->pg_layout_private
= (void *)stripe_end
;
627 pgio
->pg_layout_private
= (void *)
628 OBJIO_LSEG(pgio
->pg_lseg
)->layout
.max_io_length
;
632 static const struct nfs_pageio_ops objio_pg_read_ops
= {
633 .pg_init
= objio_init_read
,
634 .pg_test
= objio_pg_test
,
635 .pg_doio
= pnfs_generic_pg_readpages
,
638 static const struct nfs_pageio_ops objio_pg_write_ops
= {
639 .pg_init
= objio_init_write
,
640 .pg_test
= objio_pg_test
,
641 .pg_doio
= pnfs_generic_pg_writepages
,
644 static struct pnfs_layoutdriver_type objlayout_type
= {
645 .id
= LAYOUT_OSD2_OBJECTS
,
646 .name
= "LAYOUT_OSD2_OBJECTS",
647 .flags
= PNFS_LAYOUTRET_ON_SETATTR
|
648 PNFS_LAYOUTRET_ON_ERROR
,
650 .owner
= THIS_MODULE
,
651 .alloc_layout_hdr
= objlayout_alloc_layout_hdr
,
652 .free_layout_hdr
= objlayout_free_layout_hdr
,
654 .alloc_lseg
= objlayout_alloc_lseg
,
655 .free_lseg
= objlayout_free_lseg
,
657 .read_pagelist
= objlayout_read_pagelist
,
658 .write_pagelist
= objlayout_write_pagelist
,
659 .pg_read_ops
= &objio_pg_read_ops
,
660 .pg_write_ops
= &objio_pg_write_ops
,
662 .free_deviceid_node
= objio_free_deviceid_node
,
664 .encode_layoutcommit
= objlayout_encode_layoutcommit
,
665 .encode_layoutreturn
= objlayout_encode_layoutreturn
,
668 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
669 MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
670 MODULE_LICENSE("GPL");
675 int ret
= pnfs_register_layoutdriver(&objlayout_type
);
679 "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
682 printk(KERN_INFO
"NFS: %s: Registered OSD pNFS Layout Driver\n",
690 pnfs_unregister_layoutdriver(&objlayout_type
);
691 printk(KERN_INFO
"NFS: %s: Unregistered OSD pNFS Layout Driver\n",
695 MODULE_ALIAS("nfs-layouttype4-2");
697 module_init(objlayout_init
);
698 module_exit(objlayout_exit
);