2 * pNFS Objects layout implementation over open-osd initiator library
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <ooo@electrozaur.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <linux/module.h>
41 #include <scsi/osd_ore.h>
43 #include "objlayout.h"
44 #include "../internal.h"
46 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
48 struct objio_dev_ent
{
49 struct nfs4_deviceid_node id_node
;
54 objio_free_deviceid_node(struct nfs4_deviceid_node
*d
)
56 struct objio_dev_ent
*de
= container_of(d
, struct objio_dev_ent
, id_node
);
58 dprintk("%s: free od=%p\n", __func__
, de
->od
.od
);
59 osduld_put_device(de
->od
.od
);
63 struct objio_segment
{
64 struct pnfs_layout_segment lseg
;
66 struct ore_layout layout
;
67 struct ore_components oc
;
70 static inline struct objio_segment
*
71 OBJIO_LSEG(struct pnfs_layout_segment
*lseg
)
73 return container_of(lseg
, struct objio_segment
, lseg
);
78 struct objlayout_io_res oir
;
81 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
82 struct ore_io_state
*ios
;
85 /* Send and wait for a get_device_info of devices in the layout,
86 then look them up with the osd_initiator library */
87 struct nfs4_deviceid_node
*
88 objio_alloc_deviceid_node(struct nfs_server
*server
, struct pnfs_device
*pdev
,
91 struct pnfs_osd_deviceaddr
*deviceaddr
;
92 struct objio_dev_ent
*ode
= NULL
;
94 struct osd_dev_info odi
;
95 bool retry_flag
= true;
99 deviceaddr
= kzalloc(sizeof(*deviceaddr
), gfp_flags
);
103 p
= page_address(pdev
->pages
[0]);
104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr
, p
);
106 odi
.systemid_len
= deviceaddr
->oda_systemid
.len
;
107 if (odi
.systemid_len
> sizeof(odi
.systemid
)) {
108 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
109 __func__
, sizeof(odi
.systemid
));
112 } else if (odi
.systemid_len
)
113 memcpy(odi
.systemid
, deviceaddr
->oda_systemid
.data
,
115 odi
.osdname_len
= deviceaddr
->oda_osdname
.len
;
116 odi
.osdname
= (u8
*)deviceaddr
->oda_osdname
.data
;
118 if (!odi
.osdname_len
&& !odi
.systemid_len
) {
119 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
126 od
= osduld_info_lookup(&odi
);
127 if (unlikely(IS_ERR(od
))) {
129 dprintk("%s: osduld_info_lookup => %d\n", __func__
, err
);
130 if (err
== -ENODEV
&& retry_flag
) {
131 err
= objlayout_autologin(deviceaddr
);
140 dprintk("Adding new dev_id(%llx:%llx)\n",
141 _DEVID_LO(&pdev
->dev_id
), _DEVID_HI(&pdev
->dev_id
));
143 ode
= kzalloc(sizeof(*ode
), gfp_flags
);
145 dprintk("%s: -ENOMEM od=%p\n", __func__
, od
);
149 nfs4_init_deviceid_node(&ode
->id_node
, server
, &pdev
->dev_id
);
153 return &ode
->id_node
;
160 static void copy_single_comp(struct ore_components
*oc
, unsigned c
,
161 struct pnfs_osd_object_cred
*src_comp
)
163 struct ore_comp
*ocomp
= &oc
->comps
[c
];
165 WARN_ON(src_comp
->oc_cap_key
.cred_len
> 0); /* libosd is NO_SEC only */
166 WARN_ON(src_comp
->oc_cap
.cred_len
> sizeof(ocomp
->cred
));
168 ocomp
->obj
.partition
= src_comp
->oc_object_id
.oid_partition_id
;
169 ocomp
->obj
.id
= src_comp
->oc_object_id
.oid_object_id
;
171 memcpy(ocomp
->cred
, src_comp
->oc_cap
.cred
, sizeof(ocomp
->cred
));
174 static int __alloc_objio_seg(unsigned numdevs
, gfp_t gfp_flags
,
175 struct objio_segment
**pseg
)
177 /* This is the in memory structure of the objio_segment
179 * struct __alloc_objio_segment {
180 * struct objio_segment olseg;
181 * struct ore_dev *ods[numdevs];
182 * struct ore_comp comps[numdevs];
184 * NOTE: The code as above compiles and runs perfectly. It is elegant,
185 * type safe and compact. At some past time Linus decided he does not
186 * like variable length arrays. For the sake of this principle we uglify
189 struct objio_segment
*lseg
;
190 size_t lseg_size
= sizeof(*lseg
) +
191 numdevs
* sizeof(lseg
->oc
.ods
[0]) +
192 numdevs
* sizeof(*lseg
->oc
.comps
);
194 lseg
= kzalloc(lseg_size
, gfp_flags
);
195 if (unlikely(!lseg
)) {
196 dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__
,
201 lseg
->oc
.numdevs
= numdevs
;
202 lseg
->oc
.single_comp
= EC_MULTPLE_COMPS
;
203 lseg
->oc
.ods
= (void *)(lseg
+ 1);
204 lseg
->oc
.comps
= (void *)(lseg
->oc
.ods
+ numdevs
);
210 int objio_alloc_lseg(struct pnfs_layout_segment
**outp
,
211 struct pnfs_layout_hdr
*pnfslay
,
212 struct pnfs_layout_range
*range
,
213 struct xdr_stream
*xdr
,
216 struct nfs_server
*server
= NFS_SERVER(pnfslay
->plh_inode
);
217 struct objio_segment
*objio_seg
;
218 struct pnfs_osd_xdr_decode_layout_iter iter
;
219 struct pnfs_osd_layout layout
;
220 struct pnfs_osd_object_cred src_comp
;
224 err
= pnfs_osd_xdr_decode_layout_map(&layout
, &iter
, xdr
);
228 err
= __alloc_objio_seg(layout
.olo_num_comps
, gfp_flags
, &objio_seg
);
232 objio_seg
->layout
.stripe_unit
= layout
.olo_map
.odm_stripe_unit
;
233 objio_seg
->layout
.group_width
= layout
.olo_map
.odm_group_width
;
234 objio_seg
->layout
.group_depth
= layout
.olo_map
.odm_group_depth
;
235 objio_seg
->layout
.mirrors_p1
= layout
.olo_map
.odm_mirror_cnt
+ 1;
236 objio_seg
->layout
.raid_algorithm
= layout
.olo_map
.odm_raid_algorithm
;
238 err
= ore_verify_layout(layout
.olo_map
.odm_num_comps
,
243 objio_seg
->oc
.first_dev
= layout
.olo_comps_index
;
245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp
, &iter
, xdr
, &err
)) {
246 struct nfs4_deviceid_node
*d
;
247 struct objio_dev_ent
*ode
;
249 copy_single_comp(&objio_seg
->oc
, cur_comp
, &src_comp
);
251 d
= nfs4_find_get_deviceid(server
,
252 &src_comp
.oc_object_id
.oid_device_id
,
253 pnfslay
->plh_lc_cred
, gfp_flags
);
259 ode
= container_of(d
, struct objio_dev_ent
, id_node
);
260 objio_seg
->oc
.ods
[cur_comp
++] = &ode
->od
;
262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
266 *outp
= &objio_seg
->lseg
;
271 dprintk("%s: Error: return %d\n", __func__
, err
);
276 void objio_free_lseg(struct pnfs_layout_segment
*lseg
)
279 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
281 for (i
= 0; i
< objio_seg
->oc
.numdevs
; i
++) {
282 struct ore_dev
*od
= objio_seg
->oc
.ods
[i
];
283 struct objio_dev_ent
*ode
;
287 ode
= container_of(od
, typeof(*ode
), od
);
288 nfs4_put_deviceid_node(&ode
->id_node
);
294 objio_alloc_io_state(struct pnfs_layout_hdr
*pnfs_layout_type
, bool is_reading
,
295 struct pnfs_layout_segment
*lseg
, struct page
**pages
, unsigned pgbase
,
296 loff_t offset
, size_t count
, void *rpcdata
, gfp_t gfp_flags
,
297 struct objio_state
**outp
)
299 struct objio_segment
*objio_seg
= OBJIO_LSEG(lseg
);
300 struct ore_io_state
*ios
;
302 struct __alloc_objio_state
{
303 struct objio_state objios
;
304 struct pnfs_osd_ioerr ioerrs
[objio_seg
->oc
.numdevs
];
307 aos
= kzalloc(sizeof(*aos
), gfp_flags
);
311 objlayout_init_ioerrs(&aos
->objios
.oir
, objio_seg
->oc
.numdevs
,
312 aos
->ioerrs
, rpcdata
, pnfs_layout_type
);
314 ret
= ore_get_rw_state(&objio_seg
->layout
, &objio_seg
->oc
, is_reading
,
315 offset
, count
, &ios
);
322 ios
->pgbase
= pgbase
;
324 BUG_ON(ios
->nr_pages
> (pgbase
+ count
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
);
326 aos
->objios
.sync
= 0;
327 aos
->objios
.ios
= ios
;
328 *outp
= &aos
->objios
;
332 void objio_free_result(struct objlayout_io_res
*oir
)
334 struct objio_state
*objios
= container_of(oir
, struct objio_state
, oir
);
336 ore_put_io_state(objios
->ios
);
340 static enum pnfs_osd_errno
osd_pri_2_pnfs_err(enum osd_err_priority oep
)
343 case OSD_ERR_PRI_NO_ERROR
:
344 return (enum pnfs_osd_errno
)0;
346 case OSD_ERR_PRI_CLEAR_PAGES
:
350 case OSD_ERR_PRI_RESOURCE
:
351 return PNFS_OSD_ERR_RESOURCE
;
352 case OSD_ERR_PRI_BAD_CRED
:
353 return PNFS_OSD_ERR_BAD_CRED
;
354 case OSD_ERR_PRI_NO_ACCESS
:
355 return PNFS_OSD_ERR_NO_ACCESS
;
356 case OSD_ERR_PRI_UNREACHABLE
:
357 return PNFS_OSD_ERR_UNREACHABLE
;
358 case OSD_ERR_PRI_NOT_FOUND
:
359 return PNFS_OSD_ERR_NOT_FOUND
;
360 case OSD_ERR_PRI_NO_SPACE
:
361 return PNFS_OSD_ERR_NO_SPACE
;
365 case OSD_ERR_PRI_EIO
:
366 return PNFS_OSD_ERR_EIO
;
370 static void __on_dev_error(struct ore_io_state
*ios
,
371 struct ore_dev
*od
, unsigned dev_index
, enum osd_err_priority oep
,
372 u64 dev_offset
, u64 dev_len
)
374 struct objio_state
*objios
= ios
->private;
375 struct pnfs_osd_objid pooid
;
376 struct objio_dev_ent
*ode
= container_of(od
, typeof(*ode
), od
);
377 /* FIXME: what to do with more-than-one-group layouts. We need to
378 * translate from ore_io_state index to oc->comps index
380 unsigned comp
= dev_index
;
382 pooid
.oid_device_id
= ode
->id_node
.deviceid
;
383 pooid
.oid_partition_id
= ios
->oc
->comps
[comp
].obj
.partition
;
384 pooid
.oid_object_id
= ios
->oc
->comps
[comp
].obj
.id
;
386 objlayout_io_set_result(&objios
->oir
, comp
,
387 &pooid
, osd_pri_2_pnfs_err(oep
),
388 dev_offset
, dev_len
, !ios
->reading
);
394 static void _read_done(struct ore_io_state
*ios
, void *private)
396 struct objio_state
*objios
= private;
398 int ret
= ore_check_io(ios
, &__on_dev_error
);
400 /* FIXME: _io_free(ios) - can we deallocate the libosd resources? */
403 status
= ios
->length
;
407 objlayout_read_done(&objios
->oir
, status
, objios
->sync
);
410 int objio_read_pagelist(struct nfs_pgio_header
*hdr
)
412 struct objio_state
*objios
;
415 ret
= objio_alloc_io_state(NFS_I(hdr
->inode
)->layout
, true,
416 hdr
->lseg
, hdr
->args
.pages
, hdr
->args
.pgbase
,
417 hdr
->args
.offset
, hdr
->args
.count
, hdr
,
418 GFP_KERNEL
, &objios
);
422 objios
->ios
->done
= _read_done
;
423 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
424 hdr
->args
.offset
, hdr
->args
.count
);
425 ret
= ore_read(objios
->ios
);
427 objio_free_result(&objios
->oir
);
434 static void _write_done(struct ore_io_state
*ios
, void *private)
436 struct objio_state
*objios
= private;
438 int ret
= ore_check_io(ios
, &__on_dev_error
);
440 /* FIXME: _io_free(ios) - can we deallocate the libosd resources? */
443 /* FIXME: should be based on the OSD's persistence model
444 * See OSD2r05 Section 4.13 Data persistence model */
445 objios
->oir
.committed
= NFS_FILE_SYNC
;
446 status
= ios
->length
;
451 objlayout_write_done(&objios
->oir
, status
, objios
->sync
);
454 static struct page
*__r4w_get_page(void *priv
, u64 offset
, bool *uptodate
)
456 struct objio_state
*objios
= priv
;
457 struct nfs_pgio_header
*hdr
= objios
->oir
.rpcdata
;
458 struct address_space
*mapping
= hdr
->inode
->i_mapping
;
459 pgoff_t index
= offset
/ PAGE_SIZE
;
461 loff_t i_size
= i_size_read(hdr
->inode
);
463 if (offset
>= i_size
) {
465 dprintk("%s: g_zero_page index=0x%lx\n", __func__
, index
);
469 page
= find_get_page(mapping
, index
);
471 page
= find_or_create_page(mapping
, index
, GFP_NOFS
);
472 if (unlikely(!page
)) {
473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
479 if (PageDirty(page
) || PageWriteback(page
))
482 *uptodate
= PageUptodate(page
);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__
, index
, *uptodate
);
487 static void __r4w_put_page(void *priv
, struct page
*page
)
489 dprintk("%s: index=0x%lx\n", __func__
,
490 (page
== ZERO_PAGE(0)) ? -1UL : page
->index
);
491 if (ZERO_PAGE(0) != page
)
492 page_cache_release(page
);
496 static const struct _ore_r4w_op _r4w_op
= {
497 .get_page
= &__r4w_get_page
,
498 .put_page
= &__r4w_put_page
,
501 int objio_write_pagelist(struct nfs_pgio_header
*hdr
, int how
)
503 struct objio_state
*objios
;
506 ret
= objio_alloc_io_state(NFS_I(hdr
->inode
)->layout
, false,
507 hdr
->lseg
, hdr
->args
.pages
, hdr
->args
.pgbase
,
508 hdr
->args
.offset
, hdr
->args
.count
, hdr
, GFP_NOFS
,
513 objios
->sync
= 0 != (how
& FLUSH_SYNC
);
514 objios
->ios
->r4w
= &_r4w_op
;
517 objios
->ios
->done
= _write_done
;
519 dprintk("%s: offset=0x%llx length=0x%x\n", __func__
,
520 hdr
->args
.offset
, hdr
->args
.count
);
521 ret
= ore_write(objios
->ios
);
523 objio_free_result(&objios
->oir
);
528 _write_done(objios
->ios
, objios
);
534 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
535 * of bytes (maximum @req->wb_bytes) that can be coalesced.
537 static size_t objio_pg_test(struct nfs_pageio_descriptor
*pgio
,
538 struct nfs_page
*prev
, struct nfs_page
*req
)
540 struct nfs_pgio_mirror
*mirror
= nfs_pgio_current_mirror(pgio
);
543 size
= pnfs_generic_pg_test(pgio
, prev
, req
);
545 if (!size
|| mirror
->pg_count
+ req
->wb_bytes
>
546 (unsigned long)pgio
->pg_layout_private
)
549 return min(size
, req
->wb_bytes
);
552 static void objio_init_read(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
554 pnfs_generic_pg_init_read(pgio
, req
);
555 if (unlikely(pgio
->pg_lseg
== NULL
))
556 return; /* Not pNFS */
558 pgio
->pg_layout_private
= (void *)
559 OBJIO_LSEG(pgio
->pg_lseg
)->layout
.max_io_length
;
562 static bool aligned_on_raid_stripe(u64 offset
, struct ore_layout
*layout
,
563 unsigned long *stripe_end
)
566 unsigned stripe_size
;
568 if (layout
->raid_algorithm
== PNFS_OSD_RAID_0
)
571 stripe_size
= layout
->stripe_unit
*
572 (layout
->group_width
- layout
->parity
);
574 div_u64_rem(offset
, stripe_size
, &stripe_off
);
578 *stripe_end
= stripe_size
- stripe_off
;
582 static void objio_init_write(struct nfs_pageio_descriptor
*pgio
, struct nfs_page
*req
)
584 unsigned long stripe_end
= 0;
587 if (pgio
->pg_dreq
== NULL
)
588 wb_size
= i_size_read(pgio
->pg_inode
) - req_offset(req
);
590 wb_size
= nfs_dreq_bytes_left(pgio
->pg_dreq
);
592 pnfs_generic_pg_init_write(pgio
, req
, wb_size
);
593 if (unlikely(pgio
->pg_lseg
== NULL
))
594 return; /* Not pNFS */
596 if (req
->wb_offset
||
597 !aligned_on_raid_stripe(req
->wb_index
* PAGE_SIZE
,
598 &OBJIO_LSEG(pgio
->pg_lseg
)->layout
,
600 pgio
->pg_layout_private
= (void *)stripe_end
;
602 pgio
->pg_layout_private
= (void *)
603 OBJIO_LSEG(pgio
->pg_lseg
)->layout
.max_io_length
;
607 static const struct nfs_pageio_ops objio_pg_read_ops
= {
608 .pg_init
= objio_init_read
,
609 .pg_test
= objio_pg_test
,
610 .pg_doio
= pnfs_generic_pg_readpages
,
611 .pg_cleanup
= pnfs_generic_pg_cleanup
,
614 static const struct nfs_pageio_ops objio_pg_write_ops
= {
615 .pg_init
= objio_init_write
,
616 .pg_test
= objio_pg_test
,
617 .pg_doio
= pnfs_generic_pg_writepages
,
618 .pg_cleanup
= pnfs_generic_pg_cleanup
,
621 static struct pnfs_layoutdriver_type objlayout_type
= {
622 .id
= LAYOUT_OSD2_OBJECTS
,
623 .name
= "LAYOUT_OSD2_OBJECTS",
624 .flags
= PNFS_LAYOUTRET_ON_SETATTR
|
625 PNFS_LAYOUTRET_ON_ERROR
,
627 .max_deviceinfo_size
= PAGE_SIZE
,
628 .owner
= THIS_MODULE
,
629 .alloc_layout_hdr
= objlayout_alloc_layout_hdr
,
630 .free_layout_hdr
= objlayout_free_layout_hdr
,
632 .alloc_lseg
= objlayout_alloc_lseg
,
633 .free_lseg
= objlayout_free_lseg
,
635 .read_pagelist
= objlayout_read_pagelist
,
636 .write_pagelist
= objlayout_write_pagelist
,
637 .pg_read_ops
= &objio_pg_read_ops
,
638 .pg_write_ops
= &objio_pg_write_ops
,
640 .sync
= pnfs_generic_sync
,
642 .free_deviceid_node
= objio_free_deviceid_node
,
644 .encode_layoutcommit
= objlayout_encode_layoutcommit
,
645 .encode_layoutreturn
= objlayout_encode_layoutreturn
,
648 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
649 MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
650 MODULE_LICENSE("GPL");
655 int ret
= pnfs_register_layoutdriver(&objlayout_type
);
659 "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
662 printk(KERN_INFO
"NFS: %s: Registered OSD pNFS Layout Driver\n",
670 pnfs_unregister_layoutdriver(&objlayout_type
);
671 printk(KERN_INFO
"NFS: %s: Unregistered OSD pNFS Layout Driver\n",
675 MODULE_ALIAS("nfs-layouttype4-2");
677 module_init(objlayout_init
);
678 module_exit(objlayout_exit
);