net: Clone skb before setting peeked flag
[linux/fpc-iii.git] / fs / nfs / objlayout / objio_osd.c
blob5aaed363556a66e7bd5bb43d4619d037108f5421
1 /*
2 * pNFS Objects layout implementation over open-osd initiator library
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
5 * All rights reserved.
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <ooo@electrozaur.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <linux/module.h>
41 #include <scsi/osd_ore.h>
43 #include "objlayout.h"
44 #include "../internal.h"
46 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
48 struct objio_dev_ent {
49 struct nfs4_deviceid_node id_node;
50 struct ore_dev od;
53 static void
54 objio_free_deviceid_node(struct nfs4_deviceid_node *d)
56 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
58 dprintk("%s: free od=%p\n", __func__, de->od.od);
59 osduld_put_device(de->od.od);
60 kfree_rcu(d, rcu);
63 struct objio_segment {
64 struct pnfs_layout_segment lseg;
66 struct ore_layout layout;
67 struct ore_components oc;
70 static inline struct objio_segment *
71 OBJIO_LSEG(struct pnfs_layout_segment *lseg)
73 return container_of(lseg, struct objio_segment, lseg);
76 struct objio_state {
77 /* Generic layer */
78 struct objlayout_io_res oir;
80 bool sync;
81 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
82 struct ore_io_state *ios;
85 /* Send and wait for a get_device_info of devices in the layout,
86 then look them up with the osd_initiator library */
87 struct nfs4_deviceid_node *
88 objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
89 gfp_t gfp_flags)
91 struct pnfs_osd_deviceaddr *deviceaddr;
92 struct objio_dev_ent *ode = NULL;
93 struct osd_dev *od;
94 struct osd_dev_info odi;
95 bool retry_flag = true;
96 __be32 *p;
97 int err;
99 deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
100 if (!deviceaddr)
101 return NULL;
103 p = page_address(pdev->pages[0]);
104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
106 odi.systemid_len = deviceaddr->oda_systemid.len;
107 if (odi.systemid_len > sizeof(odi.systemid)) {
108 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
109 __func__, sizeof(odi.systemid));
110 err = -EINVAL;
111 goto out;
112 } else if (odi.systemid_len)
113 memcpy(odi.systemid, deviceaddr->oda_systemid.data,
114 odi.systemid_len);
115 odi.osdname_len = deviceaddr->oda_osdname.len;
116 odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
118 if (!odi.osdname_len && !odi.systemid_len) {
119 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
120 __func__);
121 err = -ENODEV;
122 goto out;
125 retry_lookup:
126 od = osduld_info_lookup(&odi);
127 if (unlikely(IS_ERR(od))) {
128 err = PTR_ERR(od);
129 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
130 if (err == -ENODEV && retry_flag) {
131 err = objlayout_autologin(deviceaddr);
132 if (likely(!err)) {
133 retry_flag = false;
134 goto retry_lookup;
137 goto out;
140 dprintk("Adding new dev_id(%llx:%llx)\n",
141 _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
143 ode = kzalloc(sizeof(*ode), gfp_flags);
144 if (!ode) {
145 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
146 goto out;
149 nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
150 kfree(deviceaddr);
152 ode->od.od = od;
153 return &ode->id_node;
155 out:
156 kfree(deviceaddr);
157 return NULL;
160 static void copy_single_comp(struct ore_components *oc, unsigned c,
161 struct pnfs_osd_object_cred *src_comp)
163 struct ore_comp *ocomp = &oc->comps[c];
165 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
166 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
168 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
169 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
171 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
174 static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
175 struct objio_segment **pseg)
177 /* This is the in memory structure of the objio_segment
179 * struct __alloc_objio_segment {
180 * struct objio_segment olseg;
181 * struct ore_dev *ods[numdevs];
182 * struct ore_comp comps[numdevs];
183 * } *aolseg;
184 * NOTE: The code as above compiles and runs perfectly. It is elegant,
185 * type safe and compact. At some Past time Linus has decided he does not
186 * like variable length arrays, For the sake of this principal we uglify
187 * the code as below.
189 struct objio_segment *lseg;
190 size_t lseg_size = sizeof(*lseg) +
191 numdevs * sizeof(lseg->oc.ods[0]) +
192 numdevs * sizeof(*lseg->oc.comps);
194 lseg = kzalloc(lseg_size, gfp_flags);
195 if (unlikely(!lseg)) {
196 dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
197 numdevs, lseg_size);
198 return -ENOMEM;
201 lseg->oc.numdevs = numdevs;
202 lseg->oc.single_comp = EC_MULTPLE_COMPS;
203 lseg->oc.ods = (void *)(lseg + 1);
204 lseg->oc.comps = (void *)(lseg->oc.ods + numdevs);
206 *pseg = lseg;
207 return 0;
210 int objio_alloc_lseg(struct pnfs_layout_segment **outp,
211 struct pnfs_layout_hdr *pnfslay,
212 struct pnfs_layout_range *range,
213 struct xdr_stream *xdr,
214 gfp_t gfp_flags)
216 struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
217 struct objio_segment *objio_seg;
218 struct pnfs_osd_xdr_decode_layout_iter iter;
219 struct pnfs_osd_layout layout;
220 struct pnfs_osd_object_cred src_comp;
221 unsigned cur_comp;
222 int err;
224 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
225 if (unlikely(err))
226 return err;
228 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
229 if (unlikely(err))
230 return err;
232 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
233 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
234 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
235 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
236 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
238 err = ore_verify_layout(layout.olo_map.odm_num_comps,
239 &objio_seg->layout);
240 if (unlikely(err))
241 goto err;
243 objio_seg->oc.first_dev = layout.olo_comps_index;
244 cur_comp = 0;
245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
246 struct nfs4_deviceid_node *d;
247 struct objio_dev_ent *ode;
249 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
251 d = nfs4_find_get_deviceid(server,
252 &src_comp.oc_object_id.oid_device_id,
253 pnfslay->plh_lc_cred, gfp_flags);
254 if (!d) {
255 err = -ENXIO;
256 goto err;
259 ode = container_of(d, struct objio_dev_ent, id_node);
260 objio_seg->oc.ods[cur_comp++] = &ode->od;
262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
263 if (unlikely(err))
264 goto err;
266 *outp = &objio_seg->lseg;
267 return 0;
269 err:
270 kfree(objio_seg);
271 dprintk("%s: Error: return %d\n", __func__, err);
272 *outp = NULL;
273 return err;
276 void objio_free_lseg(struct pnfs_layout_segment *lseg)
278 int i;
279 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
281 for (i = 0; i < objio_seg->oc.numdevs; i++) {
282 struct ore_dev *od = objio_seg->oc.ods[i];
283 struct objio_dev_ent *ode;
285 if (!od)
286 break;
287 ode = container_of(od, typeof(*ode), od);
288 nfs4_put_deviceid_node(&ode->id_node);
290 kfree(objio_seg);
293 static int
294 objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
295 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
296 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
297 struct objio_state **outp)
299 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
300 struct ore_io_state *ios;
301 int ret;
302 struct __alloc_objio_state {
303 struct objio_state objios;
304 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
305 } *aos;
307 aos = kzalloc(sizeof(*aos), gfp_flags);
308 if (unlikely(!aos))
309 return -ENOMEM;
311 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
312 aos->ioerrs, rpcdata, pnfs_layout_type);
314 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
315 offset, count, &ios);
316 if (unlikely(ret)) {
317 kfree(aos);
318 return ret;
321 ios->pages = pages;
322 ios->pgbase = pgbase;
323 ios->private = aos;
324 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
326 aos->objios.sync = 0;
327 aos->objios.ios = ios;
328 *outp = &aos->objios;
329 return 0;
332 void objio_free_result(struct objlayout_io_res *oir)
334 struct objio_state *objios = container_of(oir, struct objio_state, oir);
336 ore_put_io_state(objios->ios);
337 kfree(objios);
340 static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
342 switch (oep) {
343 case OSD_ERR_PRI_NO_ERROR:
344 return (enum pnfs_osd_errno)0;
346 case OSD_ERR_PRI_CLEAR_PAGES:
347 BUG_ON(1);
348 return 0;
350 case OSD_ERR_PRI_RESOURCE:
351 return PNFS_OSD_ERR_RESOURCE;
352 case OSD_ERR_PRI_BAD_CRED:
353 return PNFS_OSD_ERR_BAD_CRED;
354 case OSD_ERR_PRI_NO_ACCESS:
355 return PNFS_OSD_ERR_NO_ACCESS;
356 case OSD_ERR_PRI_UNREACHABLE:
357 return PNFS_OSD_ERR_UNREACHABLE;
358 case OSD_ERR_PRI_NOT_FOUND:
359 return PNFS_OSD_ERR_NOT_FOUND;
360 case OSD_ERR_PRI_NO_SPACE:
361 return PNFS_OSD_ERR_NO_SPACE;
362 default:
363 WARN_ON(1);
364 /* fallthrough */
365 case OSD_ERR_PRI_EIO:
366 return PNFS_OSD_ERR_EIO;
370 static void __on_dev_error(struct ore_io_state *ios,
371 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
372 u64 dev_offset, u64 dev_len)
374 struct objio_state *objios = ios->private;
375 struct pnfs_osd_objid pooid;
376 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
377 /* FIXME: what to do with more-then-one-group layouts. We need to
378 * translate from ore_io_state index to oc->comps index
380 unsigned comp = dev_index;
382 pooid.oid_device_id = ode->id_node.deviceid;
383 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
384 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
386 objlayout_io_set_result(&objios->oir, comp,
387 &pooid, osd_pri_2_pnfs_err(oep),
388 dev_offset, dev_len, !ios->reading);
392 * read
394 static void _read_done(struct ore_io_state *ios, void *private)
396 struct objio_state *objios = private;
397 ssize_t status;
398 int ret = ore_check_io(ios, &__on_dev_error);
400 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
402 if (likely(!ret))
403 status = ios->length;
404 else
405 status = ret;
407 objlayout_read_done(&objios->oir, status, objios->sync);
410 int objio_read_pagelist(struct nfs_pgio_header *hdr)
412 struct objio_state *objios;
413 int ret;
415 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
416 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
417 hdr->args.offset, hdr->args.count, hdr,
418 GFP_KERNEL, &objios);
419 if (unlikely(ret))
420 return ret;
422 objios->ios->done = _read_done;
423 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
424 hdr->args.offset, hdr->args.count);
425 ret = ore_read(objios->ios);
426 if (unlikely(ret))
427 objio_free_result(&objios->oir);
428 return ret;
432 * write
434 static void _write_done(struct ore_io_state *ios, void *private)
436 struct objio_state *objios = private;
437 ssize_t status;
438 int ret = ore_check_io(ios, &__on_dev_error);
440 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
442 if (likely(!ret)) {
443 /* FIXME: should be based on the OSD's persistence model
444 * See OSD2r05 Section 4.13 Data persistence model */
445 objios->oir.committed = NFS_FILE_SYNC;
446 status = ios->length;
447 } else {
448 status = ret;
451 objlayout_write_done(&objios->oir, status, objios->sync);
454 static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
456 struct objio_state *objios = priv;
457 struct nfs_pgio_header *hdr = objios->oir.rpcdata;
458 struct address_space *mapping = hdr->inode->i_mapping;
459 pgoff_t index = offset / PAGE_SIZE;
460 struct page *page;
461 loff_t i_size = i_size_read(hdr->inode);
463 if (offset >= i_size) {
464 *uptodate = true;
465 dprintk("%s: g_zero_page index=0x%lx\n", __func__, index);
466 return ZERO_PAGE(0);
469 page = find_get_page(mapping, index);
470 if (!page) {
471 page = find_or_create_page(mapping, index, GFP_NOFS);
472 if (unlikely(!page)) {
473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
477 unlock_page(page);
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
487 static void __r4w_put_page(void *priv, struct page *page)
489 dprintk("%s: index=0x%lx\n", __func__,
490 (page == ZERO_PAGE(0)) ? -1UL : page->index);
491 if (ZERO_PAGE(0) != page)
492 page_cache_release(page);
493 return;
496 static const struct _ore_r4w_op _r4w_op = {
497 .get_page = &__r4w_get_page,
498 .put_page = &__r4w_put_page,
501 int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
503 struct objio_state *objios;
504 int ret;
506 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
507 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
508 hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
509 &objios);
510 if (unlikely(ret))
511 return ret;
513 objios->sync = 0 != (how & FLUSH_SYNC);
514 objios->ios->r4w = &_r4w_op;
516 if (!objios->sync)
517 objios->ios->done = _write_done;
519 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
520 hdr->args.offset, hdr->args.count);
521 ret = ore_write(objios->ios);
522 if (unlikely(ret)) {
523 objio_free_result(&objios->oir);
524 return ret;
527 if (objios->sync)
528 _write_done(objios->ios, objios);
530 return 0;
534 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
535 * of bytes (maximum @req->wb_bytes) that can be coalesced.
537 static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
538 struct nfs_page *prev, struct nfs_page *req)
540 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
541 unsigned int size;
543 size = pnfs_generic_pg_test(pgio, prev, req);
545 if (!size || mirror->pg_count + req->wb_bytes >
546 (unsigned long)pgio->pg_layout_private)
547 return 0;
549 return min(size, req->wb_bytes);
552 static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
554 pnfs_generic_pg_init_read(pgio, req);
555 if (unlikely(pgio->pg_lseg == NULL))
556 return; /* Not pNFS */
558 pgio->pg_layout_private = (void *)
559 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
562 static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
563 unsigned long *stripe_end)
565 u32 stripe_off;
566 unsigned stripe_size;
568 if (layout->raid_algorithm == PNFS_OSD_RAID_0)
569 return true;
571 stripe_size = layout->stripe_unit *
572 (layout->group_width - layout->parity);
574 div_u64_rem(offset, stripe_size, &stripe_off);
575 if (!stripe_off)
576 return true;
578 *stripe_end = stripe_size - stripe_off;
579 return false;
582 static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
584 unsigned long stripe_end = 0;
585 u64 wb_size;
587 if (pgio->pg_dreq == NULL)
588 wb_size = i_size_read(pgio->pg_inode) - req_offset(req);
589 else
590 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
592 pnfs_generic_pg_init_write(pgio, req, wb_size);
593 if (unlikely(pgio->pg_lseg == NULL))
594 return; /* Not pNFS */
596 if (req->wb_offset ||
597 !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
598 &OBJIO_LSEG(pgio->pg_lseg)->layout,
599 &stripe_end)) {
600 pgio->pg_layout_private = (void *)stripe_end;
601 } else {
602 pgio->pg_layout_private = (void *)
603 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
607 static const struct nfs_pageio_ops objio_pg_read_ops = {
608 .pg_init = objio_init_read,
609 .pg_test = objio_pg_test,
610 .pg_doio = pnfs_generic_pg_readpages,
611 .pg_cleanup = pnfs_generic_pg_cleanup,
614 static const struct nfs_pageio_ops objio_pg_write_ops = {
615 .pg_init = objio_init_write,
616 .pg_test = objio_pg_test,
617 .pg_doio = pnfs_generic_pg_writepages,
618 .pg_cleanup = pnfs_generic_pg_cleanup,
621 static struct pnfs_layoutdriver_type objlayout_type = {
622 .id = LAYOUT_OSD2_OBJECTS,
623 .name = "LAYOUT_OSD2_OBJECTS",
624 .flags = PNFS_LAYOUTRET_ON_SETATTR |
625 PNFS_LAYOUTRET_ON_ERROR,
627 .max_deviceinfo_size = PAGE_SIZE,
628 .owner = THIS_MODULE,
629 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
630 .free_layout_hdr = objlayout_free_layout_hdr,
632 .alloc_lseg = objlayout_alloc_lseg,
633 .free_lseg = objlayout_free_lseg,
635 .read_pagelist = objlayout_read_pagelist,
636 .write_pagelist = objlayout_write_pagelist,
637 .pg_read_ops = &objio_pg_read_ops,
638 .pg_write_ops = &objio_pg_write_ops,
640 .sync = pnfs_generic_sync,
642 .free_deviceid_node = objio_free_deviceid_node,
644 .encode_layoutcommit = objlayout_encode_layoutcommit,
645 .encode_layoutreturn = objlayout_encode_layoutreturn,
648 MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
649 MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
650 MODULE_LICENSE("GPL");
652 static int __init
653 objlayout_init(void)
655 int ret = pnfs_register_layoutdriver(&objlayout_type);
657 if (ret)
658 printk(KERN_INFO
659 "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",
660 __func__, ret);
661 else
662 printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",
663 __func__);
664 return ret;
667 static void __exit
668 objlayout_exit(void)
670 pnfs_unregister_layoutdriver(&objlayout_type);
671 printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",
672 __func__);
675 MODULE_ALIAS("nfs-layouttype4-2");
677 module_init(objlayout_init);
678 module_exit(objlayout_exit);