2 * pNFS Objects layout driver high level definitions
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <ooo@electrozaur.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include <linux/kmod.h>
41 #include <linux/moduleparam.h>
42 #include <linux/ratelimit.h>
43 #include <scsi/osd_initiator.h>
44 #include "objlayout.h"
46 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
48 * Create a objlayout layout structure for the given inode and return it.
50 struct pnfs_layout_hdr
*
51 objlayout_alloc_layout_hdr(struct inode
*inode
, gfp_t gfp_flags
)
53 struct objlayout
*objlay
;
55 objlay
= kzalloc(sizeof(struct objlayout
), gfp_flags
);
58 spin_lock_init(&objlay
->lock
);
59 INIT_LIST_HEAD(&objlay
->err_list
);
60 dprintk("%s: Return %p\n", __func__
, objlay
);
61 return &objlay
->pnfs_layout
;
65 * Free an objlayout layout structure
68 objlayout_free_layout_hdr(struct pnfs_layout_hdr
*lo
)
70 struct objlayout
*objlay
= OBJLAYOUT(lo
);
72 dprintk("%s: objlay %p\n", __func__
, objlay
);
74 WARN_ON(!list_empty(&objlay
->err_list
));
79 * Unmarshall layout and store it in pnfslay.
81 struct pnfs_layout_segment
*
82 objlayout_alloc_lseg(struct pnfs_layout_hdr
*pnfslay
,
83 struct nfs4_layoutget_res
*lgr
,
87 struct xdr_stream stream
;
88 struct xdr_buf buf
= {
89 .pages
= lgr
->layoutp
->pages
,
90 .page_len
= lgr
->layoutp
->len
,
91 .buflen
= lgr
->layoutp
->len
,
92 .len
= lgr
->layoutp
->len
,
95 struct pnfs_layout_segment
*lseg
;
97 dprintk("%s: Begin pnfslay %p\n", __func__
, pnfslay
);
99 scratch
= alloc_page(gfp_flags
);
103 xdr_init_decode(&stream
, &buf
, NULL
);
104 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
106 status
= objio_alloc_lseg(&lseg
, pnfslay
, &lgr
->range
, &stream
, gfp_flags
);
107 if (unlikely(status
)) {
108 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__
,
113 __free_page(scratch
);
115 dprintk("%s: Return %p\n", __func__
, lseg
);
119 __free_page(scratch
);
121 dprintk("%s: Err Return=>%d\n", __func__
, status
);
122 return ERR_PTR(status
);
/*
 * Free a layout segment.  A NULL @lseg is logged and otherwise ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
143 end_offset(u64 start
, u64 len
)
148 return end
>= start
? end
: NFS4_MAX_UINT64
;
151 static void _fix_verify_io_params(struct pnfs_layout_segment
*lseg
,
152 struct page
***p_pages
, unsigned *p_pgbase
,
153 u64 offset
, unsigned long count
)
157 BUG_ON(offset
< lseg
->pls_range
.offset
);
158 lseg_end_offset
= end_offset(lseg
->pls_range
.offset
,
159 lseg
->pls_range
.length
);
160 BUG_ON(offset
>= lseg_end_offset
);
161 WARN_ON(offset
+ count
> lseg_end_offset
);
163 if (*p_pgbase
> PAGE_SIZE
) {
164 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__
, *p_pgbase
);
165 *p_pages
+= *p_pgbase
>> PAGE_SHIFT
;
166 *p_pgbase
&= ~PAGE_MASK
;
171 * I/O done common code
174 objlayout_iodone(struct objlayout_io_res
*oir
)
176 if (likely(oir
->status
>= 0)) {
177 objio_free_result(oir
);
179 struct objlayout
*objlay
= oir
->objlay
;
181 spin_lock(&objlay
->lock
);
182 objlay
->delta_space_valid
= OBJ_DSU_INVALID
;
183 list_add(&objlay
->err_list
, &oir
->err_list
);
184 spin_unlock(&objlay
->lock
);
189 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
191 * The @index component IO failed (error returned from target). Register
192 * the error for later reporting at layout-return.
195 objlayout_io_set_result(struct objlayout_io_res
*oir
, unsigned index
,
196 struct pnfs_osd_objid
*pooid
, int osd_error
,
197 u64 offset
, u64 length
, bool is_write
)
199 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[index
];
201 BUG_ON(index
>= oir
->num_comps
);
203 ioerr
->oer_component
= *pooid
;
204 ioerr
->oer_comp_offset
= offset
;
205 ioerr
->oer_comp_length
= length
;
206 ioerr
->oer_iswrite
= is_write
;
207 ioerr
->oer_errno
= osd_error
;
209 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
210 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
211 __func__
, index
, ioerr
->oer_errno
,
213 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
214 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
215 ioerr
->oer_component
.oid_partition_id
,
216 ioerr
->oer_component
.oid_object_id
,
217 ioerr
->oer_comp_offset
,
218 ioerr
->oer_comp_length
);
220 /* User need not call if no error is reported */
221 ioerr
->oer_errno
= 0;
225 /* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
226 * This is because the osd completion is called with ints-off from
229 static void _rpc_read_complete(struct work_struct
*work
)
231 struct rpc_task
*task
;
232 struct nfs_pgio_header
*hdr
;
234 dprintk("%s enter\n", __func__
);
235 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
236 hdr
= container_of(task
, struct nfs_pgio_header
, task
);
238 pnfs_ld_read_done(hdr
);
242 objlayout_read_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
244 struct nfs_pgio_header
*hdr
= oir
->rpcdata
;
246 oir
->status
= hdr
->task
.tk_status
= status
;
248 hdr
->res
.count
= status
;
250 hdr
->pnfs_error
= status
;
251 objlayout_iodone(oir
);
252 /* must not use oir after this point */
254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__
,
255 status
, hdr
->res
.eof
, sync
);
258 pnfs_ld_read_done(hdr
);
260 INIT_WORK(&hdr
->task
.u
.tk_work
, _rpc_read_complete
);
261 schedule_work(&hdr
->task
.u
.tk_work
);
266 * Perform sync or async reads.
269 objlayout_read_pagelist(struct nfs_pgio_header
*hdr
)
271 struct inode
*inode
= hdr
->inode
;
272 loff_t offset
= hdr
->args
.offset
;
273 size_t count
= hdr
->args
.count
;
277 eof
= i_size_read(inode
);
278 if (unlikely(offset
+ count
> eof
)) {
283 /*FIXME: do we need to call pnfs_ld_read_done() */
286 count
= eof
- offset
;
289 hdr
->res
.eof
= (offset
+ count
) >= eof
;
290 _fix_verify_io_params(hdr
->lseg
, &hdr
->args
.pages
,
292 hdr
->args
.offset
, hdr
->args
.count
);
294 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
295 __func__
, inode
->i_ino
, offset
, count
, hdr
->res
.eof
);
297 err
= objio_read_pagelist(hdr
);
300 hdr
->pnfs_error
= err
;
301 dprintk("%s: Returned Error %d\n", __func__
, err
);
302 return PNFS_NOT_ATTEMPTED
;
304 return PNFS_ATTEMPTED
;
307 /* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
308 * This is because the osd completion is called with ints-off from
311 static void _rpc_write_complete(struct work_struct
*work
)
313 struct rpc_task
*task
;
314 struct nfs_pgio_header
*hdr
;
316 dprintk("%s enter\n", __func__
);
317 task
= container_of(work
, struct rpc_task
, u
.tk_work
);
318 hdr
= container_of(task
, struct nfs_pgio_header
, task
);
320 pnfs_ld_write_done(hdr
);
324 objlayout_write_done(struct objlayout_io_res
*oir
, ssize_t status
, bool sync
)
326 struct nfs_pgio_header
*hdr
= oir
->rpcdata
;
328 oir
->status
= hdr
->task
.tk_status
= status
;
330 hdr
->res
.count
= status
;
331 hdr
->verf
.committed
= oir
->committed
;
333 hdr
->pnfs_error
= status
;
335 objlayout_iodone(oir
);
336 /* must not use oir after this point */
338 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__
,
339 status
, hdr
->verf
.committed
, sync
);
342 pnfs_ld_write_done(hdr
);
344 INIT_WORK(&hdr
->task
.u
.tk_work
, _rpc_write_complete
);
345 schedule_work(&hdr
->task
.u
.tk_work
);
350 * Perform sync or async writes.
353 objlayout_write_pagelist(struct nfs_pgio_header
*hdr
, int how
)
357 _fix_verify_io_params(hdr
->lseg
, &hdr
->args
.pages
,
359 hdr
->args
.offset
, hdr
->args
.count
);
361 err
= objio_write_pagelist(hdr
, how
);
363 hdr
->pnfs_error
= err
;
364 dprintk("%s: Returned Error %d\n", __func__
, err
);
365 return PNFS_NOT_ATTEMPTED
;
367 return PNFS_ATTEMPTED
;
371 objlayout_encode_layoutcommit(struct pnfs_layout_hdr
*pnfslay
,
372 struct xdr_stream
*xdr
,
373 const struct nfs4_layoutcommit_args
*args
)
375 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
376 struct pnfs_osd_layoutupdate lou
;
379 dprintk("%s: Begin\n", __func__
);
381 spin_lock(&objlay
->lock
);
382 lou
.dsu_valid
= (objlay
->delta_space_valid
== OBJ_DSU_VALID
);
383 lou
.dsu_delta
= objlay
->delta_space_used
;
384 objlay
->delta_space_used
= 0;
385 objlay
->delta_space_valid
= OBJ_DSU_INIT
;
386 lou
.olu_ioerr_flag
= !list_empty(&objlay
->err_list
);
387 spin_unlock(&objlay
->lock
);
389 start
= xdr_reserve_space(xdr
, 4);
391 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr
, &lou
));
393 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
395 dprintk("%s: Return delta_space_used %lld err %d\n", __func__
,
396 lou
.dsu_delta
, lou
.olu_ioerr_flag
);
400 err_prio(u32 oer_errno
)
406 case PNFS_OSD_ERR_RESOURCE
:
407 return OSD_ERR_PRI_RESOURCE
;
408 case PNFS_OSD_ERR_BAD_CRED
:
409 return OSD_ERR_PRI_BAD_CRED
;
410 case PNFS_OSD_ERR_NO_ACCESS
:
411 return OSD_ERR_PRI_NO_ACCESS
;
412 case PNFS_OSD_ERR_UNREACHABLE
:
413 return OSD_ERR_PRI_UNREACHABLE
;
414 case PNFS_OSD_ERR_NOT_FOUND
:
415 return OSD_ERR_PRI_NOT_FOUND
;
416 case PNFS_OSD_ERR_NO_SPACE
:
417 return OSD_ERR_PRI_NO_SPACE
;
421 case PNFS_OSD_ERR_EIO
:
422 return OSD_ERR_PRI_EIO
;
427 merge_ioerr(struct pnfs_osd_ioerr
*dest_err
,
428 const struct pnfs_osd_ioerr
*src_err
)
430 u64 dest_end
, src_end
;
432 if (!dest_err
->oer_errno
) {
433 *dest_err
= *src_err
;
434 /* accumulated device must be blank */
435 memset(&dest_err
->oer_component
.oid_device_id
, 0,
436 sizeof(dest_err
->oer_component
.oid_device_id
));
441 if (dest_err
->oer_component
.oid_partition_id
!=
442 src_err
->oer_component
.oid_partition_id
)
443 dest_err
->oer_component
.oid_partition_id
= 0;
445 if (dest_err
->oer_component
.oid_object_id
!=
446 src_err
->oer_component
.oid_object_id
)
447 dest_err
->oer_component
.oid_object_id
= 0;
449 if (dest_err
->oer_comp_offset
> src_err
->oer_comp_offset
)
450 dest_err
->oer_comp_offset
= src_err
->oer_comp_offset
;
452 dest_end
= end_offset(dest_err
->oer_comp_offset
,
453 dest_err
->oer_comp_length
);
454 src_end
= end_offset(src_err
->oer_comp_offset
,
455 src_err
->oer_comp_length
);
456 if (dest_end
< src_end
)
459 dest_err
->oer_comp_length
= dest_end
- dest_err
->oer_comp_offset
;
461 if ((src_err
->oer_iswrite
== dest_err
->oer_iswrite
) &&
462 (err_prio(src_err
->oer_errno
) > err_prio(dest_err
->oer_errno
))) {
463 dest_err
->oer_errno
= src_err
->oer_errno
;
464 } else if (src_err
->oer_iswrite
) {
465 dest_err
->oer_iswrite
= true;
466 dest_err
->oer_errno
= src_err
->oer_errno
;
471 encode_accumulated_error(struct objlayout
*objlay
, __be32
*p
)
473 struct objlayout_io_res
*oir
, *tmp
;
474 struct pnfs_osd_ioerr accumulated_err
= {.oer_errno
= 0};
476 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
479 for (i
= 0; i
< oir
->num_comps
; i
++) {
480 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
482 if (!ioerr
->oer_errno
)
485 printk(KERN_ERR
"NFS: %s: err[%d]: errno=%d "
486 "is_write=%d dev(%llx:%llx) par=0x%llx "
487 "obj=0x%llx offset=0x%llx length=0x%llx\n",
488 __func__
, i
, ioerr
->oer_errno
,
490 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
491 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
492 ioerr
->oer_component
.oid_partition_id
,
493 ioerr
->oer_component
.oid_object_id
,
494 ioerr
->oer_comp_offset
,
495 ioerr
->oer_comp_length
);
497 merge_ioerr(&accumulated_err
, ioerr
);
499 list_del(&oir
->err_list
);
500 objio_free_result(oir
);
503 pnfs_osd_xdr_encode_ioerr(p
, &accumulated_err
);
507 objlayout_encode_layoutreturn(struct pnfs_layout_hdr
*pnfslay
,
508 struct xdr_stream
*xdr
,
509 const struct nfs4_layoutreturn_args
*args
)
511 struct objlayout
*objlay
= OBJLAYOUT(pnfslay
);
512 struct objlayout_io_res
*oir
, *tmp
;
515 dprintk("%s: Begin\n", __func__
);
516 start
= xdr_reserve_space(xdr
, 4);
519 spin_lock(&objlay
->lock
);
521 list_for_each_entry_safe(oir
, tmp
, &objlay
->err_list
, err_list
) {
522 __be32
*last_xdr
= NULL
, *p
;
526 for (i
= 0; i
< oir
->num_comps
; i
++) {
527 struct pnfs_osd_ioerr
*ioerr
= &oir
->ioerrs
[i
];
529 if (!ioerr
->oer_errno
)
532 dprintk("%s: err[%d]: errno=%d is_write=%d "
533 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
534 "offset=0x%llx length=0x%llx\n",
535 __func__
, i
, ioerr
->oer_errno
,
537 _DEVID_LO(&ioerr
->oer_component
.oid_device_id
),
538 _DEVID_HI(&ioerr
->oer_component
.oid_device_id
),
539 ioerr
->oer_component
.oid_partition_id
,
540 ioerr
->oer_component
.oid_object_id
,
541 ioerr
->oer_comp_offset
,
542 ioerr
->oer_comp_length
);
544 p
= pnfs_osd_xdr_ioerr_reserve_space(xdr
);
547 break; /* accumulated_error */
551 pnfs_osd_xdr_encode_ioerr(p
, &oir
->ioerrs
[i
]);
554 /* TODO: use xdr_write_pages */
556 /* no space for even one error descriptor */
559 /* we've encountered a situation with lots and lots of
560 * errors and no space to encode them all. Use the last
561 * available slot to report the union of all the
564 encode_accumulated_error(objlay
, last_xdr
);
567 list_del(&oir
->err_list
);
568 objio_free_result(oir
);
571 spin_unlock(&objlay
->lock
);
573 *start
= cpu_to_be32((xdr
->p
- start
- 1) * 4);
574 dprintk("%s: Return\n", __func__
);
578 OBJLAYOUT_MAX_URI_LEN
= 256, OBJLAYOUT_MAX_OSDNAME_LEN
= 64,
579 OBJLAYOUT_MAX_SYSID_HEX_LEN
= OSD_SYSTEMID_LEN
* 2 + 1,
580 OSD_LOGIN_UPCALL_PATHLEN
= 256
583 static char osd_login_prog
[OSD_LOGIN_UPCALL_PATHLEN
] = "/sbin/osd_login";
585 module_param_string(osd_login_prog
, osd_login_prog
, sizeof(osd_login_prog
),
587 MODULE_PARM_DESC(osd_login_prog
, "Path to the osd_login upcall program");
589 struct __auto_login
{
590 char uri
[OBJLAYOUT_MAX_URI_LEN
];
591 char osdname
[OBJLAYOUT_MAX_OSDNAME_LEN
];
592 char systemid_hex
[OBJLAYOUT_MAX_SYSID_HEX_LEN
];
595 static int __objlayout_upcall(struct __auto_login
*login
)
597 static char *envp
[] = { "HOME=/",
599 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
605 if (unlikely(!osd_login_prog
[0])) {
606 dprintk("%s: osd_login_prog is disabled\n", __func__
);
610 dprintk("%s uri: %s\n", __func__
, login
->uri
);
611 dprintk("%s osdname %s\n", __func__
, login
->osdname
);
612 dprintk("%s systemid_hex %s\n", __func__
, login
->systemid_hex
);
614 argv
[0] = (char *)osd_login_prog
;
616 argv
[2] = login
->uri
;
618 argv
[4] = login
->osdname
;
620 argv
[6] = login
->systemid_hex
;
623 ret
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
625 * Disable the upcall mechanism if we're getting an ENOENT or
626 * EACCES error. The admin can re-enable it on the fly by using
627 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once
628 * the problem has been fixed.
630 if (ret
== -ENOENT
|| ret
== -EACCES
) {
631 printk(KERN_ERR
"PNFS-OBJ: %s was not found please set "
632 "objlayoutdriver.osd_login_prog kernel parameter!\n",
634 osd_login_prog
[0] = '\0';
636 dprintk("%s %s return value: %d\n", __func__
, osd_login_prog
, ret
);
641 /* Assume dest is all zeros */
642 static void __copy_nfsS_and_zero_terminate(struct nfs4_string s
,
643 char *dest
, int max_len
,
644 const char *var_name
)
649 if (s
.len
>= max_len
) {
651 "objlayout_autologin: %s: s.len(%d) >= max_len(%d)",
652 var_name
, s
.len
, max_len
);
653 s
.len
= max_len
- 1; /* space for null terminator */
656 memcpy(dest
, s
.data
, s
.len
);
659 /* Assume sysid is all zeros */
660 static void _sysid_2_hex(struct nfs4_string s
,
661 char sysid
[OBJLAYOUT_MAX_SYSID_HEX_LEN
])
669 if (s
.len
!= OSD_SYSTEMID_LEN
) {
671 "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN",
673 if (s
.len
> OSD_SYSTEMID_LEN
)
674 s
.len
= OSD_SYSTEMID_LEN
;
678 for (i
= 0; i
< s
.len
; i
++)
679 cur
= hex_byte_pack(cur
, s
.data
[i
]);
682 int objlayout_autologin(struct pnfs_osd_deviceaddr
*deviceaddr
)
685 struct __auto_login login
;
687 if (!deviceaddr
->oda_targetaddr
.ota_netaddr
.r_addr
.len
)
690 memset(&login
, 0, sizeof(login
));
691 __copy_nfsS_and_zero_terminate(
692 deviceaddr
->oda_targetaddr
.ota_netaddr
.r_addr
,
693 login
.uri
, sizeof(login
.uri
), "URI");
695 __copy_nfsS_and_zero_terminate(
696 deviceaddr
->oda_osdname
,
697 login
.osdname
, sizeof(login
.osdname
), "OSDNAME");
699 _sysid_2_hex(deviceaddr
->oda_systemid
, login
.systemid_hex
);
701 rc
= __objlayout_upcall(&login
);
702 if (rc
> 0) /* script returns positive values */