/*
 * Copyright (C) 2005, 2006
 * Avishay Traeger (avishay@gmail.com)
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This file is part of exofs.
 *
 * exofs is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.  Since it is based on ext2, and the only
 * valid version of GPL for the Linux kernel is version 2, the only valid
 * version of GPL for exofs is version 2.
 *
 * exofs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/slab.h>
#include <asm/div64.h>

#include <scsi/osd_ore.h>

#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)

#ifdef CONFIG_EXOFS_DEBUG
#define ORE_DBGMSG(fmt, a...) \
	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
#else
#define ORE_DBGMSG(fmt, a...) \
	do { if (0) printk(fmt, ##a); } while (0)
#endif

/* u64 has problems with printk this will cast it to unsigned long long */
#define _LLU(x) (unsigned long long)(x)

#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */

MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");
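/*
 * A note on the component accessors below: comps->single_comp is used as
 * an index mask. It is all-ones when every device has its own component
 * entry, and zero when a single component (object id + credential) is
 * shared by all devices, so "index & single_comp" selects comps[index]
 * in the first case and comps[0] in the second, without a branch.
 */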
static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
	return ios->comps->comps[index & ios->comps->single_comp].cred;
}

static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
	return &ios->comps->comps[index & ios->comps->single_comp].obj;
}

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
	return ios->comps->ods[index];
}
int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
		     bool is_reading, u64 offset, u64 length,
		     struct ore_io_state **pios)
{
	struct ore_io_state *ios;

	/*TODO: Maybe use a kmem_cache per sbi of size
	 * exofs_io_state_size(layout->s_numdevs)
	 */
	ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
	if (unlikely(!ios)) {
		ORE_DBGMSG("Failed kzalloc bytes=%d\n",
			   ore_io_state_size(comps->numdevs));
		*pios = NULL;
		return -ENOMEM;
	}

	ios->layout = layout;
	ios->comps = comps;
	ios->offset = offset;
	ios->length = length;
	ios->reading = is_reading;

	*pios = ios;
	return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);
int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
		     struct ore_io_state **ios)
{
	return ore_get_rw_state(layout, comps, true, 0, 0, ios);
}
EXPORT_SYMBOL(ore_get_io_state);
void ore_put_io_state(struct ore_io_state *ios)
{
	if (ios) {
		unsigned i;

		for (i = 0; i < ios->numdevs; i++) {
			struct ore_per_dev_state *per_dev = &ios->per_dev[i];

			if (per_dev->or)
				osd_end_request(per_dev->or);
			if (per_dev->bio)
				bio_put(per_dev->bio);
		}

		kfree(ios);
	}
}
EXPORT_SYMBOL(ore_put_io_state);
static void _sync_done(struct ore_io_state *ios, void *p)
{
	struct completion *waiting = p;

	complete(waiting);
}

static void _last_io(struct kref *kref)
{
	struct ore_io_state *ios = container_of(
					kref, struct ore_io_state, kref);

	ios->done(ios, ios->private);
}

static void _done_io(struct osd_request *or, void *p)
{
	struct ore_io_state *ios = p;

	kref_put(&ios->kref, _last_io);
}
static int ore_io_execute(struct ore_io_state *ios)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	bool sync = (ios->done == NULL);
	int i, ret;

	if (sync) {
		ios->done = _sync_done;
		ios->private = &wait;
	}

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;
		if (unlikely(!or))
			continue;

		ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
		if (unlikely(ret)) {
			ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
				     ret);
			return ret;
		}
	}

	kref_init(&ios->kref);

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;
		if (unlikely(!or))
			continue;

		kref_get(&ios->kref);
		osd_execute_request_async(or, _done_io, ios);
	}

	kref_put(&ios->kref, _last_io);
	ret = 0;

	if (sync) {
		wait_for_completion(&wait);
		ret = ore_check_io(ios, NULL);
	}
	return ret;
}
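/*
 * A note on the reference counting above: kref_init() sets the count to
 * one, and that initial reference belongs to the submitter. Each request
 * submitted takes an extra reference, and _done_io() drops one as each
 * request completes. The submitter's reference is dropped right after the
 * submit loop, so ios->done fires exactly once, after the last in-flight
 * request has completed (or immediately, if nothing was submitted).
 */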
static void _clear_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned i;

	__bio_for_each_segment(bv, bio, i, 0) {
		unsigned this_count = bv->bv_len;

		if (likely(PAGE_SIZE == this_count))
			clear_highpage(bv->bv_page);
		else
			zero_user(bv->bv_page, bv->bv_offset, this_count);
	}
}
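/*
 * _clear_bio() backs the read-past-end-of-object recovery in
 * ore_check_io() below: when a target reports OSD_ERR_PRI_CLEAR_PAGES,
 * the read started past the object's logical end, so no data was
 * transferred and the destination pages must be zero-filled by hand to
 * look like a normal sparse read.
 */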
int ore_check_io(struct ore_io_state *ios, u64 *resid)
{
	enum osd_err_priority accumulated_osd_err = 0;
	int accumulated_lin_err = 0;
	int i;

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_sense_info osi;
		struct osd_request *or = ios->per_dev[i].or;
		int ret;

		if (unlikely(!or))
			continue;

		ret = osd_req_decode_sense(or, &osi);
		if (likely(!ret))
			continue;

		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
			/* start read offset passed end of file */
			_clear_bio(ios->per_dev[i].bio);
			ORE_DBGMSG("start read offset passed end of file "
				"offset=0x%llx, length=0x%llx\n",
				_LLU(ios->per_dev[i].offset),
				_LLU(ios->per_dev[i].length));

			continue; /* we recovered */
		}

		if (osi.osd_err_pri >= accumulated_osd_err) {
			accumulated_osd_err = osi.osd_err_pri;
			accumulated_lin_err = ret;
		}
	}

	/* TODO: raid specific residual calculations */
	if (resid) {
		if (likely(!accumulated_lin_err))
			*resid = 0;
		else
			*resid = ios->length;
	}

	return accumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);
/*
 * L - logical offset into the file
 *
 * U - The number of bytes in a stripe within a group
 *
 *	U = stripe_unit * group_width
 *
 * T - The number of bytes striped within a group of component objects
 *     (before advancing to the next group)
 *
 *	T = stripe_unit * group_width * group_depth
 *
 * S - The number of bytes striped across all component objects
 *     before the pattern repeats
 *
 *	S = stripe_unit * group_width * group_depth * group_count
 *
 * M - The "major" (i.e., across all components) stripe number
 *
 *	M = L / S
 *
 * G - Counts the groups from the beginning of the major stripe
 *
 *	G = (L - (M * S)) / T [or (L % S) / T]
 *
 * H - The byte offset within the group
 *
 *	H = (L - (M * S)) % T [or (L % S) % T]
 *
 * N - The "minor" (i.e., across the group) stripe number
 *
 *	N = H / U
 *
 * C - The component index corresponding to L
 *
 *	C = (H - (N * U)) / stripe_unit + G * group_width
 *	[or (L % U) / stripe_unit + G * group_width]
 *
 * O - The component offset corresponding to L
 *
 *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
 */
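/*
 * A worked example of the above (illustrative numbers only): take
 * stripe_unit = 64K, group_width = 4, group_depth = 2, group_count = 2,
 * so U = 256K, T = 512K, S = 1M. For the logical offset L = 768K:
 *
 *	M = 768K / 1M				= 0
 *	G = (768K % 1M) / 512K			= 1
 *	H = (768K % 1M) % 512K			= 256K
 *	N = 256K / 256K				= 1
 *	C = (256K - 1 * 256K) / 64K + 1 * 4	= 4
 *	O = 768K % 64K + 1 * 64K + 0 * 2 * 64K	= 64K
 *
 * i.e., byte 768K of the file lands at offset 64K of the first component
 * of the second group (component index 4, before the mirror multiply).
 */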
struct _striping_info {
	u64 obj_offset;
	u64 group_length;
	u64 M; /* for truncate */
	unsigned dev;
	unsigned unit_off;
};
static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
			      struct _striping_info *si)
{
	u32 stripe_unit = layout->stripe_unit;
	u32 group_width = layout->group_width;
	u64 group_depth = layout->group_depth;

	u32 U = stripe_unit * group_width;
	u64 T = U * group_depth;
	u64 S = T * layout->group_count;
	u64 M = div64_u64(file_offset, S);

	/*
	G = (L - (M * S)) / T
	H = (L - (M * S)) % T
	*/
	u64 LmodS = file_offset - M * S;
	u32 G = div64_u64(LmodS, T);
	u64 H = LmodS - G * T;

	u32 N = div_u64(H, U);

	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
	si->dev *= layout->mirrors_p1;

	div_u64_rem(file_offset, stripe_unit, &si->unit_off);

	si->obj_offset = si->unit_off + (N * stripe_unit) +
				  (M * group_depth * stripe_unit);

	si->group_length = T - H;
	si->M = M;
}
static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
		unsigned pgbase, struct ore_per_dev_state *per_dev,
		int cur_len)
{
	unsigned pg = *cur_pg;
	struct request_queue *q =
			osd_request_queue(_ios_od(ios, per_dev->dev));

	per_dev->length += cur_len;

	if (per_dev->bio == NULL) {
		unsigned pages_in_stripe = ios->layout->group_width *
					(ios->layout->stripe_unit / PAGE_SIZE);
		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
						ios->layout->group_width;

		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
		if (unlikely(!per_dev->bio)) {
			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
				     bio_size);
			return -ENOMEM;
		}
	}

	while (cur_len > 0) {
		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
		unsigned added_len;

		BUG_ON(ios->nr_pages <= pg);
		cur_len -= pglen;

		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
					    pglen, pgbase);
		if (unlikely(pglen != added_len))
			return -ENOMEM;
		pgbase = 0;
		++pg;
	}
	BUG_ON(cur_len);

	*cur_pg = pg;
	return 0;
}
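/*
 * On the bio sizing above: each device receives roughly
 * nr_pages / group_width pages of the I/O. Adding one stripe's worth of
 * pages (pages_in_stripe) before dividing over-allocates by one
 * stripe_unit of pages per device, which absorbs the rounding when the
 * I/O does not start or end on a stripe boundary.
 */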
static int _prepare_one_group(struct ore_io_state *ios, u64 length,
			      struct _striping_info *si)
{
	unsigned stripe_unit = ios->layout->stripe_unit;
	unsigned mirrors_p1 = ios->layout->mirrors_p1;
	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
	unsigned dev = si->dev;
	unsigned first_dev = dev - (dev % devs_in_group);
	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
	unsigned cur_pg = ios->pages_consumed;
	int ret = 0;

	while (length) {
		struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
		unsigned cur_len, page_off = 0;

		if (!per_dev->length) {
			per_dev->dev = dev;
			if (dev < si->dev) {
				per_dev->offset = si->obj_offset + stripe_unit -
								   si->unit_off;
				cur_len = stripe_unit;
			} else if (dev == si->dev) {
				per_dev->offset = si->obj_offset;
				cur_len = stripe_unit - si->unit_off;
				page_off = si->unit_off & ~PAGE_MASK;
				BUG_ON(page_off && (page_off != ios->pgbase));
			} else { /* dev > si->dev */
				per_dev->offset = si->obj_offset - si->unit_off;
				cur_len = stripe_unit;
			}

			if (max_comp < dev)
				max_comp = dev;
		} else {
			cur_len = stripe_unit;
		}
		if (cur_len >= length)
			cur_len = length;

		ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
				       cur_len);
		if (unlikely(ret))
			goto out;

		dev += mirrors_p1;
		dev = (dev % devs_in_group) + first_dev;

		length -= cur_len;
	}
out:
	ios->numdevs = max_comp + mirrors_p1;
	ios->pages_consumed = cur_pg;
	return ret;
}
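/*
 * The device walk in the loop above, spelled out with hypothetical
 * numbers: with group_width = 3 and mirrors_p1 = 2, devs_in_group = 6 and
 * only the even indices 0/2/4 are real stripe positions (each odd index
 * is the mirror of its predecessor, filled in later by _write_mirror()).
 * If si->dev = 2, the walk visits 2, 4, then wraps to 0, 2, 4, 0, ...
 * one stripe_unit at a time until length is exhausted.
 */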
static int _prepare_for_striping(struct ore_io_state *ios)
{
	u64 length = ios->length;
	u64 offset = ios->offset;
	struct _striping_info si;
	int ret = 0;

	if (!ios->pages) {
		if (ios->kern_buff) {
			struct ore_per_dev_state *per_dev = &ios->per_dev[0];

			_calc_stripe_info(ios->layout, ios->offset, &si);
			per_dev->offset = si.obj_offset;
			per_dev->dev = si.dev;

			/* no cross device without page array */
			BUG_ON((ios->layout->group_width > 1) &&
			       (si.unit_off + ios->length >
				ios->layout->stripe_unit));
		}
		ios->numdevs = ios->layout->mirrors_p1;
		return 0;
	}

	while (length) {
		_calc_stripe_info(ios->layout, offset, &si);

		if (length < si.group_length)
			si.group_length = length;

		ret = _prepare_one_group(ios, si.group_length, &si);
		if (unlikely(ret))
			goto out;

		offset += si.group_length;
		length -= si.group_length;
	}

out:
	return ret;
}
int ore_create(struct ore_io_state *ios)
{
	int i, ret;

	for (i = 0; i < ios->comps->numdevs; i++) {
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		ios->per_dev[i].or = or;
		ios->numdevs++;

		osd_req_create_object(or, _ios_obj(ios, i));
	}
	ret = ore_io_execute(ios);

out:
	return ret;
}
EXPORT_SYMBOL(ore_create);
int ore_remove(struct ore_io_state *ios)
{
	int i, ret;

	for (i = 0; i < ios->comps->numdevs; i++) {
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		ios->per_dev[i].or = or;
		ios->numdevs++;

		osd_req_remove_object(or, _ios_obj(ios, i));
	}
	ret = ore_io_execute(ios);

out:
	return ret;
}
EXPORT_SYMBOL(ore_remove);
static int _write_mirror(struct ore_io_state *ios, int cur_comp)
{
	struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
	int ret = 0;

	if (ios->pages && !master_dev->length)
		return 0; /* Just an empty slot */

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		per_dev->or = or;
		per_dev->offset = master_dev->offset;

		if (ios->pages) {
			struct bio *bio;

			if (per_dev != master_dev) {
				bio = bio_kmalloc(GFP_KERNEL,
						  master_dev->bio->bi_max_vecs);
				if (unlikely(!bio)) {
					ORE_DBGMSG(
					      "Failed to allocate BIO size=%u\n",
					      master_dev->bio->bi_max_vecs);
					ret = -ENOMEM;
					goto out;
				}

				__bio_clone(bio, master_dev->bio);
				bio->bi_bdev = NULL;
				bio->bi_next = NULL;
				per_dev->length = master_dev->length;
				per_dev->bio = bio;
				per_dev->dev = dev;
			} else {
				bio = master_dev->bio;
				/* FIXME: bio_set_dir() */
				bio->bi_rw |= REQ_WRITE;
			}

			osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
				      bio, per_dev->length);
			ORE_DBGMSG("write(0x%llx) offset=0x%llx "
				      "length=0x%llx dev=%d\n",
				     _LLU(_ios_obj(ios, dev)->id),
				     _LLU(per_dev->offset),
				     _LLU(per_dev->length), dev);
		} else if (ios->kern_buff) {
			ret = osd_req_write_kern(or, _ios_obj(ios, dev),
						 per_dev->offset,
						 ios->kern_buff, ios->length);
			if (unlikely(ret))
				goto out;
			ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
				      "length=0x%llx dev=%d\n",
				     _LLU(_ios_obj(ios, dev)->id),
				     _LLU(per_dev->offset),
				     _LLU(ios->length), dev);
		} else {
			osd_req_set_attributes(or, _ios_obj(ios, dev));
			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
				     _LLU(_ios_obj(ios, dev)->id),
				     ios->out_attr_len, dev);
		}

		if (ios->out_attr)
			osd_req_add_set_attr_list(or, ios->out_attr,
						  ios->out_attr_len);

		if (ios->in_attr)
			osd_req_add_get_attr_list(or, ios->in_attr,
						  ios->in_attr_len);
	}

out:
	return ret;
}
int ore_write(struct ore_io_state *ios)
{
	int i;
	int ret;

	ret = _prepare_for_striping(ios);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		ret = _write_mirror(ios, i);
		if (unlikely(ret))
			return ret;
	}

	ret = ore_io_execute(ios);
	return ret;
}
EXPORT_SYMBOL(ore_write);
static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
	struct osd_request *or;
	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
	struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
	unsigned first_dev = (unsigned)obj->id;

	if (ios->pages && !per_dev->length)
		return 0; /* Just an empty slot */

	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
	or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
	if (unlikely(!or)) {
		ORE_ERR("%s: osd_start_request failed\n", __func__);
		return -ENOMEM;
	}
	per_dev->or = or;

	if (ios->pages) {
		osd_req_read(or, obj, per_dev->offset,
				per_dev->bio, per_dev->length);
		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
			     " dev=%d\n", _LLU(obj->id),
			     _LLU(per_dev->offset), _LLU(per_dev->length),
			     first_dev);
	} else if (ios->kern_buff) {
		int ret = osd_req_read_kern(or, obj, per_dev->offset,
					    ios->kern_buff, ios->length);
		ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
			      "length=0x%llx dev=%d ret=>%d\n",
			      _LLU(obj->id), _LLU(per_dev->offset),
			      _LLU(ios->length), first_dev, ret);
		if (unlikely(ret))
			return ret;
	} else {
		osd_req_get_attributes(or, obj);
		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
			      _LLU(obj->id),
			      ios->in_attr_len, first_dev);
	}

	if (ios->out_attr)
		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);

	if (ios->in_attr)
		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);

	return 0;
}
int ore_read(struct ore_io_state *ios)
{
	int i;
	int ret;

	ret = _prepare_for_striping(ios);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		ret = _read_mirror(ios, i);
		if (unlikely(ret))
			return ret;
	}

	ret = ore_io_execute(ios);
	return ret;
}
EXPORT_SYMBOL(ore_read);
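/*
 * Note how both ore_write() and ore_read() step through per_dev in
 * increments of mirrors_p1: each iteration handles one whole mirror set.
 * _write_mirror() fans a single prepared request out to every mirror,
 * while _read_mirror() picks just one mirror to read from, using the
 * object id as a cheap way to spread reads across the mirrors.
 */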
int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
{
	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
	void *iter = NULL;
	int nelem;

	do {
		nelem = 1;
		osd_req_decode_get_attr_list(ios->per_dev[0].or,
					     &cur_attr, &nelem, &iter);
		if ((cur_attr.attr_page == attr->attr_page) &&
		    (cur_attr.attr_id == attr->attr_id)) {
			attr->len = cur_attr.len;
			attr->val_ptr = cur_attr.val_ptr;
			return 0;
		}
	} while (iter);

	return -EIO;
}
EXPORT_SYMBOL(extract_attr_from_ios);
static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
			     struct osd_attr *attr)
{
	int last_comp = cur_comp + ios->layout->mirrors_p1;

	for (; cur_comp < last_comp; ++cur_comp) {
		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			return -ENOMEM;
		}
		per_dev->or = or;

		osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
		osd_req_add_set_attr_list(or, attr, 1);
	}

	return 0;
}
struct _trunc_info {
	struct _striping_info si;
	u64 prev_group_obj_off;
	u64 next_group_obj_off;

	unsigned first_group_dev;
	unsigned nex_group_dev;
	unsigned max_devs;
};
void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
		       struct _trunc_info *ti)
{
	unsigned stripe_unit = layout->stripe_unit;

	_calc_stripe_info(layout, file_offset, &ti->si);

	ti->prev_group_obj_off = ti->si.M * stripe_unit;
	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;

	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
	ti->max_devs = layout->group_width * layout->group_count;
}
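/*
 * A worked truncate example (hypothetical layout): stripe_unit = 64K,
 * group_width = 4, group_depth = 1, group_count = 1, mirrors_p1 = 1, and
 * a new file size of 200K. _calc_stripe_info() gives si.dev = 3,
 * si.unit_off = 8K, si.obj_offset = 8K, so ore_truncate() below sets:
 *
 *	devs 0..2 (i < si.dev):  obj_size = 8K + 64K - 8K = 64K
 *	dev  3    (i == si.dev): obj_size = 8K
 *
 * for a total of 3 * 64K + 8K = 200K, exactly the requested size.
 */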
int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
		   u64 size)
{
	struct ore_io_state *ios;
	struct exofs_trunc_attr {
		struct osd_attr attr;
		__be64 newsize;
	} *size_attrs;
	struct _trunc_info ti;
	int i, ret;

	ret = ore_get_io_state(layout, comps, &ios);
	if (unlikely(ret))
		return ret;

	_calc_trunk_info(ios->layout, size, &ti);

	size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
			     GFP_KERNEL);
	if (unlikely(!size_attrs)) {
		ret = -ENOMEM;
		goto out;
	}

	ios->numdevs = ios->comps->numdevs;

	for (i = 0; i < ti.max_devs; ++i) {
		struct exofs_trunc_attr *size_attr = &size_attrs[i];
		u64 obj_size;

		if (i < ti.first_group_dev)
			obj_size = ti.prev_group_obj_off;
		else if (i >= ti.nex_group_dev)
			obj_size = ti.next_group_obj_off;
		else if (i < ti.si.dev) /* dev within this group */
			obj_size = ti.si.obj_offset +
				      ios->layout->stripe_unit - ti.si.unit_off;
		else if (i == ti.si.dev)
			obj_size = ti.si.obj_offset;
		else /* i > ti.si.dev */
			obj_size = ti.si.obj_offset - ti.si.unit_off;

		size_attr->newsize = cpu_to_be64(obj_size);
		size_attr->attr = g_attr_logical_length;
		size_attr->attr.val_ptr = &size_attr->newsize;

		ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
			     _LLU(comps->comps->obj.id), _LLU(obj_size), i);
		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
					&size_attr->attr);
		if (unlikely(ret))
			goto out;
	}
	ret = ore_io_execute(ios);

out:
	kfree(size_attrs);
	ore_put_io_state(ios);
	return ret;
}
EXPORT_SYMBOL(ore_truncate);
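/*
 * Callers typically invoke ore_truncate() right after updating the
 * in-core inode size; exofs does roughly this (an illustrative sketch,
 * with "sbi"/"oi" naming its usual superblock/inode info structures):
 *
 *	ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
 */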
const struct osd_attr g_attr_logical_length = ATTR_DEF(
	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
EXPORT_SYMBOL(g_attr_logical_length);