3 * Boaz Harrosh <bharrosh@panasas.com>
5 * This file is part of the objects raid engine (ore).
7 * It is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
11 * You should have received a copy of the GNU General Public License
12 * along with "ore". If not, write to the Free Software Foundation, Inc:
13 * "Free Software Foundation <info@fsf.org>"
16 #include <linux/gfp.h>
17 #include <linux/async_tx.h>
22 #define ORE_DBGMSG2 ORE_DBGMSG
24 static struct page
*_raid_page_alloc(void)
26 return alloc_page(GFP_KERNEL
);
29 static void _raid_page_free(struct page
*p
)
34 /* This struct is forward declared in ore_io_state, but is private to here.
35 * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
37 * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
38 * Ascending page index access is sp2d(p-minor, c-major). But storage is
39 * sp2d[p-minor][c-major], so it can be properly presented to the async-xor
42 struct __stripe_pages_2d
{
43 /* Cache some hot path repeated calculations */
46 unsigned pages_in_unit
;
50 /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
51 struct __1_page_stripe
{
54 struct async_submit_ctl submit
;
55 struct dma_async_tx_descriptor
*tx
;
57 /* The size of this array is data_devs + parity */
59 struct page
**scribble
;
60 /* bool array, size of this array is data_devs */
65 /* This can get bigger than a page. So support multiple page allocations
66 * _sp2d_free should be called even if _sp2d_alloc fails (by returning
69 static int _sp2d_alloc(unsigned pages_in_unit
, unsigned group_width
,
70 unsigned parity
, struct __stripe_pages_2d
**psp2d
)
72 struct __stripe_pages_2d
*sp2d
;
73 unsigned data_devs
= group_width
- parity
;
74 struct _alloc_all_bytes
{
75 struct __alloc_stripe_pages_2d
{
76 struct __stripe_pages_2d sp2d
;
77 struct __1_page_stripe _1p_stripes
[pages_in_unit
];
79 struct __alloc_1p_arrays
{
80 struct page
*pages
[group_width
];
81 struct page
*scribble
[group_width
];
82 char page_is_read
[data_devs
];
83 } __a1pa
[pages_in_unit
];
85 struct __alloc_1p_arrays
*__a1pa
;
86 struct __alloc_1p_arrays
*__a1pa_end
;
87 const unsigned sizeof__a1pa
= sizeof(_aab
->__a1pa
[0]);
88 unsigned num_a1pa
, alloc_size
, i
;
90 /* FIXME: check these numbers in ore_verify_layout */
91 BUG_ON(sizeof(_aab
->__asp2d
) > PAGE_SIZE
);
92 BUG_ON(sizeof__a1pa
> PAGE_SIZE
);
94 if (sizeof(*_aab
) > PAGE_SIZE
) {
95 num_a1pa
= (PAGE_SIZE
- sizeof(_aab
->__asp2d
)) / sizeof__a1pa
;
96 alloc_size
= sizeof(_aab
->__asp2d
) + sizeof__a1pa
* num_a1pa
;
98 num_a1pa
= pages_in_unit
;
99 alloc_size
= sizeof(*_aab
);
102 _aab
= kzalloc(alloc_size
, GFP_KERNEL
);
103 if (unlikely(!_aab
)) {
104 ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size
);
108 sp2d
= &_aab
->__asp2d
.sp2d
;
109 *psp2d
= sp2d
; /* From here Just call _sp2d_free */
111 __a1pa
= _aab
->__a1pa
;
112 __a1pa_end
= __a1pa
+ num_a1pa
;
114 for (i
= 0; i
< pages_in_unit
; ++i
) {
115 if (unlikely(__a1pa
>= __a1pa_end
)) {
116 num_a1pa
= min_t(unsigned, PAGE_SIZE
/ sizeof__a1pa
,
119 __a1pa
= kzalloc(num_a1pa
* sizeof__a1pa
, GFP_KERNEL
);
120 if (unlikely(!__a1pa
)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
125 __a1pa_end
= __a1pa
+ num_a1pa
;
126 /* First *pages is marked for kfree of the buffer */
127 sp2d
->_1p_stripes
[i
].alloc
= true;
130 sp2d
->_1p_stripes
[i
].pages
= __a1pa
->pages
;
131 sp2d
->_1p_stripes
[i
].scribble
= __a1pa
->scribble
;
132 sp2d
->_1p_stripes
[i
].page_is_read
= __a1pa
->page_is_read
;
136 sp2d
->parity
= parity
;
137 sp2d
->data_devs
= data_devs
;
138 sp2d
->pages_in_unit
= pages_in_unit
;
142 static void _sp2d_reset(struct __stripe_pages_2d
*sp2d
,
143 const struct _ore_r4w_op
*r4w
, void *priv
)
145 unsigned data_devs
= sp2d
->data_devs
;
146 unsigned group_width
= data_devs
+ sp2d
->parity
;
152 for (c
= data_devs
- 1; c
>= 0; --c
)
153 for (p
= sp2d
->pages_in_unit
- 1; p
>= 0; --p
) {
154 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
156 if (_1ps
->page_is_read
[c
]) {
157 struct page
*page
= _1ps
->pages
[c
];
159 r4w
->put_page(priv
, page
);
160 _1ps
->page_is_read
[c
] = false;
164 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
165 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
167 memset(_1ps
->pages
, 0, group_width
* sizeof(*_1ps
->pages
));
168 _1ps
->write_count
= 0;
172 sp2d
->needed
= false;
175 static void _sp2d_free(struct __stripe_pages_2d
*sp2d
)
182 for (i
= 0; i
< sp2d
->pages_in_unit
; ++i
) {
183 if (sp2d
->_1p_stripes
[i
].alloc
)
184 kfree(sp2d
->_1p_stripes
[i
].pages
);
190 static unsigned _sp2d_min_pg(struct __stripe_pages_2d
*sp2d
)
194 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
195 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
197 if (_1ps
->write_count
)
204 static unsigned _sp2d_max_pg(struct __stripe_pages_2d
*sp2d
)
208 for (p
= sp2d
->pages_in_unit
- 1; p
>= 0; --p
) {
209 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
211 if (_1ps
->write_count
)
218 static void _gen_xor_unit(struct __stripe_pages_2d
*sp2d
)
221 unsigned tx_flags
= ASYNC_TX_ACK
;
223 if (sp2d
->parity
== 1)
224 tx_flags
|= ASYNC_TX_XOR_ZERO_DST
;
226 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
227 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
229 if (!_1ps
->write_count
)
232 init_async_submit(&_1ps
->submit
, tx_flags
,
233 NULL
, NULL
, NULL
, (addr_conv_t
*)_1ps
->scribble
);
235 if (sp2d
->parity
== 1)
236 _1ps
->tx
= async_xor(_1ps
->pages
[sp2d
->data_devs
],
237 _1ps
->pages
, 0, sp2d
->data_devs
,
238 PAGE_SIZE
, &_1ps
->submit
);
239 else /* parity == 2 */
240 _1ps
->tx
= async_gen_syndrome(_1ps
->pages
, 0,
241 sp2d
->data_devs
+ sp2d
->parity
,
242 PAGE_SIZE
, &_1ps
->submit
);
245 for (p
= 0; p
< sp2d
->pages_in_unit
; p
++) {
246 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
247 /* NOTE: We wait for HW synchronously (I don't have such HW
248 * to test with.) Is parallelism needed with today's multi
251 async_tx_issue_pending(_1ps
->tx
);
255 void _ore_add_stripe_page(struct __stripe_pages_2d
*sp2d
,
256 struct ore_striping_info
*si
, struct page
*page
)
258 struct __1_page_stripe
*_1ps
;
262 _1ps
= &sp2d
->_1p_stripes
[si
->cur_pg
];
263 _1ps
->pages
[si
->cur_comp
] = page
;
266 si
->cur_pg
= (si
->cur_pg
+ 1) % sp2d
->pages_in_unit
;
267 /* si->cur_comp is advanced outside at main loop */
270 void _ore_add_sg_seg(struct ore_per_dev_state
*per_dev
, unsigned cur_len
,
273 struct osd_sg_entry
*sge
;
275 ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
276 "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
277 per_dev
->dev
, cur_len
, not_last
, per_dev
->cur_sg
,
278 _LLU(per_dev
->offset
), per_dev
->length
,
279 per_dev
->last_sgs_total
);
281 if (!per_dev
->cur_sg
) {
282 sge
= per_dev
->sglist
;
284 /* First time we prepare two entries */
285 if (per_dev
->length
) {
287 sge
->offset
= per_dev
->offset
;
288 sge
->len
= per_dev
->length
;
290 /* Here the parity is the first unit of this object.
291 * This happens every time we reach a parity device on
292 * the same stripe as the per_dev->offset. We need to
293 * just skip this unit.
295 per_dev
->offset
+= cur_len
;
299 /* finalize the last one */
300 sge
= &per_dev
->sglist
[per_dev
->cur_sg
- 1];
301 sge
->len
= per_dev
->length
- per_dev
->last_sgs_total
;
305 /* Partly prepare the next one */
306 struct osd_sg_entry
*next_sge
= sge
+ 1;
309 next_sge
->offset
= sge
->offset
+ sge
->len
+ cur_len
;
310 /* Save cur len so we know how much was added next time */
311 per_dev
->last_sgs_total
= per_dev
->length
;
313 } else if (!sge
->len
) {
314 /* Optimize for when the last unit is a parity */
319 static int _alloc_read_4_write(struct ore_io_state
*ios
)
321 struct ore_layout
*layout
= ios
->layout
;
323 /* We want to only read those pages not in cache so worst case
324 * is a stripe populated with every other page
326 unsigned sgs_per_dev
= ios
->sp2d
->pages_in_unit
+ 2;
328 ret
= _ore_get_io_state(layout
, ios
->oc
,
329 layout
->group_width
* layout
->mirrors_p1
,
330 sgs_per_dev
, 0, &ios
->ios_read_4_write
);
334 /* @si contains info of the to-be-inserted page. Update of @si should be
335 * maintained by caller. Specifically si->dev, si->obj_offset, ...
337 static int _add_to_r4w(struct ore_io_state
*ios
, struct ore_striping_info
*si
,
338 struct page
*page
, unsigned pg_len
)
340 struct request_queue
*q
;
341 struct ore_per_dev_state
*per_dev
;
342 struct ore_io_state
*read_ios
;
343 unsigned first_dev
= si
->dev
- (si
->dev
%
344 (ios
->layout
->group_width
* ios
->layout
->mirrors_p1
));
345 unsigned comp
= si
->dev
- first_dev
;
348 if (!ios
->ios_read_4_write
) {
349 int ret
= _alloc_read_4_write(ios
);
355 read_ios
= ios
->ios_read_4_write
;
356 read_ios
->numdevs
= ios
->layout
->group_width
* ios
->layout
->mirrors_p1
;
358 per_dev
= &read_ios
->per_dev
[comp
];
359 if (!per_dev
->length
) {
360 per_dev
->bio
= bio_kmalloc(GFP_KERNEL
,
361 ios
->sp2d
->pages_in_unit
);
362 if (unlikely(!per_dev
->bio
)) {
363 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
364 ios
->sp2d
->pages_in_unit
);
367 per_dev
->offset
= si
->obj_offset
;
368 per_dev
->dev
= si
->dev
;
369 } else if (si
->obj_offset
!= (per_dev
->offset
+ per_dev
->length
)) {
370 u64 gap
= si
->obj_offset
- (per_dev
->offset
+ per_dev
->length
);
372 _ore_add_sg_seg(per_dev
, gap
, true);
374 q
= osd_request_queue(ore_comp_dev(read_ios
->oc
, per_dev
->dev
));
375 added_len
= bio_add_pc_page(q
, per_dev
->bio
, page
, pg_len
,
376 si
->obj_offset
% PAGE_SIZE
);
377 if (unlikely(added_len
!= pg_len
)) {
378 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
379 per_dev
->bio
->bi_vcnt
);
383 per_dev
->length
+= pg_len
;
387 /* read the beginning of an unaligned first page */
388 static int _add_to_r4w_first_page(struct ore_io_state
*ios
, struct page
*page
)
390 struct ore_striping_info si
;
393 ore_calc_stripe_info(ios
->layout
, ios
->offset
, 0, &si
);
395 pg_len
= si
.obj_offset
% PAGE_SIZE
;
396 si
.obj_offset
-= pg_len
;
398 ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
399 _LLU(si
.obj_offset
), pg_len
, page
->index
, si
.dev
);
401 return _add_to_r4w(ios
, &si
, page
, pg_len
);
404 /* read the end of an incomplete last page */
405 static int _add_to_r4w_last_page(struct ore_io_state
*ios
, u64
*offset
)
407 struct ore_striping_info si
;
409 unsigned pg_len
, p
, c
;
411 ore_calc_stripe_info(ios
->layout
, *offset
, 0, &si
);
415 page
= ios
->sp2d
->_1p_stripes
[p
].pages
[c
];
417 pg_len
= PAGE_SIZE
- (si
.unit_off
% PAGE_SIZE
);
420 ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
421 p
, c
, _LLU(*offset
), pg_len
, si
.dev
, si
.par_dev
);
425 return _add_to_r4w(ios
, &si
, page
, pg_len
);
428 static void _mark_read4write_pages_uptodate(struct ore_io_state
*ios
, int ret
)
433 /* loop on all devices all pages */
434 for (d
= 0; d
< ios
->numdevs
; d
++) {
435 struct bio
*bio
= ios
->per_dev
[d
].bio
;
440 bio_for_each_segment_all(bv
, bio
, i
) {
441 struct page
*page
= bv
->bv_page
;
443 SetPageUptodate(page
);
445 ClearPageError(page
);
450 /* read_4_write is hacked to read the start of the first stripe and/or
451 * the end of the last stripe. If needed, with an sg-gap at each device/page.
452 * It is assumed to be called after the to_be_written pages of the first stripe
453 * are populating ios->sp2d[][]
455 * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations
456 * These pages are held at sp2d[p].pages[c] but with
457 * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are
458 * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is
459 * @uptodate=true, so we don't need to read it, only unlock, after IO.
461 * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger than
462 * to-be-written count, we should consider the xor-in-place mode.
463 * need_to_read_pages_count is the actual number of pages not present in cache.
464 * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
465 * approximation? In this mode the read pages are put in the empty places of
466 * ios->sp2d[p][*], xor is calculated the same way. These pages are
467 * allocated/freed and don't go through cache
469 static int _read_4_write_first_stripe(struct ore_io_state
*ios
)
471 struct ore_striping_info read_si
;
472 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
473 u64 offset
= ios
->si
.first_stripe_start
;
474 unsigned c
, p
, min_p
= sp2d
->pages_in_unit
, max_p
= -1;
476 if (offset
== ios
->offset
) /* Go to start collect $200 */
477 goto read_last_stripe
;
479 min_p
= _sp2d_min_pg(sp2d
);
480 max_p
= _sp2d_max_pg(sp2d
);
482 ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n",
483 offset
, ios
->offset
, min_p
, max_p
);
486 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
487 read_si
.obj_offset
+= min_p
* PAGE_SIZE
;
488 offset
+= min_p
* PAGE_SIZE
;
489 for (p
= min_p
; p
<= max_p
; p
++) {
490 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
491 struct page
**pp
= &_1ps
->pages
[c
];
495 if (ios
->offset
% PAGE_SIZE
)
496 /* Read the remainder of the page */
497 _add_to_r4w_first_page(ios
, *pp
);
498 /* to-be-written pages start here */
499 goto read_last_stripe
;
502 *pp
= ios
->r4w
->get_page(ios
->private, offset
,
508 _add_to_r4w(ios
, &read_si
, *pp
, PAGE_SIZE
);
510 /* Mark read-pages to be cache_released */
511 _1ps
->page_is_read
[c
] = true;
512 read_si
.obj_offset
+= PAGE_SIZE
;
515 offset
+= (sp2d
->pages_in_unit
- p
) * PAGE_SIZE
;
522 static int _read_4_write_last_stripe(struct ore_io_state
*ios
)
524 struct ore_striping_info read_si
;
525 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
528 unsigned bytes_in_stripe
= ios
->si
.bytes_in_stripe
;
529 unsigned c
, p
, min_p
= sp2d
->pages_in_unit
, max_p
= -1;
531 offset
= ios
->offset
+ ios
->length
;
532 if (offset
% PAGE_SIZE
)
533 _add_to_r4w_last_page(ios
, &offset
);
534 /* offset will be aligned to next page */
536 last_stripe_end
= div_u64(offset
+ bytes_in_stripe
- 1, bytes_in_stripe
)
538 if (offset
== last_stripe_end
) /* Optimize for the aligned case */
541 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
543 c
= read_si
.cur_comp
;
545 if (min_p
== sp2d
->pages_in_unit
) {
546 /* Didn't do it yet */
547 min_p
= _sp2d_min_pg(sp2d
);
548 max_p
= _sp2d_max_pg(sp2d
);
551 ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n",
552 offset
, last_stripe_end
, min_p
, max_p
);
554 while (offset
< last_stripe_end
) {
555 struct __1_page_stripe
*_1ps
= &sp2d
->_1p_stripes
[p
];
557 if ((min_p
<= p
) && (p
<= max_p
)) {
561 BUG_ON(_1ps
->pages
[c
]);
562 page
= ios
->r4w
->get_page(ios
->private, offset
,
567 _1ps
->pages
[c
] = page
;
568 /* Mark read-pages to be cache_released */
569 _1ps
->page_is_read
[c
] = true;
571 _add_to_r4w(ios
, &read_si
, page
, PAGE_SIZE
);
575 if (p
== (sp2d
->pages_in_unit
- 1)) {
578 ore_calc_stripe_info(ios
->layout
, offset
, 0, &read_si
);
580 read_si
.obj_offset
+= PAGE_SIZE
;
589 static int _read_4_write_execute(struct ore_io_state
*ios
)
591 struct ore_io_state
*ios_read
;
595 ios_read
= ios
->ios_read_4_write
;
599 /* FIXME: Ugly to signal _ore_read_mirror that we have bio(s). Change
600 * to check for per_dev->bio
602 ios_read
->pages
= ios
->pages
;
604 /* Now read these devices */
605 for (i
= 0; i
< ios_read
->numdevs
; i
+= ios_read
->layout
->mirrors_p1
) {
606 ret
= _ore_read_mirror(ios_read
, i
);
611 ret
= ore_io_execute(ios_read
); /* Synchronus execution */
613 ORE_DBGMSG("!! ore_io_execute => %d\n", ret
);
617 _mark_read4write_pages_uptodate(ios_read
, ret
);
618 ore_put_io_state(ios_read
);
619 ios
->ios_read_4_write
= NULL
; /* Might need a reuse at last stripe */
623 /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity U */
624 int _ore_add_parity_unit(struct ore_io_state
*ios
,
625 struct ore_striping_info
*si
,
626 struct ore_per_dev_state
*per_dev
,
627 unsigned cur_len
, bool do_xor
)
630 if (per_dev
->cur_sg
>= ios
->sgs_per_dev
) {
631 ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
632 per_dev
->cur_sg
, ios
->sgs_per_dev
);
635 _ore_add_sg_seg(per_dev
, cur_len
, true);
637 struct __stripe_pages_2d
*sp2d
= ios
->sp2d
;
638 struct page
**pages
= ios
->parity_pages
+ ios
->cur_par_page
;
640 unsigned array_start
= 0;
644 si
->cur_pg
= _sp2d_min_pg(sp2d
);
645 num_pages
= _sp2d_max_pg(sp2d
) + 1 - si
->cur_pg
;
647 if (!per_dev
->length
) {
648 per_dev
->offset
+= si
->cur_pg
* PAGE_SIZE
;
649 /* If first stripe, Read in all read4write pages
650 * (if needed) before we calculate the first parity.
653 _read_4_write_first_stripe(ios
);
655 if (!cur_len
&& do_xor
)
656 /* If last stripe, read in the r4w pages of the last stripe */
657 _read_4_write_last_stripe(ios
);
658 _read_4_write_execute(ios
);
660 for (i
= 0; i
< num_pages
; i
++) {
661 pages
[i
] = _raid_page_alloc();
662 if (unlikely(!pages
[i
]))
665 ++(ios
->cur_par_page
);
668 BUG_ON(si
->cur_comp
< sp2d
->data_devs
);
669 BUG_ON(si
->cur_pg
+ num_pages
> sp2d
->pages_in_unit
);
671 ret
= _ore_add_stripe_unit(ios
, &array_start
, 0, pages
,
672 per_dev
, num_pages
* PAGE_SIZE
);
678 _sp2d_reset(sp2d
, ios
->r4w
, ios
->private);
684 int _ore_post_alloc_raid_stuff(struct ore_io_state
*ios
)
686 if (ios
->parity_pages
) {
687 struct ore_layout
*layout
= ios
->layout
;
688 unsigned pages_in_unit
= layout
->stripe_unit
/ PAGE_SIZE
;
690 if (_sp2d_alloc(pages_in_unit
, layout
->group_width
,
691 layout
->parity
, &ios
->sp2d
)) {
698 void _ore_free_raid_stuff(struct ore_io_state
*ios
)
700 if (ios
->sp2d
) { /* writing and raid */
703 for (i
= 0; i
< ios
->cur_par_page
; i
++) {
704 struct page
*page
= ios
->parity_pages
[i
];
707 _raid_page_free(page
);
709 if (ios
->extra_part_alloc
)
710 kfree(ios
->parity_pages
);
711 /* If IO returned an error pages might need unlocking */
712 _sp2d_reset(ios
->sp2d
, ios
->r4w
, ios
->private);
713 _sp2d_free(ios
->sp2d
);
715 /* Will only be set if raid reading && sglist is big */
716 if (ios
->extra_part_alloc
)
717 kfree(ios
->per_dev
[0].sglist
);
719 if (ios
->ios_read_4_write
)
720 ore_put_io_state(ios
->ios_read_4_write
);