/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */
/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *         +-------------------+
 *         | ABD (linear)      |
 *         |   abd_flags = ... |
 *         |   abd_size = ...  |     +--------------------------------+
 *         |   abd_buf ------------->| raw buffer of size abd_size    |
 *         +-------------------+     +--------------------------------+
 *              no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *         +-------------------+
 *         | ABD (scattered)   |
 *         |   abd_flags = ... |
 *         |   abd_size = ...  |
 *         |   abd_offset = 0  |                           +-----------+
 *         |   abd_chunks[0] ----------------------------->| chunk 0   |
 *         |   abd_chunks[1] ---------------------+        +-----------+
 *         |   ...             |                  |        +-----------+
 *         |   abd_chunks[N-1] ---------+         +------->| chunk 1   |
 *         +-------------------+        |                  +-----------+
 *                                      |                      ...
 *                                      |                  +-----------+
 *                                      +----------------->| chunk N-1 |
 *                                                         +-----------+
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_pages() for details.
 *
 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled
 * to B_FALSE.
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and
 * the original ABD's abd_children refcount is incremented. This data allows
 * us to ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
 *
 * If you need to use the data within an ABD directly, if you know it's linear
 * (because you allocated it) you can use abd_to_buf() to access the underlying
 * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
 * which will allocate a raw buffer if necessary. Use the abd_return_buf*
 * functions to return any raw buffers that are no longer necessary when you're
 * done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 */
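
/*
 * Illustrative usage sketch (not part of this file's API surface): a consumer
 * that needs a contiguous view of an ABD's contents, without caring whether
 * the ABD is linear or scattered, can borrow and return a raw buffer.  The
 * variable "size" below stands for whatever size the caller chose:
 *
 *	abd_t *abd = abd_alloc(size, B_FALSE);
 *	void *buf = abd_borrow_buf_copy(abd, abd->abd_size);
 *	... read or modify the contiguous copy ...
 *	abd_return_buf_copy(abd, buf, abd->abd_size);
 *	abd_free(abd);
 */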
#include <sys/abd.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/scatterlist.h>
#include <linux/kmap_compat.h>
#else
#define	MAX_ORDER	1
#endif
typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;
static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order.  These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
};
#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
#define	ABDSTAT_INCR(stat, val) \
	atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)

#define	ABD_SCATTER(abd)	(abd->abd_u.abd_scatter)
#define	ABD_BUF(abd)		(abd->abd_u.abd_linear.abd_buf)
#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/* see block comment above for description */
int zfs_abd_scatter_enabled = B_TRUE;
unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's.  Smaller allocations will use linear ABD's, which are allocated
 * with zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page).  Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;
static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}
#ifdef _KERNEL
#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone.  Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction.  When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
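/*
 * Worked example (illustrative numbers, assuming 4KB pages): a 128KB request
 * gives nr_pages = 32, so the first attempt asks for a single order-5
 * compound page (MIN(highbit64(32) - 1, max_order) = 5).  If that allocation
 * cannot be satisfied, max_order is lowered and the remainder is filled with
 * progressively smaller chunks, down to order-0 pages if necessary.
 */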
static void
abd_alloc_pages(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	while (alloc_pages < nr_pages) {
		unsigned chunk_pages;
		int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer.  All single-page (4K) ABD's can be
		 * represented this way.  Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages).  This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		abd->abd_u.abd_linear.abd_buf =
		    page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}

#else
/*
 * Allocate N individual pages to construct a scatter ABD.  This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces.  It's designed for maximum compatibility.
 */
static void
abd_alloc_pages(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */
/*
 * Free the scatter pages referenced by an ABD's sg_table, then the table
 * itself.
 */
static void
abd_free_pages(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = nr_pages;
	sg_free_table(&table);
}
#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

struct page;

#define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
#define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}
static void
abd_alloc_pages(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

static void
abd_free_pages(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		for (int j = 0; j < sg->length; j += PAGESIZE) {
			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
			umem_free(p, PAGESIZE);
		}
	}

	vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
}

#endif /* _KERNEL */
void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);

		for (i = 0; i < MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
	}
}

void
abd_fini(void)
{
	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}
static void
abd_verify(abd_t *abd)
{
	ASSERT3U(abd->abd_size, >, 0);
	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
	} else {
		size_t n;
		int i = 0;
		struct scatterlist *sg = NULL;

		ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
		ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
		    ABD_SCATTER(abd).abd_sgl->length);
		n = ABD_SCATTER(abd).abd_nents;
		abd_for_each_sg(abd, sg, n, i) {
			ASSERT3P(sg_page(sg), !=, NULL);
		}
	}
}
static inline abd_t *
abd_alloc_struct(void)
{
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);

	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

static inline void
abd_free_struct(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}
/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	/* see the comment above zfs_abd_scatter_min_size */
	if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd_t *abd = abd_alloc_struct();
	abd->abd_flags = ABD_FLAG_OWNER;
	abd->abd_u.abd_scatter.abd_offset = 0;
	abd_alloc_pages(abd, size);

	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    P2ROUNDUP(size, PAGESIZE) - size);

	return (abd);
}
static void
abd_free_scatter(abd_t *abd)
{
	abd_free_pages(abd);

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE));

	abd_free_struct(abd);
}
/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct();

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	if (is_metadata) {
		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
	} else {
		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, size);

	return (abd);
}
static void
abd_free_linear(abd_t *abd)
{
	if (abd_is_linear_page(abd)) {
		/* Transform it back into a scatter ABD for freeing */
		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
		abd->abd_flags &= ~ABD_FLAG_LINEAR;
		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
		ABD_SCATTER(abd).abd_nents = 1;
		ABD_SCATTER(abd).abd_offset = 0;
		ABD_SCATTER(abd).abd_sgl = sg;
		abd_free_scatter(abd);
		return;
	}
	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	} else {
		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);

	abd_free_struct(abd);
}
/*
 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
 * abd_alloc_linear().
 */
void
abd_free(abd_t *abd)
{
	abd_verify(abd);
	ASSERT3P(abd->abd_parent, ==, NULL);
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd))
		abd_free_linear(abd);
	else
		abd_free_scatter(abd);
}
/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd) &&
	    !abd_is_linear_page(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}
/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Illumos this is linear ABDs; however, if ldi_strategy() can ever issue
 * I/Os using a scatter/gather list we should switch to that and replace this
 * call with vanilla abd_alloc().
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages, thereby eliminating
 * the copy.  But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}
/*
 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
 * any derived ABDs exist.
 */
static inline abd_t *
abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
{
	abd_t *abd;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		abd = abd_alloc_struct();

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = ABD_FLAG_LINEAR;

		abd->abd_u.abd_linear.abd_buf =
		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
	} else {
		int i = 0;
		struct scatterlist *sg = NULL;
		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;

		abd = abd_alloc_struct();

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = 0;

		abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
			if (new_offset < sg->length)
				break;
			new_offset -= sg->length;
		}

		ABD_SCATTER(abd).abd_sgl = sg;
		ABD_SCATTER(abd).abd_offset = new_offset;
		ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
	}

	abd->abd_size = size;
	abd->abd_parent = sabd;
	zfs_refcount_create(&abd->abd_children);
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);

	return (abd);
}

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;

	VERIFY3U(size, >, 0);

	return (abd_get_offset_impl(sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);

	return (abd_get_offset_impl(sabd, off, size));
}
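
/*
 * Illustrative sketch (not part of this file): reading the second half of a
 * 128KB ABD named "src" through a temporary offset view, without copying the
 * underlying data:
 *
 *	abd_t *half = abd_get_offset_size(src, 64 << 10, 64 << 10);
 *	... use half like any other ABD ...
 *	abd_put(half);	(src must outlive the derived ABD)
 */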
/*
 * Allocate a linear ABD structure for buf. You must free this with abd_put()
 * since the resulting ABD doesn't own its own buffer.
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct();

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags = ABD_FLAG_LINEAR;
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_linear.abd_buf = buf;

	return (abd);
}
/*
 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
 * free the underlying scatterlist or buffer.
 */
void
abd_put(abd_t *abd)
{
	abd_verify(abd);
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));

	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}

	zfs_refcount_destroy(&abd->abd_children);
	abd_free_struct(abd);
}
/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (abd->abd_u.abd_linear.abd_buf);
}
/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else {
		buf = zio_buf_alloc(n);
	}
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);

	return (buf);
}
void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);
	if (!abd_is_linear(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}
/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD and will ASSERT that you didn't modify
 * the buffer since it was borrowed. If you want any changes you made to buf to
 * be copied back to abd, use abd_return_buf_copy() instead.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	}
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
}
void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}
/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);

	/*
	 * abd_free() needs to handle LINEAR_PAGE ABD's specially.
	 * Since that flag does not survive the
	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
	 * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
	 */
	ASSERT(!abd_is_linear_page(abd));

	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
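
/*
 * Illustrative sketch (not part of this file): wrapping an existing raw
 * buffer in an ABD and then transferring ownership of that buffer to it, as
 * described above abd_take_ownership_of_buf().  "size" stands for a size
 * chosen by the caller:
 *
 *	void *buf = zio_buf_alloc(size);
 *	abd_t *abd = abd_get_from_buf(buf, size);
 *	abd_take_ownership_of_buf(abd, B_TRUE);
 *	...
 *	abd_free(abd);	(frees buf as well, since the ABD now owns it)
 */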
#ifndef HAVE_1ARG_KMAP_ATOMIC
#define	NR_KM_TYPE (6)
#ifdef _KERNEL
int km_table[NR_KM_TYPE] = {
	KM_USER0,
	KM_USER1,
	KM_BIO_SRC_IRQ,
	KM_BIO_DST_IRQ,
	KM_PTE0,
	KM_PTE1,
};
#endif
#endif

struct abd_iter {
	/* public interface */
	void	*iter_mapaddr;	/* addr corresponding to iter_pos */
	size_t	iter_mapsize;	/* length of data valid at mapaddr */

	/* private */
	abd_t	*iter_abd;	/* ABD being iterated through */
	size_t	iter_pos;
	size_t	iter_offset;	/* offset in current sg/abd_buf, */
				/* abd_offset included */
	struct scatterlist *iter_sg;	/* current sg */
#ifndef HAVE_1ARG_KMAP_ATOMIC
	int	iter_km;	/* KM_* for kmap_atomic */
#endif
};
/*
 * Initialize the abd_iter.
 */
static void
abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
{
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
	aiter->iter_pos = 0;
	if (abd_is_linear(abd)) {
		aiter->iter_offset = 0;
		aiter->iter_sg = NULL;
	} else {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
#ifndef HAVE_1ARG_KMAP_ATOMIC
	ASSERT3U(km_type, <, NR_KM_TYPE);
	aiter->iter_km = km_type;
#endif
}
/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter has already been
 * exhausted, in which case this does nothing.
 */
static void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}
/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
static void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
		    km_table[aiter->iter_km]);
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}
/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
static void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
		    km_table[aiter->iter_km]);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}
int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	int ret = 0;
	struct abd_iter aiter;

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_iter_init(&aiter, abd, 0);
	abd_iter_advance(&aiter, off);

	while (size > 0) {
		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (ret);
}
struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}
static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}
static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}
/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called
 * many times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	abd_iter_init(&daiter, dabd, 0);
	abd_iter_init(&saiter, sabd, 1);
	abd_iter_advance(&daiter, doff);
	abd_iter_advance(&saiter, soff);

	while (size > 0) {
		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&daiter, len);
		abd_iter_advance(&saiter, len);
	}

	return (ret);
}
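
/*
 * Illustrative sketch (not part of this file): a consumer-defined callback
 * that XORs a source ABD into a destination ABD via abd_iterate_func2().
 * The callback name "xor_cb" and its use here are hypothetical:
 *
 *	static int
 *	xor_cb(void *dbuf, void *sbuf, size_t size, void *private)
 *	{
 *		uint8_t *d = dbuf;
 *		const uint8_t *s = sbuf;
 *		for (size_t i = 0; i < size; i++)
 *			d[i] ^= s[i];
 *		return (0);
 *	}
 *
 *	(void) abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
 *	    xor_cb, NULL);
 */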
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the contents of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd)
{
	ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
	return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
	    abd_cmp_cb, NULL));
}
/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds          parity ABDs, must have equal size
 * @dabd           data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen should be implemented so that its behaviour
 *                 is the same when taking linear and when taking scatter
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
    ssize_t csize, ssize_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	ssize_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter = {0};
	void *caddrs[3];
	unsigned long flags;

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++)
		abd_iter_init(&caiters[i], cabds[i], i);

	if (dabd)
		abd_iter_init(&daiter, dabd, i);

	ASSERT3S(dsize, >=, 0);

	local_irq_save(flags);
	while (csize > 0) {
		len = csize;

		if (dabd && dsize > 0)
			abd_iter_map(&daiter);

		for (i = 0; i < parity; i++) {
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
		}

		switch (parity) {
			case 3:
				len = MIN(caiters[2].iter_mapsize, len);
				/* falls through */
			case 2:
				len = MIN(caiters[1].iter_mapsize, len);
				/* falls through */
			case 1:
				len = MIN(caiters[0].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);

		if (dabd && dsize > 0) {
			/* this needs precise iter.length */
			len = MIN(daiter.iter_mapsize, len);
			dlen = len;
		} else
			dlen = 0;

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not multiple of 512 (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			abd_iter_advance(&caiters[i], len);
		}

		if (dabd && dsize > 0) {
			abd_iter_unmap(&daiter);
			abd_iter_advance(&daiter, dlen);
			dsize -= dlen;
		}

		csize -= len;

		ASSERT3S(dsize, >=, 0);
		ASSERT3S(csize, >=, 0);
	}
	local_irq_restore(flags);
}
/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds           parity ABDs, must have equal size
 * @tabds           rec target ABDs, at most 3
 * @tsize           size of data target columns
 * @func_raidz_rec  expects syndrome data in target columns. Function
 *                  reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    ssize_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	ssize_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];
	unsigned long flags;

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_iter_init(&citers[i], cabds[i], 2*i);
		abd_iter_init(&xiters[i], tabds[i], 2*i+1);
	}

	local_irq_save(flags);
	while (tsize > 0) {

		for (i = 0; i < parity; i++) {
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
		}

		len = tsize;
		switch (parity) {
			case 3:
				len = MIN(xiters[2].iter_mapsize, len);
				len = MIN(citers[2].iter_mapsize, len);
				/* falls through */
			case 2:
				len = MIN(xiters[1].iter_mapsize, len);
				len = MIN(citers[1].iter_mapsize, len);
				/* falls through */
			case 1:
				len = MIN(xiters[0].iter_mapsize, len);
				len = MIN(citers[0].iter_mapsize, len);
		}
		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not multiple of 512 (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			abd_iter_advance(&xiters[i], len);
			abd_iter_advance(&citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
	local_irq_restore(flags);
}
#if defined(_KERNEL)
/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = abd->abd_u.abd_scatter.abd_offset + off;

	return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT);
}
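
/*
 * Worked example (illustrative numbers, assuming 4KB pages): an ABD whose
 * data starts 1KB into its first page, asked about a 6KB range at off = 0,
 * gives pos = 1KB, so ((1K + 6K + 4095) >> 12) - (1K >> 12) = 2 - 0 = 2
 * pages spanned.
 */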
/*
 * bio_map for scatter ABD.
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	int i;
	struct abd_iter aiter;

	ASSERT(!abd_is_linear(abd));
	ASSERT3U(io_size, <=, abd->abd_size - off);

	abd_iter_init(&aiter, abd, 0);
	abd_iter_advance(&aiter, off);

	for (i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}
/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");
#endif /* _KERNEL */