module/os/linux/zfs/abd_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  23  * Copyright (c) 2019 by Delphix. All rights reserved.
  24  * Copyright (c) 2023, 2024, Klara Inc.
  25  */
  26
  27 /*
  28  * See abd.c for a general overview of the arc buffered data (ABD).
  29  *
  30  * Linear buffers act exactly like normal buffers and are always mapped into the
  31  * kernel's virtual memory space, while scattered ABD data chunks are allocated
  32  * as physical pages and then mapped in only while they are actually being
  33  * accessed through one of the abd_* library functions. Using scattered ABDs
  34  * provides several benefits:
  35  *
  36  *  (1) They avoid use of kmem_*, preventing performance problems where running
  37  *      kmem_reap on very large memory systems never finishes and causes
  38  *      constant TLB shootdowns.
  39  *
  40  *  (2) Fragmentation is less of an issue since when we are at the limit of
  41  *      allocatable space, we won't have to search around for a long free
  42  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
  43  *      individually, so even if we are using HIGHMEM (see next point) we
  44  *      wouldn't need to worry about finding a contiguous address range.
  45  *
  46  *  (3) If we are not using HIGHMEM, then all physical memory is always
  47  *      mapped into the kernel's address space, so we also avoid the map /
  48  *      unmap costs on each ABD access.
  49  *
  50  * If we are not using HIGHMEM, scattered buffers which have only one chunk
  51  * can be treated as linear buffers, because they are contiguous in the
  52  * kernel's virtual address space.  See abd_alloc_chunks() for details.
  53  */
  54
  55 #include <sys/abd_impl.h>
  56 #include <sys/param.h>
  57 #include <sys/zio.h>
  58 #include <sys/arc.h>
  59 #include <sys/zfs_context.h>
  60 #include <sys/zfs_znode.h>
  61 #include <linux/kmap_compat.h>
  62 #include <linux/mm_compat.h>
  63 #include <linux/scatterlist.h>
  64 #include <linux/version.h>
  65
  66 #if defined(MAX_ORDER)
  67 #define ABD_MAX_ORDER   (MAX_ORDER)
  68 #elif defined(MAX_PAGE_ORDER)
  69 #define ABD_MAX_ORDER   (MAX_PAGE_ORDER)
  70 #endif
  71
  72 typedef struct abd_stats {
  73         kstat_named_t abdstat_struct_size;
  74         kstat_named_t abdstat_linear_cnt;
  75         kstat_named_t abdstat_linear_data_size;
  76         kstat_named_t abdstat_scatter_cnt;
  77         kstat_named_t abdstat_scatter_data_size;
  78         kstat_named_t abdstat_scatter_chunk_waste;
  79         kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
  80         kstat_named_t abdstat_scatter_page_multi_chunk;
  81         kstat_named_t abdstat_scatter_page_multi_zone;
  82         kstat_named_t abdstat_scatter_page_alloc_retry;
  83         kstat_named_t abdstat_scatter_sg_table_retry;
  84 } abd_stats_t;
  85
  86 static abd_stats_t abd_stats = {
  87         /* Amount of memory occupied by all of the abd_t struct allocations */
  88         { "struct_size",                        KSTAT_DATA_UINT64 },
  89         /*
  90          * The number of linear ABDs which are currently allocated, excluding
  91          * ABDs which don't own their data (for instance the ones which were
  92          * allocated through abd_get_offset() and abd_get_from_buf()). If an
  93          * ABD takes ownership of its buf then it will become tracked.
  94          */
  95         { "linear_cnt",                         KSTAT_DATA_UINT64 },
  96         /* Amount of data stored in all linear ABDs tracked by linear_cnt */
  97         { "linear_data_size",                   KSTAT_DATA_UINT64 },
  98         /*
  99          * The number of scatter ABDs which are currently allocated, excluding
 100          * ABDs which don't own their data (for instance the ones which were
 101          * allocated through abd_get_offset()).
 102          */
 103         { "scatter_cnt",                        KSTAT_DATA_UINT64 },
 104         /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
 105         { "scatter_data_size",                  KSTAT_DATA_UINT64 },
 106         /*
 107          * The amount of space wasted at the end of the last chunk across all
 108          * scatter ABDs tracked by scatter_cnt.
 109          */
 110         { "scatter_chunk_waste",                KSTAT_DATA_UINT64 },
 111         /*
 112          * The number of compound allocations of a given order.  These
 113          * allocations are spread over all currently allocated ABDs, and
 114          * act as a measure of memory fragmentation.
 115          */
 116         { { "scatter_order_N",                  KSTAT_DATA_UINT64 } },
 117         /*
 118          * The number of scatter ABDs which contain multiple chunks.
 119          * ABDs are preferentially allocated from the minimum number of
 120          * contiguous multi-page chunks, a single chunk is optimal.
 121          */
 122         { "scatter_page_multi_chunk",           KSTAT_DATA_UINT64 },
 123         /*
 124          * The number of scatter ABDs which are split across memory zones.
 125          * ABDs are preferentially allocated using pages from a single zone.
 126          */
 127         { "scatter_page_multi_zone",            KSTAT_DATA_UINT64 },
 128         /*
 129          *  The total number of retries encountered when attempting to
 130          *  allocate the pages to populate the scatter ABD.
 131          */
 132         { "scatter_page_alloc_retry",           KSTAT_DATA_UINT64 },
 133         /*
 134          *  The total number of retries encountered when attempting to
 135          *  allocate the sg table for an ABD.
 136          */
 137         { "scatter_sg_table_retry",             KSTAT_DATA_UINT64 },
 138 };
 139
 140 static struct {
 141         wmsum_t abdstat_struct_size;
 142         wmsum_t abdstat_linear_cnt;
 143         wmsum_t abdstat_linear_data_size;
 144         wmsum_t abdstat_scatter_cnt;
 145         wmsum_t abdstat_scatter_data_size;
 146         wmsum_t abdstat_scatter_chunk_waste;
 147         wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
 148         wmsum_t abdstat_scatter_page_multi_chunk;
 149         wmsum_t abdstat_scatter_page_multi_zone;
 150         wmsum_t abdstat_scatter_page_alloc_retry;
 151         wmsum_t abdstat_scatter_sg_table_retry;
 152 } abd_sums;
 153
 154 #define abd_for_each_sg(abd, sg, n, i)  \
 155         for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
 156
 157 /*
 158  * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 159  * ABD's.  Smaller allocations will use linear ABD's which uses
 160  * zio_[data_]buf_alloc().
 161  *
 162  * Scatter ABD's use at least one page each, so sub-page allocations waste
 163  * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 164  * half of each page).  Using linear ABD's for small allocations means that
 165  * they will be put on slabs which contain many allocations.  This can
 166  * improve memory efficiency, but it also makes it much harder for ARC
 167  * evictions to actually free pages, because all the buffers on one slab need
 168  * to be freed in order for the slab (and underlying pages) to be freed.
 169  * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 170  * possible for them to actually waste more memory than scatter (one page per
 171  * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 172  *
 173  * Spill blocks are typically 512B and are heavily used on systems running
 174  * selinux with the default dnode size and the `xattr=sa` property set.
 175  *
 176  * By default we use linear allocations for 512B and 1KB, and scatter
 177  * allocations for larger (1.5KB and up).
 178  */
 179 static int zfs_abd_scatter_min_size = 512 * 3;
 180
 181 /*
 182  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 183  * just a single zero'd page. This allows us to conserve memory by
 184  * only using a single zero page for the scatterlist.
 185  */
 186 abd_t *abd_zero_scatter = NULL;
 187
 188 struct page;
 189
 190 /*
 191  * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
 192  * point to ZERO_PAGE if it is available or it will be an allocated zero'd
 193  * PAGESIZE buffer.
 194  */
 195 static struct page *abd_zero_page = NULL;
 196
 197 static kmem_cache_t *abd_cache = NULL;
 198 static kstat_t *abd_ksp;
 199
 200 static uint_t
 201 abd_chunkcnt_for_bytes(size_t size)
 202 {
 203         return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
 204 }
 205
 206 abd_t *
 207 abd_alloc_struct_impl(size_t size)
 208 {
 209         /*
 210          * In Linux we do not use the size passed in during ABD
 211          * allocation, so we just ignore it.
 212          */
 213         (void) size;
 214         abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
 215         ASSERT3P(abd, !=, NULL);
 216         ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
 217
 218         return (abd);
 219 }
 220
 221 void
 222 abd_free_struct_impl(abd_t *abd)
 223 {
 224         kmem_cache_free(abd_cache, abd);
 225         ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
 226 }
 227
 228 static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
 229
 230 /*
 231  * Mark zfs data pages so they can be excluded from kernel crash dumps
 232  */
 233 #ifdef _LP64
 234 #define ABD_FILE_CACHE_PAGE     0x2F5ABDF11ECAC4E
 235
 236 static inline void
 237 abd_mark_zfs_page(struct page *page)
 238 {
 239         get_page(page);
 240         SetPagePrivate(page);
 241         set_page_private(page, ABD_FILE_CACHE_PAGE);
 242 }
 243
 244 static inline void
 245 abd_unmark_zfs_page(struct page *page)
 246 {
 247         set_page_private(page, 0UL);
 248         ClearPagePrivate(page);
 249         put_page(page);
 250 }
 251 #else
 252 #define abd_mark_zfs_page(page)
 253 #define abd_unmark_zfs_page(page)
 254 #endif /* _LP64 */
 255
 256 #ifndef CONFIG_HIGHMEM
 257
 258 #ifndef __GFP_RECLAIM
 259 #define __GFP_RECLAIM           __GFP_WAIT
 260 #endif
 261
 262 /*
 263  * The goal is to minimize fragmentation by preferentially populating ABDs
 264  * with higher order compound pages from a single zone.  Allocation size is
 265  * progressively decreased until it can be satisfied without performing
 266  * reclaim or compaction.  When necessary this function will degenerate to
 267  * allocating individual pages and allowing reclaim to satisfy allocations.
 268  */
 269 void
 270 abd_alloc_chunks(abd_t *abd, size_t size)
 271 {
 272         struct list_head pages;
 273         struct sg_table table;
 274         struct scatterlist *sg;
 275         struct page *page, *tmp_page = NULL;
 276         gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 277         gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
 278         unsigned int max_order = MIN(zfs_abd_scatter_max_order,
 279             ABD_MAX_ORDER - 1);
 280         unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
 281         unsigned int chunks = 0, zones = 0;
 282         size_t remaining_size;
 283         int nid = NUMA_NO_NODE;
 284         unsigned int alloc_pages = 0;
 285
 286         INIT_LIST_HEAD(&pages);
 287
 288         ASSERT3U(alloc_pages, <, nr_pages);
 289
 290         while (alloc_pages < nr_pages) {
 291                 unsigned int chunk_pages;
 292                 unsigned int order;
 293
 294                 order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 295                 chunk_pages = (1U << order);
 296
 297                 page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
 298                 if (page == NULL) {
 299                         if (order == 0) {
 300                                 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 301                                 schedule_timeout_interruptible(1);
 302                         } else {
 303                                 max_order = MAX(0, order - 1);
 304                         }
 305                         continue;
 306                 }
 307
 308                 list_add_tail(&page->lru, &pages);
 309
 310                 if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
 311                         zones++;
 312
 313                 nid = page_to_nid(page);
 314                 ABDSTAT_BUMP(abdstat_scatter_orders[order]);
 315                 chunks++;
 316                 alloc_pages += chunk_pages;
 317         }
 318
 319         ASSERT3S(alloc_pages, ==, nr_pages);
 320
 321         while (sg_alloc_table(&table, chunks, gfp)) {
 322                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 323                 schedule_timeout_interruptible(1);
 324         }
 325
 326         sg = table.sgl;
 327         remaining_size = size;
 328         list_for_each_entry_safe(page, tmp_page, &pages, lru) {
 329                 size_t sg_size = MIN(PAGESIZE << compound_order(page),
 330                     remaining_size);
 331                 sg_set_page(sg, page, sg_size, 0);
 332                 abd_mark_zfs_page(page);
 333                 remaining_size -= sg_size;
 334
 335                 sg = sg_next(sg);
 336                 list_del(&page->lru);
 337         }
 338
 339         /*
 340          * These conditions ensure that a possible transformation to a linear
 341          * ABD would be valid.
 342          */
 343         ASSERT(!PageHighMem(sg_page(table.sgl)));
 344         ASSERT0(ABD_SCATTER(abd).abd_offset);
 345
 346         if (table.nents == 1) {
 347                 /*
 348                  * Since there is only one entry, this ABD can be represented
 349                  * as a linear buffer.  All single-page (4K) ABD's can be
 350                  * represented this way.  Some multi-page ABD's can also be
 351                  * represented this way, if we were able to allocate a single
 352                  * "chunk" (higher-order "page" which represents a power-of-2
 353                  * series of physically-contiguous pages).  This is often the
 354                  * case for 2-page (8K) ABD's.
 355                  *
 356                  * Representing a single-entry scatter ABD as a linear ABD
 357                  * has the performance advantage of avoiding the copy (and
 358                  * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
 359                  * A performance increase of around 5% has been observed for
 360                  * ARC-cached reads (of small blocks which can take advantage
 361                  * of this).
 362                  *
 363                  * Note that this optimization is only possible because the
 364                  * pages are always mapped into the kernel's address space.
 365                  * This is not the case for highmem pages, so the
 366                  * optimization can not be made there.
 367                  */
 368                 abd->abd_flags |= ABD_FLAG_LINEAR;
 369                 abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
 370                 abd->abd_u.abd_linear.abd_sgl = table.sgl;
 371                 ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
 372         } else if (table.nents > 1) {
 373                 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 374                 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 375
 376                 if (zones) {
 377                         ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 378                         abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 379                 }
 380
 381                 ABD_SCATTER(abd).abd_sgl = table.sgl;
 382                 ABD_SCATTER(abd).abd_nents = table.nents;
 383         }
 384 }
 385 #else
 386
 387 /*
 388  * Allocate N individual pages to construct a scatter ABD.  This function
 389  * makes no attempt to request contiguous pages and requires the minimal
 390  * number of kernel interfaces.  It's designed for maximum compatibility.
 391  */
 392 void
 393 abd_alloc_chunks(abd_t *abd, size_t size)
 394 {
 395         struct scatterlist *sg = NULL;
 396         struct sg_table table;
 397         struct page *page;
 398         gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 399         int nr_pages = abd_chunkcnt_for_bytes(size);
 400         int i = 0;
 401
 402         while (sg_alloc_table(&table, nr_pages, gfp)) {
 403                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 404                 schedule_timeout_interruptible(1);
 405         }
 406
 407         ASSERT3U(table.nents, ==, nr_pages);
 408         ABD_SCATTER(abd).abd_sgl = table.sgl;
 409         ABD_SCATTER(abd).abd_nents = nr_pages;
 410
 411         abd_for_each_sg(abd, sg, nr_pages, i) {
 412                 while ((page = __page_cache_alloc(gfp)) == NULL) {
 413                         ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 414                         schedule_timeout_interruptible(1);
 415                 }
 416
 417                 ABDSTAT_BUMP(abdstat_scatter_orders[0]);
 418                 sg_set_page(sg, page, PAGESIZE, 0);
 419                 abd_mark_zfs_page(page);
 420         }
 421
 422         if (nr_pages > 1) {
 423                 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 424                 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 425         }
 426 }
 427 #endif /* !CONFIG_HIGHMEM */
 428
 429 /*
 430  * This must be called if any of the sg_table allocation functions
 431  * are called.
 432  */
 433 static void
 434 abd_free_sg_table(abd_t *abd)
 435 {
 436         struct sg_table table;
 437
 438         table.sgl = ABD_SCATTER(abd).abd_sgl;
 439         table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
 440         sg_free_table(&table);
 441 }
 442
 443 void
 444 abd_free_chunks(abd_t *abd)
 445 {
 446         struct scatterlist *sg = NULL;
 447         struct page *page;
 448         int nr_pages = ABD_SCATTER(abd).abd_nents;
 449         int order, i = 0;
 450
 451         if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
 452                 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
 453
 454         if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
 455                 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 456
 457         /*
 458          * Scatter ABDs may be constructed by abd_alloc_from_pages() from
 459          * an array of pages. In which case they should not be freed.
 460          */
 461         if (!abd_is_from_pages(abd)) {
 462                 abd_for_each_sg(abd, sg, nr_pages, i) {
 463                         page = sg_page(sg);
 464                         abd_unmark_zfs_page(page);
 465                         order = compound_order(page);
 466                         __free_pages(page, order);
 467                         ASSERT3U(sg->length, <=, PAGE_SIZE << order);
 468                         ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
 469                 }
 470         }
 471
 472         abd_free_sg_table(abd);
 473 }
 474
 475 /*
 476  * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 477  * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 478  */
 479 static void
 480 abd_alloc_zero_scatter(void)
 481 {
 482         struct scatterlist *sg = NULL;
 483         struct sg_table table;
 484         gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
 485         int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
 486         int i = 0;
 487
 488 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 489         gfp_t gfp_zero_page = gfp | __GFP_ZERO;
 490         while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
 491                 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 492                 schedule_timeout_interruptible(1);
 493         }
 494         abd_mark_zfs_page(abd_zero_page);
 495 #else
 496         abd_zero_page = ZERO_PAGE(0);
 497 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 498
 499         while (sg_alloc_table(&table, nr_pages, gfp)) {
 500                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 501                 schedule_timeout_interruptible(1);
 502         }
 503         ASSERT3U(table.nents, ==, nr_pages);
 504
 505         abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
 506         abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
 507         ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
 508         ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
 509         ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
 510         abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 511         abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 512
 513         abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
 514                 sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
 515         }
 516
 517         ABDSTAT_BUMP(abdstat_scatter_cnt);
 518         ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
 519         ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 520 }
 521
 522 boolean_t
 523 abd_size_alloc_linear(size_t size)
 524 {
 525         return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
 526 }
 527
 528 void
 529 abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
 530 {
 531         ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 532         int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
 533         if (op == ABDSTAT_INCR) {
 534                 ABDSTAT_BUMP(abdstat_scatter_cnt);
 535                 ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
 536                 ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
 537                 arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
 538         } else {
 539                 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 540                 ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
 541                 ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
 542                 arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
 543         }
 544 }
 545
 546 void
 547 abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
 548 {
 549         ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 550         if (op == ABDSTAT_INCR) {
 551                 ABDSTAT_BUMP(abdstat_linear_cnt);
 552                 ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
 553         } else {
 554                 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
 555                 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
 556         }
 557 }
 558
 559 void
 560 abd_verify_scatter(abd_t *abd)
 561 {
 562         ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
 563         ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
 564             ABD_SCATTER(abd).abd_sgl->length);
 565
 566 #ifdef ZFS_DEBUG
 567         struct scatterlist *sg = NULL;
 568         size_t n = ABD_SCATTER(abd).abd_nents;
 569         int i = 0;
 570
 571         abd_for_each_sg(abd, sg, n, i) {
 572                 ASSERT3P(sg_page(sg), !=, NULL);
 573         }
 574 #endif
 575 }
 576
 577 static void
 578 abd_free_zero_scatter(void)
 579 {
 580         ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 581         ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
 582         ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 583
 584         abd_free_sg_table(abd_zero_scatter);
 585         abd_free_struct(abd_zero_scatter);
 586         abd_zero_scatter = NULL;
 587         ASSERT3P(abd_zero_page, !=, NULL);
 588 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 589         abd_unmark_zfs_page(abd_zero_page);
 590         __free_page(abd_zero_page);
 591 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 592 }
 593
 594 static int
 595 abd_kstats_update(kstat_t *ksp, int rw)
 596 {
 597         abd_stats_t *as = ksp->ks_data;
 598
 599         if (rw == KSTAT_WRITE)
 600                 return (EACCES);
 601         as->abdstat_struct_size.value.ui64 =
 602             wmsum_value(&abd_sums.abdstat_struct_size);
 603         as->abdstat_linear_cnt.value.ui64 =
 604             wmsum_value(&abd_sums.abdstat_linear_cnt);
 605         as->abdstat_linear_data_size.value.ui64 =
 606             wmsum_value(&abd_sums.abdstat_linear_data_size);
 607         as->abdstat_scatter_cnt.value.ui64 =
 608             wmsum_value(&abd_sums.abdstat_scatter_cnt);
 609         as->abdstat_scatter_data_size.value.ui64 =
 610             wmsum_value(&abd_sums.abdstat_scatter_data_size);
 611         as->abdstat_scatter_chunk_waste.value.ui64 =
 612             wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
 613         for (int i = 0; i < ABD_MAX_ORDER; i++) {
 614                 as->abdstat_scatter_orders[i].value.ui64 =
 615                     wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
 616         }
 617         as->abdstat_scatter_page_multi_chunk.value.ui64 =
 618             wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
 619         as->abdstat_scatter_page_multi_zone.value.ui64 =
 620             wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
 621         as->abdstat_scatter_page_alloc_retry.value.ui64 =
 622             wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
 623         as->abdstat_scatter_sg_table_retry.value.ui64 =
 624             wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
 625         return (0);
 626 }
 627
 628 void
 629 abd_init(void)
 630 {
 631         int i;
 632
 633         abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
 634             0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);
 635
 636         wmsum_init(&abd_sums.abdstat_struct_size, 0);
 637         wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
 638         wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
 639         wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
 640         wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
 641         wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
 642         for (i = 0; i < ABD_MAX_ORDER; i++)
 643                 wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
 644         wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
 645         wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
 646         wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
 647         wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);
 648
 649         abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
 650             sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 651         if (abd_ksp != NULL) {
 652                 for (i = 0; i < ABD_MAX_ORDER; i++) {
 653                         snprintf(abd_stats.abdstat_scatter_orders[i].name,
 654                             KSTAT_STRLEN, "scatter_order_%d", i);
 655                         abd_stats.abdstat_scatter_orders[i].data_type =
 656                             KSTAT_DATA_UINT64;
 657                 }
 658                 abd_ksp->ks_data = &abd_stats;
 659                 abd_ksp->ks_update = abd_kstats_update;
 660                 kstat_install(abd_ksp);
 661         }
 662
 663         abd_alloc_zero_scatter();
 664 }
 665
 666 void
 667 abd_fini(void)
 668 {
 669         abd_free_zero_scatter();
 670
 671         if (abd_ksp != NULL) {
 672                 kstat_delete(abd_ksp);
 673                 abd_ksp = NULL;
 674         }
 675
 676         wmsum_fini(&abd_sums.abdstat_struct_size);
 677         wmsum_fini(&abd_sums.abdstat_linear_cnt);
 678         wmsum_fini(&abd_sums.abdstat_linear_data_size);
 679         wmsum_fini(&abd_sums.abdstat_scatter_cnt);
 680         wmsum_fini(&abd_sums.abdstat_scatter_data_size);
 681         wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
 682         for (int i = 0; i < ABD_MAX_ORDER; i++)
 683                 wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
 684         wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
 685         wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
 686         wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
 687         wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);
 688
 689         if (abd_cache) {
 690                 kmem_cache_destroy(abd_cache);
 691                 abd_cache = NULL;
 692         }
 693 }
 694
 695 void
 696 abd_free_linear_page(abd_t *abd)
 697 {
 698         /* Transform it back into a scatter ABD for freeing */
 699         struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
 700
 701         /* When backed by user page unmap it */
 702         if (abd_is_from_pages(abd))
 703                 zfs_kunmap(sg_page(sg));
 704         else
 705                 abd_update_scatter_stats(abd, ABDSTAT_DECR);
 706
 707         abd->abd_flags &= ~ABD_FLAG_LINEAR;
 708         abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
 709         ABD_SCATTER(abd).abd_nents = 1;
 710         ABD_SCATTER(abd).abd_offset = 0;
 711         ABD_SCATTER(abd).abd_sgl = sg;
 712         abd_free_chunks(abd);
 713 }
 714
 715 /*
 716  * Allocate a scatter ABD structure from user pages. The pages must be
 717  * pinned with get_user_pages, or similiar, but need not be mapped via
 718  * the kmap interfaces.
 719  */
 720 abd_t *
 721 abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
 722 {
 723         uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
 724         struct sg_table table;
 725
 726         VERIFY3U(size, <=, DMU_MAX_ACCESS);
 727         ASSERT3U(offset, <, PAGE_SIZE);
 728         ASSERT3P(pages, !=, NULL);
 729
 730         /*
 731          * Even if this buf is filesystem metadata, we only track that we
 732          * own the underlying data buffer, which is not true in this case.
 733          * Therefore, we don't ever use ABD_FLAG_META here.
 734          */
 735         abd_t *abd = abd_alloc_struct(0);
 736         abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
 737         abd->abd_size = size;
 738
 739         while (sg_alloc_table_from_pages(&table, pages, npages, offset,
 740             size, __GFP_NOWARN | GFP_NOIO) != 0) {
 741                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 742                 schedule_timeout_interruptible(1);
 743         }
 744
 745         if ((offset + size) <= PAGE_SIZE) {
 746                 /*
 747                  * Since there is only one entry, this ABD can be represented
 748                  * as a linear buffer. All single-page (4K) ABD's constructed
 749                  * from a user page can be represented this way as long as the
 750                  * page is mapped to a virtual address. This allows us to
 751                  * apply an offset in to the mapped page.
 752                  *
 753                  * Note that kmap() must be used, not kmap_atomic(), because
 754                  * the mapping needs to bet set up on all CPUs. Using kmap()
 755                  * also enables the user of highmem pages when required.
 756                  */
 757                 abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
 758                 abd->abd_u.abd_linear.abd_sgl = table.sgl;
 759                 zfs_kmap(sg_page(table.sgl));
 760                 ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
 761         } else {
 762                 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 763                 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 764
 765                 ABD_SCATTER(abd).abd_offset = offset;
 766                 ABD_SCATTER(abd).abd_sgl = table.sgl;
 767                 ABD_SCATTER(abd).abd_nents = table.nents;
 768
 769                 ASSERT0(ABD_SCATTER(abd).abd_offset);
 770         }
 771
 772         return (abd);
 773 }
 774
 775 /*
 776  * If we're going to use this ABD for doing I/O using the block layer, the
 777  * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 778  * plan to store this ABD in memory for a long period of time, we should
 779  * allocate the ABD type that requires the least data copying to do the I/O.
 780  *
 781  * On Linux the optimal thing to do would be to use abd_get_offset() and
 782  * construct a new ABD which shares the original pages thereby eliminating
 783  * the copy.  But for the moment a new linear ABD is allocated until this
 784  * performance optimization can be implemented.
 785  */
 786 abd_t *
 787 abd_alloc_for_io(size_t size, boolean_t is_metadata)
 788 {
 789         return (abd_alloc(size, is_metadata));
 790 }
 791
 792 abd_t *
 793 abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
 794     size_t size)
 795 {
 796         (void) size;
 797         int i = 0;
 798         struct scatterlist *sg = NULL;
 799
 800         abd_verify(sabd);
 801         ASSERT3U(off, <=, sabd->abd_size);
 802
 803         size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
 804
 805         if (abd == NULL)
 806                 abd = abd_alloc_struct(0);
 807
 808         /*
 809          * Even if this buf is filesystem metadata, we only track that
 810          * if we own the underlying data buffer, which is not true in
 811          * this case. Therefore, we don't ever use ABD_FLAG_META here.
 812          */
 813
 814         abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
 815                 if (new_offset < sg->length)
 816                         break;
 817                 new_offset -= sg->length;
 818         }
 819
 820         ABD_SCATTER(abd).abd_sgl = sg;
 821         ABD_SCATTER(abd).abd_offset = new_offset;
 822         ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
 823
 824         if (abd_is_from_pages(sabd))
 825                 abd->abd_flags |= ABD_FLAG_FROM_PAGES;
 826
 827         return (abd);
 828 }
 829
 830 /*
 831  * Initialize the abd_iter.
 832  */
 833 void
 834 abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 835 {
 836         ASSERT(!abd_is_gang(abd));
 837         abd_verify(abd);
 838         memset(aiter, 0, sizeof (struct abd_iter));
 839         aiter->iter_abd = abd;
 840         if (!abd_is_linear(abd)) {
 841                 aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
 842                 aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
 843         }
 844 }
 845
 846 /*
 847  * This is just a helper function to see if we have exhausted the
 848  * abd_iter and reached the end.
 849  */
 850 boolean_t
 851 abd_iter_at_end(struct abd_iter *aiter)
 852 {
 853         ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 854         return (aiter->iter_pos == aiter->iter_abd->abd_size);
 855 }
 856
 857 /*
 858  * Advance the iterator by a certain amount. Cannot be called when a chunk is
 859  * in use. This can be safely called when the aiter has already exhausted, in
 860  * which case this does nothing.
 861  */
 862 void
 863 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 864 {
 865         /*
 866          * Ensure that last chunk is not in use. abd_iterate_*() must clear
 867          * this state (directly or abd_iter_unmap()) before advancing.
 868          */
 869         ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 870         ASSERT0(aiter->iter_mapsize);
 871         ASSERT3P(aiter->iter_page, ==, NULL);
 872         ASSERT0(aiter->iter_page_doff);
 873         ASSERT0(aiter->iter_page_dsize);
 874
 875         /* There's nothing left to advance to, so do nothing */
 876         if (abd_iter_at_end(aiter))
 877                 return;
 878
 879         aiter->iter_pos += amount;
 880         aiter->iter_offset += amount;
 881         if (!abd_is_linear(aiter->iter_abd)) {
 882                 while (aiter->iter_offset >= aiter->iter_sg->length) {
 883                         aiter->iter_offset -= aiter->iter_sg->length;
 884                         aiter->iter_sg = sg_next(aiter->iter_sg);
 885                         if (aiter->iter_sg == NULL) {
 886                                 ASSERT0(aiter->iter_offset);
 887                                 break;
 888                         }
 889                 }
 890         }
 891 }
 892
 893 /*
 894  * Map the current chunk into aiter. This can be safely called when the aiter
 895  * has already exhausted, in which case this does nothing.
 896  */
 897 void
 898 abd_iter_map(struct abd_iter *aiter)
 899 {
 900         void *paddr;
 901         size_t offset = 0;
 902
 903         ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 904         ASSERT0(aiter->iter_mapsize);
 905
 906         /* There's nothing left to iterate over, so do nothing */
 907         if (abd_iter_at_end(aiter))
 908                 return;
 909
 910         if (abd_is_linear(aiter->iter_abd)) {
 911                 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
 912                 offset = aiter->iter_offset;
 913                 aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
 914                 paddr = ABD_LINEAR_BUF(aiter->iter_abd);
 915         } else {
 916                 offset = aiter->iter_offset;
 917                 aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
 918                     aiter->iter_abd->abd_size - aiter->iter_pos);
 919
 920                 paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
 921         }
 922
 923         aiter->iter_mapaddr = (char *)paddr + offset;
 924 }
 925
 926 /*
 927  * Unmap the current chunk from aiter. This can be safely called when the aiter
 928  * has already exhausted, in which case this does nothing.
 929  */
 930 void
 931 abd_iter_unmap(struct abd_iter *aiter)
 932 {
 933         /* There's nothing left to unmap, so do nothing */
 934         if (abd_iter_at_end(aiter))
 935                 return;
 936
 937         if (!abd_is_linear(aiter->iter_abd)) {
 938                 /* LINTED E_FUNC_SET_NOT_USED */
 939                 zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
 940         }
 941
 942         ASSERT3P(aiter->iter_mapaddr, !=, NULL);
 943         ASSERT3U(aiter->iter_mapsize, >, 0);
 944
 945         aiter->iter_mapaddr = NULL;
 946         aiter->iter_mapsize = 0;
 947 }
 948
/*
 * Reap hook for the ABD cache. The body is intentionally empty here, so
 * calling it performs no work.
 */
void
abd_cache_reap_now(void)
{
}
 953
 954 /*
 955  * Borrow a raw buffer from an ABD without copying the contents of the ABD
 956  * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 957  * whose contents are undefined. To copy over the existing data in the ABD, use
 958  * abd_borrow_buf_copy() instead.
 959  */
 960 void *
 961 abd_borrow_buf(abd_t *abd, size_t n)
 962 {
 963         void *buf;
 964         abd_verify(abd);
 965         ASSERT3U(abd->abd_size, >=, 0);
 966         /*
 967          * In the event the ABD is composed of a single user page from Direct
 968          * I/O we can not direclty return the raw buffer. This is a consequence
 969          * of not being able to write protect the page and the contents of the
 970          * page can be changed at any time by the user.
 971          */
 972         if (abd_is_from_pages(abd)) {
 973                 buf = zio_buf_alloc(n);
 974         } else if (abd_is_linear(abd)) {
 975                 buf = abd_to_buf(abd);
 976         } else {
 977                 buf = zio_buf_alloc(n);
 978         }
 979
 980 #ifdef ZFS_DEBUG
 981         (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
 982 #endif
 983         return (buf);
 984 }
 985
 986 void *
 987 abd_borrow_buf_copy(abd_t *abd, size_t n)
 988 {
 989         void *buf = abd_borrow_buf(abd, n);
 990
 991         /*
 992          * In the event the ABD is composed of a single user page from Direct
 993          * I/O we must make sure copy the data over into the newly allocated
 994          * buffer. This is a consequence of the fact that we can not write
 995          * protect the user page and there is a risk the contents of the page
 996          * could be changed by the user at any moment.
 997          */
 998         if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
 999                 abd_copy_to_buf(buf, abd, n);
1000         }
1001         return (buf);
1002 }
1003
1004 /*
1005  * Return a borrowed raw buffer to an ABD. If the ABD is scatterd, this will
1006  * not change the contents of the ABD. If you want any changes you made to
1007  * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
1008  * ABD is not constructed from user pages for Direct I/O then an ASSERT
1009  * checks to make sure the contents of buffer have not changed since it was
1010  * borrowed. We can not ASSERT that the contents of the buffer have not changed
1011  * if it is composed of user pages because the pages can not be placed under
1012  * write protection and the user could have possibly changed the contents in
1013  * the pages at any time. This is also an issue for Direct I/O reads. Checksum
1014  * verifications in the ZIO pipeline check for this issue and handle it by
1015  * returning an error on checksum verification failure.
1016  */
1017 void
1018 abd_return_buf(abd_t *abd, void *buf, size_t n)
1019 {
1020         abd_verify(abd);
1021         ASSERT3U(abd->abd_size, >=, n);
1022 #ifdef ZFS_DEBUG
1023         (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
1024 #endif
1025         if (abd_is_from_pages(abd)) {
1026                 zio_buf_free(buf, n);
1027         } else if (abd_is_linear(abd)) {
1028                 ASSERT3P(buf, ==, abd_to_buf(abd));
1029         } else if (abd_is_gang(abd)) {
1030 #ifdef ZFS_DEBUG
1031                 /*
1032                  * We have to be careful with gang ABD's that we do not ASSERT0
1033                  * for any ABD's that contain user pages from Direct I/O. In
1034                  * order to handle this, we just iterate through the gang ABD
1035                  * and only verify ABDs that are not from user pages.
1036                  */
1037                 void *cmp_buf = buf;
1038
1039                 for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
1040                     cabd != NULL;
1041                     cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1042                         if (!abd_is_from_pages(cabd)) {
1043                                 ASSERT0(abd_cmp_buf(cabd, cmp_buf,
1044                                     cabd->abd_size));
1045                         }
1046                         cmp_buf = (char *)cmp_buf + cabd->abd_size;
1047                 }
1048 #endif
1049                 zio_buf_free(buf, n);
1050         } else {
1051                 ASSERT0(abd_cmp_buf(abd, buf, n));
1052                 zio_buf_free(buf, n);
1053         }
1054 }
1055
1056 void
1057 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
1058 {
1059         if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
1060                 abd_copy_from_buf(abd, buf, n);
1061         }
1062         abd_return_buf(abd, buf, n);
1063 }
1064
1065 /*
1066  * This is abd_iter_page(), the function underneath abd_iterate_page_func().
1067  * It yields the next page struct and data offset and size within it, without
1068  * mapping it into the address space.
1069  */
1070
1071 /*
 * "Compound pages" are a group of pages that can be referenced from a single
 * struct page *. It's organised as a "head" page, followed by a series of
 * "tail" pages.
1075  *
1076  * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
1077  * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
1078  * great many of the IO buffers we get are going to be of this type.
1079  *
1080  * The tail pages are just regular PAGESIZE pages, and can be safely used
1081  * as-is. However, the head page has length covering itself and all the tail
1082  * pages. If the ABD chunk spans multiple pages, then we can use the head page
1083  * and a >PAGESIZE length, which is far more efficient.
1084  *
 * Before kernel 4.5 however, compound page heads were refcounted separately
 * from tail pages, such that moving back to the head page would require us to
 * take a reference to it and release it once we're completely finished with
 * it. In practice, that meant when our caller is done with the ABD, which we
 * have no insight into from here. Rather than contort this API to track head
 * page references on such ancient kernels, we disabled this special compound
 * page handling on kernels before 4.5, instead just treating each page
 * within it as a regular PAGESIZE page (which it is). This is slightly less
 * efficient, but makes everything far simpler.
1094  *
1095  * We no longer support kernels before 4.5, so in theory none of this is
1096  * necessary. However, this code is still relatively new in the grand scheme of
1097  * things, so I'm leaving the ability to compile this out for the moment.
1098  *
1099  * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special
1100  * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand
1101  * compound pages, or not, and compiling in/out the support to detect compound
1102  * tail pages and move back to the start.
1103  */
1104
/* On by default */
#define ABD_ITER_COMPOUND_PAGES

/*
 * ABD_ITER_PAGE_SIZE(page) gives the number of bytes abd_iter_page() may
 * treat as belonging to 'page'. With ABD_ITER_COMPOUND_PAGES defined it is
 * page_size(page) when PageCompound(page) is true and PAGESIZE otherwise;
 * without it, it is always PAGESIZE.
 */
#ifdef ABD_ITER_COMPOUND_PAGES
#define ABD_ITER_PAGE_SIZE(page)        \
        (PageCompound(page) ? page_size(page) : PAGESIZE)
#else
#define ABD_ITER_PAGE_SIZE(page)        (PAGESIZE)
#endif
1114
1115 void
1116 abd_iter_page(struct abd_iter *aiter)
1117 {
1118         if (abd_iter_at_end(aiter)) {
1119                 aiter->iter_page = NULL;
1120                 aiter->iter_page_doff = 0;
1121                 aiter->iter_page_dsize = 0;
1122                 return;
1123         }
1124
1125         struct page *page;
1126         size_t doff, dsize;
1127
1128         /*
1129          * Find the page, and the start of the data within it. This is computed
1130          * differently for linear and scatter ABDs; linear is referenced by
1131          * virtual memory location, while scatter is referenced by page
1132          * pointer.
1133          */
1134         if (abd_is_linear(aiter->iter_abd)) {
1135                 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1136
1137                 /* memory address at iter_pos */
1138                 void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
1139
1140                 /* struct page for address */
1141                 page = is_vmalloc_addr(paddr) ?
1142                     vmalloc_to_page(paddr) : virt_to_page(paddr);
1143
1144                 /* offset of address within the page */
1145                 doff = offset_in_page(paddr);
1146         } else {
1147                 ASSERT(!abd_is_gang(aiter->iter_abd));
1148
1149                 /* current scatter page */
1150                 page = nth_page(sg_page(aiter->iter_sg),
1151                     aiter->iter_offset >> PAGE_SHIFT);
1152
1153                 /* position within page */
1154                 doff = aiter->iter_offset & (PAGESIZE - 1);
1155         }
1156
1157 #ifdef ABD_ITER_COMPOUND_PAGES
1158         if (PageTail(page)) {
1159                 /*
1160                  * If this is a compound tail page, move back to the head, and
1161                  * adjust the offset to match. This may let us yield a much
1162                  * larger amount of data from a single logical page, and so
1163                  * leave our caller with fewer pages to process.
1164                  */
1165                 struct page *head = compound_head(page);
1166                 doff += ((page - head) * PAGESIZE);
1167                 page = head;
1168         }
1169 #endif
1170
1171         ASSERT(page);
1172
1173         /*
1174          * Compute the maximum amount of data we can take from this page. This
1175          * is the smaller of:
1176          * - the remaining space in the page
1177          * - the remaining space in this scatterlist entry (which may not cover
1178          *   the entire page)
1179          * - the remaining space in the abd (which may not cover the entire
1180          *   scatterlist entry)
1181          */
1182         dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
1183             aiter->iter_abd->abd_size - aiter->iter_pos);
1184         if (!abd_is_linear(aiter->iter_abd))
1185                 dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
1186         ASSERT3U(dsize, >, 0);
1187
1188         /* final iterator outputs */
1189         aiter->iter_page = page;
1190         aiter->iter_page_doff = doff;
1191         aiter->iter_page_dsize = dsize;
1192 }
1193
1194 /*
1195  * Note: ABD BIO functions only needed to support vdev_classic. See comments in
1196  * vdev_disk.c.
1197  */
1198
1199 /*
1200  * bio_nr_pages for ABD.
1201  * @off is the offset in @abd
1202  */
1203 unsigned long
1204 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1205 {
1206         unsigned long pos;
1207
1208         if (abd_is_gang(abd)) {
1209                 unsigned long count = 0;
1210
1211                 for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1212                     cabd != NULL && size != 0;
1213                     cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1214                         ASSERT3U(off, <, cabd->abd_size);
1215                         int mysize = MIN(size, cabd->abd_size - off);
1216                         count += abd_nr_pages_off(cabd, mysize, off);
1217                         size -= mysize;
1218                         off = 0;
1219                 }
1220                 return (count);
1221         }
1222
1223         if (abd_is_linear(abd))
1224                 pos = (unsigned long)abd_to_buf(abd) + off;
1225         else
1226                 pos = ABD_SCATTER(abd).abd_offset + off;
1227
1228         return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1229             (pos >> PAGE_SHIFT));
1230 }
1231
1232 static unsigned int
1233 bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
1234 {
1235         unsigned int offset, size, i;
1236         struct page *page;
1237
1238         offset = offset_in_page(buf_ptr);
1239         for (i = 0; i < bio->bi_max_vecs; i++) {
1240                 size = PAGE_SIZE - offset;
1241
1242                 if (bio_size <= 0)
1243                         break;
1244
1245                 if (size > bio_size)
1246                         size = bio_size;
1247
1248                 if (is_vmalloc_addr(buf_ptr))
1249                         page = vmalloc_to_page(buf_ptr);
1250                 else
1251                         page = virt_to_page(buf_ptr);
1252
1253                 /*
1254                  * Some network related block device uses tcp_sendpage, which
1255                  * doesn't behave well when using 0-count page, this is a
1256                  * safety net to catch them.
1257                  */
1258                 ASSERT3S(page_count(page), >, 0);
1259
1260                 if (bio_add_page(bio, page, size, offset) != size)
1261                         break;
1262
1263                 buf_ptr += size;
1264                 bio_size -= size;
1265                 offset = 0;
1266         }
1267
1268         return (bio_size);
1269 }
1270
1271 /*
1272  * bio_map for gang ABD.
1273  */
1274 static unsigned int
1275 abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
1276     unsigned int io_size, size_t off)
1277 {
1278         ASSERT(abd_is_gang(abd));
1279
1280         for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1281             cabd != NULL;
1282             cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1283                 ASSERT3U(off, <, cabd->abd_size);
1284                 int size = MIN(io_size, cabd->abd_size - off);
1285                 int remainder = abd_bio_map_off(bio, cabd, size, off);
1286                 io_size -= (size - remainder);
1287                 if (io_size == 0 || remainder > 0)
1288                         return (io_size);
1289                 off = 0;
1290         }
1291         ASSERT0(io_size);
1292         return (io_size);
1293 }
1294
1295 /*
1296  * bio_map for ABD.
1297  * @off is the offset in @abd
1298  * Remaining IO size is returned
1299  */
1300 unsigned int
1301 abd_bio_map_off(struct bio *bio, abd_t *abd,
1302     unsigned int io_size, size_t off)
1303 {
1304         struct abd_iter aiter;
1305
1306         ASSERT3U(io_size, <=, abd->abd_size - off);
1307         if (abd_is_linear(abd))
1308                 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
1309
1310         ASSERT(!abd_is_linear(abd));
1311         if (abd_is_gang(abd))
1312                 return (abd_gang_bio_map_off(bio, abd, io_size, off));
1313
1314         abd_iter_init(&aiter, abd);
1315         abd_iter_advance(&aiter, off);
1316
1317         for (int i = 0; i < bio->bi_max_vecs; i++) {
1318                 struct page *pg;
1319                 size_t len, sgoff, pgoff;
1320                 struct scatterlist *sg;
1321
1322                 if (io_size <= 0)
1323                         break;
1324
1325                 sg = aiter.iter_sg;
1326                 sgoff = aiter.iter_offset;
1327                 pgoff = sgoff & (PAGESIZE - 1);
1328                 len = MIN(io_size, PAGESIZE - pgoff);
1329                 ASSERT(len > 0);
1330
1331                 pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1332                 if (bio_add_page(bio, pg, len, pgoff) != len)
1333                         break;
1334
1335                 io_size -= len;
1336                 abd_iter_advance(&aiter, len);
1337         }
1338
1339         return (io_size);
1340 }
1341
/*
 * Tunable Parameters
 *
 * Each tunable below is registered with module_param() using mode 0644
 * and given a description string with MODULE_PARM_DESC().
 */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
        "Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
        "Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
        "Maximum order allocation used for a scatter ABD.");