During pool export flush the ARC asynchronously
[zfs.git] / include / sys / arc_impl.h
blob2cf6aa3b1825c169ccf1faf36cbc6cd4a8456df5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013, Delphix. All rights reserved.
24 * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2020, George Amanakis. All rights reserved.
29 #ifndef _SYS_ARC_IMPL_H
30 #define _SYS_ARC_IMPL_H
32 #include <sys/arc.h>
33 #include <sys/multilist.h>
34 #include <sys/zio_crypt.h>
35 #include <sys/zthr.h>
36 #include <sys/aggsum.h>
37 #include <sys/wmsum.h>
39 #ifdef __cplusplus
40 extern "C" {
41 #endif
/*
 * Note that buffers can be in one of 7 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_uncached	- uncacheable prefetch, to be evicted
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to a buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
76 typedef struct arc_state {
78 * list of evictable buffers
80 multilist_t arcs_list[ARC_BUFC_NUMTYPES];
82 * supports the "dbufs" kstat
84 arc_state_type_t arcs_state;
86 * total amount of data in this state.
88 zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned;
90 * total amount of evictable data in this state
92 zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
94 * amount of hit bytes for this state (counted only for ghost states)
96 wmsum_t arcs_hits[ARC_BUFC_NUMTYPES];
97 } arc_state_t;
99 typedef struct arc_callback arc_callback_t;
101 struct arc_callback {
102 void *acb_private;
103 arc_read_done_func_t *acb_done;
104 arc_buf_t *acb_buf;
105 boolean_t acb_encrypted;
106 boolean_t acb_compressed;
107 boolean_t acb_noauth;
108 boolean_t acb_nobuf;
109 boolean_t acb_wait;
110 int acb_wait_error;
111 kmutex_t acb_wait_lock;
112 kcondvar_t acb_wait_cv;
113 zbookmark_phys_t acb_zb;
114 zio_t *acb_zio_dummy;
115 zio_t *acb_zio_head;
116 arc_callback_t *acb_prev;
117 arc_callback_t *acb_next;
120 typedef struct arc_write_callback arc_write_callback_t;
122 struct arc_write_callback {
123 void *awcb_private;
124 arc_write_done_func_t *awcb_ready;
125 arc_write_done_func_t *awcb_children_ready;
126 arc_write_done_func_t *awcb_done;
127 arc_buf_t *awcb_buf;
/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple of words in pointers.  arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
161 typedef struct l1arc_buf_hdr {
162 /* protected by arc state mutex */
163 arc_state_t *b_state;
164 multilist_node_t b_arc_node;
166 /* protected by hash lock */
167 clock_t b_arc_access;
168 uint32_t b_mru_hits;
169 uint32_t b_mru_ghost_hits;
170 uint32_t b_mfu_hits;
171 uint32_t b_mfu_ghost_hits;
172 uint8_t b_byteswap;
173 arc_buf_t *b_buf;
175 /* self protecting */
176 zfs_refcount_t b_refcnt;
178 arc_callback_t *b_acb;
179 abd_t *b_pabd;
181 #ifdef ZFS_DEBUG
182 zio_cksum_t *b_freeze_cksum;
183 kmutex_t b_freeze_lock;
184 #endif
185 } l1arc_buf_hdr_t;
/*
 * Flag bits stored in the dh_flags member of l2arc_dev_hdr_phys_t.
 */
typedef enum l2arc_dev_hdr_flags_t {
	/* mirror of l2ad_first */
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)
} l2arc_dev_hdr_flags_t;
192 * Pointer used in persistent L2ARC (for pointing to log blocks).
194 typedef struct l2arc_log_blkptr {
196 * Offset of log block within the device, in bytes
198 uint64_t lbp_daddr;
200 * Aligned payload size (in bytes) of the log block
202 uint64_t lbp_payload_asize;
204 * Offset in bytes of the first buffer in the payload
206 uint64_t lbp_payload_start;
208 * lbp_prop has the following format:
209 * * logical size (in bytes)
210 * * aligned (after compression) size (in bytes)
211 * * compression algorithm (we always LZ4-compress l2arc logs)
212 * * checksum algorithm (used for lbp_cksum)
214 uint64_t lbp_prop;
215 zio_cksum_t lbp_cksum; /* checksum of log */
216 } l2arc_log_blkptr_t;
219 * The persistent L2ARC device header.
220 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
222 typedef struct l2arc_dev_hdr_phys {
223 uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
224 uint64_t dh_version; /* Persistent L2ARC version */
227 * Global L2ARC device state and metadata.
229 uint64_t dh_spa_guid;
230 uint64_t dh_vdev_guid;
231 uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
232 uint64_t dh_evict; /* evicted offset in bytes */
233 uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
235 * Used in zdb.c for determining if a log block is valid, in the same
236 * way that l2arc_rebuild() does.
238 uint64_t dh_start; /* mirror of l2ad_start */
239 uint64_t dh_end; /* mirror of l2ad_end */
241 * Start of log block chain. [0] -> newest log, [1] -> one older (used
242 * for initiating prefetch).
244 l2arc_log_blkptr_t dh_start_lbps[2];
246 * Aligned size of all log blocks as accounted by vdev_space_update().
248 uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
249 uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
251 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
252 * display when the cache device was fully trimmed for the last
253 * time.
255 uint64_t dh_trim_action_time;
256 uint64_t dh_trim_state;
257 const uint64_t dh_pad[30]; /* pad to 512 bytes */
258 zio_eck_t dh_tail;
259 } l2arc_dev_hdr_phys_t;
260 _Static_assert(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE,
261 "l2arc_dev_hdr_phys_t wrong size");
264 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
266 typedef struct l2arc_log_ent_phys {
267 dva_t le_dva; /* dva of buffer */
268 uint64_t le_birth; /* birth txg of buffer */
270 * le_prop has the following format:
271 * * logical size (in bytes)
272 * * physical (compressed) size (in bytes)
273 * * compression algorithm
274 * * object type (used to restore arc_buf_contents_t)
275 * * protected status (used for encryption)
276 * * prefetch status (used in l2arc_read_done())
278 uint64_t le_prop;
279 uint64_t le_daddr; /* buf location on l2dev */
280 uint64_t le_complevel;
282 * We pad the size of each entry to a power of 2 so that the size of
283 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
284 * because of the L2ARC_SET_*SIZE macros.
286 const uint64_t le_pad[2]; /* pad to 64 bytes */
287 } l2arc_log_ent_phys_t;
289 #define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
292 * A log block of up to 1022 ARC buffer log entries, chained into the
293 * persistent L2ARC metadata linked list. Byte order of magic determines
294 * whether 64-bit bswap of fields is necessary.
296 typedef struct l2arc_log_blk_phys {
297 uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
299 * There are 2 chains (headed by dh_start_lbps[2]), and this field
300 * points back to the previous block in this chain. We alternate
301 * which chain we append to, so they are time-wise and offset-wise
302 * interleaved, but that is an optimization rather than for
303 * correctness.
305 l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
307 * Pad header section to 128 bytes
309 uint64_t lb_pad[7];
310 /* Payload */
311 l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
312 } l2arc_log_blk_phys_t; /* 64K total */
315 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
316 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
318 _Static_assert(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
319 1ULL << SPA_MINBLOCKSHIFT), "l2arc_log_blk_phys_t misaligned");
320 _Static_assert(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE,
321 "l2arc_log_blk_phys_t too small");
322 _Static_assert(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE,
323 "l2arc_log_blk_phys_t too big");
326 * These structures hold in-flight abd buffers for log blocks as they're being
327 * written to the L2ARC device.
329 typedef struct l2arc_lb_abd_buf {
330 abd_t *abd;
331 list_node_t node;
332 } l2arc_lb_abd_buf_t;
335 * These structures hold pointers to log blocks present on the L2ARC device.
337 typedef struct l2arc_lb_ptr_buf {
338 l2arc_log_blkptr_t *lb_ptr;
339 list_node_t node;
340 } l2arc_lb_ptr_buf_t;
/*
 * Macros for getting and setting fields in le_prop and lbp_prop.  Each
 * GET/SET pair addresses one bit range of the 64-bit property word.
 */

/* Logical size: starts at bit 0, SPA_LSIZEBITS wide. */
#define	L2BLK_GET_LSIZE(field)	\
	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_LSIZE(field, x)	\
	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)

/* Physical size: starts at bit 16, SPA_PSIZEBITS wide. */
#define	L2BLK_GET_PSIZE(field)	\
	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_PSIZE(field, x)	\
	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)

/* Compression algorithm: starts at bit 32, SPA_COMPRESSBITS wide. */
#define	L2BLK_GET_COMPRESS(field)	\
	BF64_GET((field), 32, SPA_COMPRESSBITS)
#define	L2BLK_SET_COMPRESS(field, x)	\
	BF64_SET((field), 32, SPA_COMPRESSBITS, x)

/* Prefetch flag: bit 39. */
#define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
#define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)

/* Checksum algorithm: 8 bits starting at bit 40. */
#define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
#define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)

/*
 * Buffer type: 8 bits starting at bit 48.
 * +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal.
 */
#define	L2BLK_GET_TYPE(field)		(BF64_GET((field), 48, 8) - 1)
#define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, (x) + 1)

/* Protected (encryption) flag: bit 56. */
#define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
#define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)

/* State: 4 bits starting at bit 57. */
#define	L2BLK_GET_STATE(field)		BF64_GET((field), 57, 4)
#define	L2BLK_SET_STATE(field, x)	BF64_SET((field), 57, 4, x)
/*
 * Exchange the values of two pointer lvalues.
 *
 * Both arguments are expanded twice, so they must not have side effects.
 * The arguments are parenthesised at every use, and the temporary has a
 * name that is unlikely to collide with a caller's variable: a plain "tmp"
 * would shadow an argument that is itself called tmp and silently leave
 * both operands unswapped.
 */
#define	PTR_SWAP(x, y)				\
	do {					\
		void *ptr_swap_tmp_ = (x);	\
		(x) = (y);			\
		(y) = ptr_swap_tmp_;		\
	} while (0)
/*
 * Magic numbers of the persistent L2ARC structures.  Each one is the
 * big-endian packing of an eight character ASCII tag.
 */
#define	L2ARC_DEV_HDR_MAGIC	0x5a46534341434845ULL	/* ASCII: "ZFSCACHE" */
#define	L2ARC_LOG_BLK_MAGIC	0x4c4f47424c4b4844ULL	/* ASCII: "LOGBLKHD" */
378 * L2ARC Internals
380 typedef struct l2arc_dev {
381 vdev_t *l2ad_vdev; /* can be NULL during remove */
382 spa_t *l2ad_spa; /* can be NULL during remove */
383 uint64_t l2ad_hand; /* next write location */
384 uint64_t l2ad_start; /* first addr on device */
385 uint64_t l2ad_end; /* last addr on device */
386 boolean_t l2ad_first; /* first sweep through */
387 boolean_t l2ad_writing; /* currently writing */
388 kmutex_t l2ad_mtx; /* lock for buffer list */
389 list_t l2ad_buflist; /* buffer list */
390 list_node_t l2ad_node; /* device list node */
391 zfs_refcount_t l2ad_alloc; /* allocated bytes */
393 * Persistence-related stuff
395 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
396 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
397 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
398 int l2ad_log_ent_idx; /* index into cur log blk */
399 /* Number of bytes in current log block's payload */
400 uint64_t l2ad_log_blk_payload_asize;
402 * Offset (in bytes) of the first buffer in current log block's
403 * payload.
405 uint64_t l2ad_log_blk_payload_start;
406 /* Flag indicating whether a rebuild is scheduled or is going on */
407 boolean_t l2ad_rebuild;
408 boolean_t l2ad_rebuild_cancel;
409 boolean_t l2ad_rebuild_began;
410 uint64_t l2ad_log_entries; /* entries per log blk */
411 uint64_t l2ad_evict; /* evicted offset in bytes */
412 /* List of pointers to log blocks present in the L2ARC device */
413 list_t l2ad_lbptr_list;
415 * Aligned size of all log blocks as accounted by vdev_space_update().
417 zfs_refcount_t l2ad_lb_asize;
419 * Number of log blocks present on the device.
421 zfs_refcount_t l2ad_lb_count;
422 boolean_t l2ad_trim_all; /* TRIM whole device */
423 } l2arc_dev_t;
426 * Encrypted blocks will need to be stored encrypted on the L2ARC
427 * disk as they appear in the main pool. In order for this to work we
428 * need to pass around the encryption parameters so they can be used
429 * to write data to the L2ARC. This struct is only defined in the
430 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
431 * flag set.
433 typedef struct arc_buf_hdr_crypt {
434 abd_t *b_rabd; /* raw encrypted data */
436 /* dsobj for looking up encryption key for l2arc encryption */
437 uint64_t b_dsobj;
439 dmu_object_type_t b_ot; /* object type */
441 /* encryption parameters */
442 uint8_t b_salt[ZIO_DATA_SALT_LEN];
443 uint8_t b_iv[ZIO_DATA_IV_LEN];
446 * Technically this could be removed since we will always be able to
447 * get the mac from the bp when we need it. However, it is inconvenient
448 * for callers of arc code to have to pass a bp in all the time. This
449 * also allows us to assert that L2ARC data is properly encrypted to
450 * match the data in the main storage pool.
452 uint8_t b_mac[ZIO_DATA_MAC_LEN];
453 } arc_buf_hdr_crypt_t;
455 typedef struct l2arc_buf_hdr {
456 /* protected by arc_buf_hdr mutex */
457 l2arc_dev_t *b_dev; /* L2ARC device */
458 uint64_t b_daddr; /* disk address, offset byte */
459 uint32_t b_hits;
460 arc_state_type_t b_arcs_state;
461 list_node_t b_l2node;
462 } l2arc_buf_hdr_t;
464 typedef struct l2arc_write_callback {
465 l2arc_dev_t *l2wcb_dev; /* device info */
466 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
467 /* in-flight list of log blocks */
468 list_t l2wcb_abd_list;
469 } l2arc_write_callback_t;
471 struct arc_buf_hdr {
472 /* protected by hash lock */
473 dva_t b_dva;
474 uint64_t b_birth;
476 arc_buf_contents_t b_type;
477 uint8_t b_complevel;
478 uint8_t b_reserved1; /* used for 4 byte alignment */
479 uint16_t b_l2size; /* alignment or L2-only size */
480 arc_buf_hdr_t *b_hash_next;
481 arc_flags_t b_flags;
484 * This field stores the size of the data buffer after
485 * compression, and is set in the arc's zio completion handlers.
486 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
488 * While the block pointers can store up to 32MB in their psize
489 * field, we can only store up to 32MB minus 512B. This is due
490 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
491 * a field of zeros represents 512B in the bp). We can't use a
492 * bias of 1 since we need to reserve a psize of zero, here, to
493 * represent holes and embedded blocks.
495 * This isn't a problem in practice, since the maximum size of a
496 * buffer is limited to 16MB, so we never need to store 32MB in
497 * this field. Even in the upstream illumos code base, the
498 * maximum size of a buffer is limited to 16MB.
500 uint16_t b_psize;
503 * This field stores the size of the data buffer before
504 * compression, and cannot change once set. It is in units
505 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
507 uint16_t b_lsize; /* immutable */
508 uint64_t b_spa; /* immutable */
510 /* L2ARC fields. Undefined when not in L2ARC. */
511 l2arc_buf_hdr_t b_l2hdr;
512 /* L1ARC fields. Undefined when in l2arc_only state */
513 l1arc_buf_hdr_t b_l1hdr;
515 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
516 * is set and the L1 header exists.
518 arc_buf_hdr_crypt_t b_crypt_hdr;
521 typedef struct arc_stats {
522 /* Number of requests that were satisfied without I/O. */
523 kstat_named_t arcstat_hits;
524 /* Number of requests for which I/O was already running. */
525 kstat_named_t arcstat_iohits;
526 /* Number of requests for which I/O has to be issued. */
527 kstat_named_t arcstat_misses;
528 /* Same three, but specifically for demand data. */
529 kstat_named_t arcstat_demand_data_hits;
530 kstat_named_t arcstat_demand_data_iohits;
531 kstat_named_t arcstat_demand_data_misses;
532 /* Same three, but specifically for demand metadata. */
533 kstat_named_t arcstat_demand_metadata_hits;
534 kstat_named_t arcstat_demand_metadata_iohits;
535 kstat_named_t arcstat_demand_metadata_misses;
536 /* Same three, but specifically for prefetch data. */
537 kstat_named_t arcstat_prefetch_data_hits;
538 kstat_named_t arcstat_prefetch_data_iohits;
539 kstat_named_t arcstat_prefetch_data_misses;
540 /* Same three, but specifically for prefetch metadata. */
541 kstat_named_t arcstat_prefetch_metadata_hits;
542 kstat_named_t arcstat_prefetch_metadata_iohits;
543 kstat_named_t arcstat_prefetch_metadata_misses;
544 kstat_named_t arcstat_mru_hits;
545 kstat_named_t arcstat_mru_ghost_hits;
546 kstat_named_t arcstat_mfu_hits;
547 kstat_named_t arcstat_mfu_ghost_hits;
548 kstat_named_t arcstat_uncached_hits;
549 kstat_named_t arcstat_deleted;
551 * Number of buffers that could not be evicted because the hash lock
552 * was held by another thread. The lock may not necessarily be held
553 * by something using the same buffer, since hash locks are shared
554 * by multiple buffers.
556 kstat_named_t arcstat_mutex_miss;
558 * Number of buffers skipped when updating the access state due to the
559 * header having already been released after acquiring the hash lock.
561 kstat_named_t arcstat_access_skip;
563 * Number of buffers skipped because they have I/O in progress, are
564 * indirect prefetch buffers that have not lived long enough, or are
565 * not from the spa we're trying to evict from.
567 kstat_named_t arcstat_evict_skip;
569 * Number of times arc_evict_state() was unable to evict enough
570 * buffers to reach its target amount.
572 kstat_named_t arcstat_evict_not_enough;
573 kstat_named_t arcstat_evict_l2_cached;
574 kstat_named_t arcstat_evict_l2_eligible;
575 kstat_named_t arcstat_evict_l2_eligible_mfu;
576 kstat_named_t arcstat_evict_l2_eligible_mru;
577 kstat_named_t arcstat_evict_l2_ineligible;
578 kstat_named_t arcstat_evict_l2_skip;
579 kstat_named_t arcstat_hash_elements;
580 kstat_named_t arcstat_hash_elements_max;
581 kstat_named_t arcstat_hash_collisions;
582 kstat_named_t arcstat_hash_chains;
583 kstat_named_t arcstat_hash_chain_max;
584 kstat_named_t arcstat_meta;
585 kstat_named_t arcstat_pd;
586 kstat_named_t arcstat_pm;
587 kstat_named_t arcstat_c;
588 kstat_named_t arcstat_c_min;
589 kstat_named_t arcstat_c_max;
590 kstat_named_t arcstat_size;
592 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
593 * Note that the compressed bytes may match the uncompressed bytes
594 * if the block is either not compressed or compressed arc is disabled.
596 kstat_named_t arcstat_compressed_size;
598 * Uncompressed size of the data stored in b_pabd. If compressed
599 * arc is disabled then this value will be identical to the stat
600 * above.
602 kstat_named_t arcstat_uncompressed_size;
604 * Number of bytes stored in all the arc_buf_t's. This is classified
605 * as "overhead" since this data is typically short-lived and will
606 * be evicted from the arc when it becomes unreferenced unless the
607 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
608 * values have been set (see comment in dbuf.c for more information).
610 kstat_named_t arcstat_overhead_size;
612 * Number of bytes consumed by internal ARC structures necessary
613 * for tracking purposes; these structures are not actually
614 * backed by ARC buffers. This includes arc_buf_hdr_t structures
615 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
616 * caches), and arc_buf_t structures (allocated via arc_buf_t
617 * cache).
619 kstat_named_t arcstat_hdr_size;
621 * Number of bytes consumed by ARC buffers of type equal to
622 * ARC_BUFC_DATA. This is generally consumed by buffers backing
623 * on disk user data (e.g. plain file contents).
625 kstat_named_t arcstat_data_size;
627 * Number of bytes consumed by ARC buffers of type equal to
628 * ARC_BUFC_METADATA. This is generally consumed by buffers
629 * backing on disk data that is used for internal ZFS
630 * structures (e.g. ZAP, dnode, indirect blocks, etc).
632 kstat_named_t arcstat_metadata_size;
634 * Number of bytes consumed by dmu_buf_impl_t objects.
636 kstat_named_t arcstat_dbuf_size;
638 * Number of bytes consumed by dnode_t objects.
640 kstat_named_t arcstat_dnode_size;
642 * Number of bytes consumed by bonus buffers.
644 kstat_named_t arcstat_bonus_size;
645 #if defined(COMPAT_FREEBSD11)
647 * Sum of the previous three counters, provided for compatibility.
649 kstat_named_t arcstat_other_size;
650 #endif
653 * Total number of bytes consumed by ARC buffers residing in the
654 * arc_anon state. This includes *all* buffers in the arc_anon
655 * state; e.g. data, metadata, evictable, and unevictable buffers
656 * are all included in this value.
658 kstat_named_t arcstat_anon_size;
659 kstat_named_t arcstat_anon_data;
660 kstat_named_t arcstat_anon_metadata;
662 * Number of bytes consumed by ARC buffers that meet the
663 * following criteria: backing buffers of type ARC_BUFC_DATA,
664 * residing in the arc_anon state, and are eligible for eviction
665 * (e.g. have no outstanding holds on the buffer).
667 kstat_named_t arcstat_anon_evictable_data;
669 * Number of bytes consumed by ARC buffers that meet the
670 * following criteria: backing buffers of type ARC_BUFC_METADATA,
671 * residing in the arc_anon state, and are eligible for eviction
672 * (e.g. have no outstanding holds on the buffer).
674 kstat_named_t arcstat_anon_evictable_metadata;
676 * Total number of bytes consumed by ARC buffers residing in the
677 * arc_mru state. This includes *all* buffers in the arc_mru
678 * state; e.g. data, metadata, evictable, and unevictable buffers
679 * are all included in this value.
681 kstat_named_t arcstat_mru_size;
682 kstat_named_t arcstat_mru_data;
683 kstat_named_t arcstat_mru_metadata;
685 * Number of bytes consumed by ARC buffers that meet the
686 * following criteria: backing buffers of type ARC_BUFC_DATA,
687 * residing in the arc_mru state, and are eligible for eviction
688 * (e.g. have no outstanding holds on the buffer).
690 kstat_named_t arcstat_mru_evictable_data;
692 * Number of bytes consumed by ARC buffers that meet the
693 * following criteria: backing buffers of type ARC_BUFC_METADATA,
694 * residing in the arc_mru state, and are eligible for eviction
695 * (e.g. have no outstanding holds on the buffer).
697 kstat_named_t arcstat_mru_evictable_metadata;
699 * Total number of bytes that *would have been* consumed by ARC
700 * buffers in the arc_mru_ghost state. The key thing to note
701 * here, is the fact that this size doesn't actually indicate
702 * RAM consumption. The ghost lists only consist of headers and
703 * don't actually have ARC buffers linked off of these headers.
704 * Thus, *if* the headers had associated ARC buffers, these
705 * buffers *would have* consumed this number of bytes.
707 kstat_named_t arcstat_mru_ghost_size;
708 kstat_named_t arcstat_mru_ghost_data;
709 kstat_named_t arcstat_mru_ghost_metadata;
711 * Number of bytes that *would have been* consumed by ARC
712 * buffers that are eligible for eviction, of type
713 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
715 kstat_named_t arcstat_mru_ghost_evictable_data;
717 * Number of bytes that *would have been* consumed by ARC
718 * buffers that are eligible for eviction, of type
719 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
721 kstat_named_t arcstat_mru_ghost_evictable_metadata;
723 * Total number of bytes consumed by ARC buffers residing in the
724 * arc_mfu state. This includes *all* buffers in the arc_mfu
725 * state; e.g. data, metadata, evictable, and unevictable buffers
726 * are all included in this value.
728 kstat_named_t arcstat_mfu_size;
729 kstat_named_t arcstat_mfu_data;
730 kstat_named_t arcstat_mfu_metadata;
732 * Number of bytes consumed by ARC buffers that are eligible for
733 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
734 * state.
736 kstat_named_t arcstat_mfu_evictable_data;
738 * Number of bytes consumed by ARC buffers that are eligible for
739 * eviction, of type ARC_BUFC_METADATA, and reside in the
740 * arc_mfu state.
742 kstat_named_t arcstat_mfu_evictable_metadata;
744 * Total number of bytes that *would have been* consumed by ARC
745 * buffers in the arc_mfu_ghost state. See the comment above
746 * arcstat_mru_ghost_size for more details.
748 kstat_named_t arcstat_mfu_ghost_size;
749 kstat_named_t arcstat_mfu_ghost_data;
750 kstat_named_t arcstat_mfu_ghost_metadata;
752 * Number of bytes that *would have been* consumed by ARC
753 * buffers that are eligible for eviction, of type
754 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
756 kstat_named_t arcstat_mfu_ghost_evictable_data;
758 * Number of bytes that *would have been* consumed by ARC
759 * buffers that are eligible for eviction, of type
760 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
762 kstat_named_t arcstat_mfu_ghost_evictable_metadata;
764 * Total number of bytes that are going to be evicted from ARC due to
765 * ARC_FLAG_UNCACHED being set.
767 kstat_named_t arcstat_uncached_size;
768 kstat_named_t arcstat_uncached_data;
769 kstat_named_t arcstat_uncached_metadata;
771 * Number of data bytes that are going to be evicted from ARC due to
772 * ARC_FLAG_UNCACHED being set.
774 kstat_named_t arcstat_uncached_evictable_data;
776 * Number of metadata bytes that that are going to be evicted from ARC
777 * due to ARC_FLAG_UNCACHED being set.
779 kstat_named_t arcstat_uncached_evictable_metadata;
780 kstat_named_t arcstat_l2_hits;
781 kstat_named_t arcstat_l2_misses;
783 * Allocated size (in bytes) of L2ARC cached buffers by ARC state.
785 kstat_named_t arcstat_l2_prefetch_asize;
786 kstat_named_t arcstat_l2_mru_asize;
787 kstat_named_t arcstat_l2_mfu_asize;
789 * Allocated size (in bytes) of L2ARC cached buffers by buffer content
790 * type.
792 kstat_named_t arcstat_l2_bufc_data_asize;
793 kstat_named_t arcstat_l2_bufc_metadata_asize;
794 kstat_named_t arcstat_l2_feeds;
795 kstat_named_t arcstat_l2_rw_clash;
796 kstat_named_t arcstat_l2_read_bytes;
797 kstat_named_t arcstat_l2_write_bytes;
798 kstat_named_t arcstat_l2_writes_sent;
799 kstat_named_t arcstat_l2_writes_done;
800 kstat_named_t arcstat_l2_writes_error;
801 kstat_named_t arcstat_l2_writes_lock_retry;
802 kstat_named_t arcstat_l2_evict_lock_retry;
803 kstat_named_t arcstat_l2_evict_reading;
804 kstat_named_t arcstat_l2_evict_l1cached;
805 kstat_named_t arcstat_l2_free_on_write;
806 kstat_named_t arcstat_l2_abort_lowmem;
807 kstat_named_t arcstat_l2_cksum_bad;
808 kstat_named_t arcstat_l2_io_error;
809 kstat_named_t arcstat_l2_lsize;
810 kstat_named_t arcstat_l2_psize;
811 kstat_named_t arcstat_l2_hdr_size;
813 * Number of L2ARC log blocks written. These are used for restoring the
814 * L2ARC. Updated during writing of L2ARC log blocks.
816 kstat_named_t arcstat_l2_log_blk_writes;
818 * Moving average of the aligned size of the L2ARC log blocks, in
819 * bytes. Updated during L2ARC rebuild and during writing of L2ARC
820 * log blocks.
822 kstat_named_t arcstat_l2_log_blk_avg_asize;
823 /* Aligned size of L2ARC log blocks on L2ARC devices. */
824 kstat_named_t arcstat_l2_log_blk_asize;
825 /* Number of L2ARC log blocks present on L2ARC devices. */
826 kstat_named_t arcstat_l2_log_blk_count;
828 * Moving average of the aligned size of L2ARC restored data, in bytes,
829 * to the aligned size of their metadata in L2ARC, in bytes.
830 * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
832 kstat_named_t arcstat_l2_data_to_meta_ratio;
834 * Number of times the L2ARC rebuild was successful for an L2ARC device.
836 kstat_named_t arcstat_l2_rebuild_success;
838 * Number of times the L2ARC rebuild failed because the device header
839 * was in an unsupported format or corrupted.
841 kstat_named_t arcstat_l2_rebuild_abort_unsupported;
843 * Number of times the L2ARC rebuild failed because of IO errors
844 * while reading a log block.
846 kstat_named_t arcstat_l2_rebuild_abort_io_errors;
848 * Number of times the L2ARC rebuild failed because of IO errors when
849 * reading the device header.
851 kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
853 * Number of L2ARC log blocks which failed to be restored due to
854 * checksum errors.
856 kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
858 * Number of times the L2ARC rebuild was aborted due to low system
859 * memory.
861 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
862 /* Logical size of L2ARC restored data, in bytes. */
863 kstat_named_t arcstat_l2_rebuild_size;
864 /* Aligned size of L2ARC restored data, in bytes. */
865 kstat_named_t arcstat_l2_rebuild_asize;
867 * Number of L2ARC log entries (buffers) that were successfully
868 * restored in ARC.
870 kstat_named_t arcstat_l2_rebuild_bufs;
872 * Number of L2ARC log entries (buffers) already cached in ARC. These
873 * were not restored again.
875 kstat_named_t arcstat_l2_rebuild_bufs_precached;
877 * Number of L2ARC log blocks that were restored successfully. Each
878 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
880 kstat_named_t arcstat_l2_rebuild_log_blks;
881 kstat_named_t arcstat_memory_throttle_count;
882 kstat_named_t arcstat_memory_direct_count;
883 kstat_named_t arcstat_memory_indirect_count;
884 kstat_named_t arcstat_memory_all_bytes;
885 kstat_named_t arcstat_memory_free_bytes;
886 kstat_named_t arcstat_memory_available_bytes;
887 kstat_named_t arcstat_no_grow;
888 kstat_named_t arcstat_tempreserve;
889 kstat_named_t arcstat_loaned_bytes;
890 kstat_named_t arcstat_prune;
891 kstat_named_t arcstat_meta_used;
892 kstat_named_t arcstat_dnode_limit;
893 kstat_named_t arcstat_async_upgrade_sync;
894 /* Number of predictive prefetch requests. */
895 kstat_named_t arcstat_predictive_prefetch;
896 /* Number of requests for which predictive prefetch has completed. */
897 kstat_named_t arcstat_demand_hit_predictive_prefetch;
898 /* Number of requests for which predictive prefetch was running. */
899 kstat_named_t arcstat_demand_iohit_predictive_prefetch;
900 /* Number of prescient prefetch requests. */
901 kstat_named_t arcstat_prescient_prefetch;
902 /* Number of requests for which prescient prefetch has completed. */
903 kstat_named_t arcstat_demand_hit_prescient_prefetch;
904 /* Number of requests for which prescient prefetch was running. */
905 kstat_named_t arcstat_demand_iohit_prescient_prefetch;
906 kstat_named_t arcstat_need_free;
907 kstat_named_t arcstat_sys_free;
908 kstat_named_t arcstat_raw_size;
909 kstat_named_t arcstat_cached_only_in_progress;
910 kstat_named_t arcstat_abd_chunk_waste_size;
911 } arc_stats_t;
/*
 * Counters behind the ARC statistics. ARCSTAT_INCR() and its
 * ARCSTAT_BUMP()/ARCSTAT_BUMPDOWN() wrappers select a field of the global
 * arc_sums instance by name and add to it with wmsum_add().
 *
 * Field names follow the arcstat_* names used in arc_stats_t, but not
 * every arc_stats_t field has a counterpart here (for example
 * arcstat_no_grow, arcstat_tempreserve and arcstat_loaned_bytes do not).
 *
 * arcstat_size and arcstat_l2_hdr_size are aggsum_t rather than wmsum_t.
 * NOTE(review): ARCSTAT_INCR() hands its target to wmsum_add(), so these
 * two are presumably updated through the aggsum interface instead --
 * confirm against the users of this struct.
 */
913 typedef struct arc_sums {
914 wmsum_t arcstat_hits;
915 wmsum_t arcstat_iohits;
916 wmsum_t arcstat_misses;
917 wmsum_t arcstat_demand_data_hits;
918 wmsum_t arcstat_demand_data_iohits;
919 wmsum_t arcstat_demand_data_misses;
920 wmsum_t arcstat_demand_metadata_hits;
921 wmsum_t arcstat_demand_metadata_iohits;
922 wmsum_t arcstat_demand_metadata_misses;
923 wmsum_t arcstat_prefetch_data_hits;
924 wmsum_t arcstat_prefetch_data_iohits;
925 wmsum_t arcstat_prefetch_data_misses;
926 wmsum_t arcstat_prefetch_metadata_hits;
927 wmsum_t arcstat_prefetch_metadata_iohits;
928 wmsum_t arcstat_prefetch_metadata_misses;
929 wmsum_t arcstat_mru_hits;
930 wmsum_t arcstat_mru_ghost_hits;
931 wmsum_t arcstat_mfu_hits;
932 wmsum_t arcstat_mfu_ghost_hits;
933 wmsum_t arcstat_uncached_hits;
934 wmsum_t arcstat_deleted;
935 wmsum_t arcstat_mutex_miss;
936 wmsum_t arcstat_access_skip;
937 wmsum_t arcstat_evict_skip;
938 wmsum_t arcstat_evict_not_enough;
939 wmsum_t arcstat_evict_l2_cached;
940 wmsum_t arcstat_evict_l2_eligible;
941 wmsum_t arcstat_evict_l2_eligible_mfu;
942 wmsum_t arcstat_evict_l2_eligible_mru;
943 wmsum_t arcstat_evict_l2_ineligible;
944 wmsum_t arcstat_evict_l2_skip;
945 wmsum_t arcstat_hash_elements;
946 wmsum_t arcstat_hash_collisions;
947 wmsum_t arcstat_hash_chains;
948 aggsum_t arcstat_size; /* aggsum_t, unlike most fields */
949 wmsum_t arcstat_compressed_size;
950 wmsum_t arcstat_uncompressed_size;
951 wmsum_t arcstat_overhead_size;
952 wmsum_t arcstat_hdr_size;
953 wmsum_t arcstat_data_size;
954 wmsum_t arcstat_metadata_size;
955 wmsum_t arcstat_dbuf_size;
956 wmsum_t arcstat_dnode_size;
957 wmsum_t arcstat_bonus_size;
958 wmsum_t arcstat_l2_hits;
959 wmsum_t arcstat_l2_misses;
960 wmsum_t arcstat_l2_prefetch_asize;
961 wmsum_t arcstat_l2_mru_asize;
962 wmsum_t arcstat_l2_mfu_asize;
963 wmsum_t arcstat_l2_bufc_data_asize;
964 wmsum_t arcstat_l2_bufc_metadata_asize;
965 wmsum_t arcstat_l2_feeds;
966 wmsum_t arcstat_l2_rw_clash;
967 wmsum_t arcstat_l2_read_bytes;
968 wmsum_t arcstat_l2_write_bytes;
969 wmsum_t arcstat_l2_writes_sent;
970 wmsum_t arcstat_l2_writes_done;
971 wmsum_t arcstat_l2_writes_error;
972 wmsum_t arcstat_l2_writes_lock_retry;
973 wmsum_t arcstat_l2_evict_lock_retry;
974 wmsum_t arcstat_l2_evict_reading;
975 wmsum_t arcstat_l2_evict_l1cached;
976 wmsum_t arcstat_l2_free_on_write;
977 wmsum_t arcstat_l2_abort_lowmem;
978 wmsum_t arcstat_l2_cksum_bad;
979 wmsum_t arcstat_l2_io_error;
980 wmsum_t arcstat_l2_lsize;
981 wmsum_t arcstat_l2_psize;
982 aggsum_t arcstat_l2_hdr_size; /* aggsum_t, unlike most fields */
983 wmsum_t arcstat_l2_log_blk_writes;
984 wmsum_t arcstat_l2_log_blk_asize;
985 wmsum_t arcstat_l2_log_blk_count;
986 wmsum_t arcstat_l2_rebuild_success;
987 wmsum_t arcstat_l2_rebuild_abort_unsupported;
988 wmsum_t arcstat_l2_rebuild_abort_io_errors;
989 wmsum_t arcstat_l2_rebuild_abort_dh_errors;
990 wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors;
991 wmsum_t arcstat_l2_rebuild_abort_lowmem;
992 wmsum_t arcstat_l2_rebuild_size;
993 wmsum_t arcstat_l2_rebuild_asize;
994 wmsum_t arcstat_l2_rebuild_bufs;
995 wmsum_t arcstat_l2_rebuild_bufs_precached;
996 wmsum_t arcstat_l2_rebuild_log_blks;
997 wmsum_t arcstat_memory_throttle_count;
998 wmsum_t arcstat_memory_direct_count;
999 wmsum_t arcstat_memory_indirect_count;
1000 wmsum_t arcstat_prune;
1001 wmsum_t arcstat_meta_used;
1002 wmsum_t arcstat_async_upgrade_sync;
1003 wmsum_t arcstat_predictive_prefetch;
1004 wmsum_t arcstat_demand_hit_predictive_prefetch;
1005 wmsum_t arcstat_demand_iohit_predictive_prefetch;
1006 wmsum_t arcstat_prescient_prefetch;
1007 wmsum_t arcstat_demand_hit_prescient_prefetch;
1008 wmsum_t arcstat_demand_iohit_prescient_prefetch;
1009 wmsum_t arcstat_raw_size;
1010 wmsum_t arcstat_cached_only_in_progress;
1011 wmsum_t arcstat_abd_chunk_waste_size;
1012 } arc_sums_t;
/*
 * One waiter record: a list linkage node, a condition variable and a
 * 64-bit count.
 * NOTE(review): presumably queued by threads blocking in
 * arc_wait_for_eviction() until eviction progress reaches aew_count --
 * confirm against the implementation.
 */
1014 typedef struct arc_evict_waiter {
1015 list_node_t aew_node; /* linkage for the list holding this waiter */
1016 kcondvar_t aew_cv; /* condition variable owned by this waiter */
1017 uint64_t aew_count; /* NOTE(review): looks like a wake-up threshold; confirm */
1018 } arc_evict_waiter_t;
/*
 * Statistics accessors.
 *
 * ARCSTAT(stat) names the ui64 value of field `stat` in the global
 * arc_stats (the kstat-facing structure).
 *
 * ARCSTAT_INCR(stat, val) adds `val` to the same-named field of the
 * global arc_sums via wmsum_add(); it does not touch arc_stats.
 * ARCSTAT_BUMP()/ARCSTAT_BUMPDOWN() are ARCSTAT_INCR() by +1 and -1.
 */
1020 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
1022 #define ARCSTAT_INCR(stat, val) \
1023 wmsum_add(&arc_sums.stat, (val))
1025 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
1026 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
/*
 * Short names for individual arc_stats values. Each one expands through
 * ARCSTAT() to the ui64 member of the named kstat field, so it reads and
 * writes arc_stats directly rather than going through arc_sums.
 */
1028 #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */
1029 #define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */
1030 #define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */
1031 #define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */
1032 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
1033 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
1034 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
1035 #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
/*
 * Pointer shorthands: each arc_<state> expands to the address of the
 * matching ARC_<state> object.
 * NOTE(review): in the portion of this header reviewed, only ARC_mfu and
 * ARC_mru are declared extern; the other ARC_* objects must already be
 * in scope wherever the remaining macros are used.
 */
1037 #define arc_anon (&ARC_anon)
1038 #define arc_mru (&ARC_mru)
1039 #define arc_mru_ghost (&ARC_mru_ghost)
1040 #define arc_mfu (&ARC_mfu)
1041 #define arc_mfu_ghost (&ARC_mfu_ghost)
1042 #define arc_l2c_only (&ARC_l2c_only)
1043 #define arc_uncached (&ARC_uncached)
/*
 * ARC globals shared through this header; these are declarations only,
 * the objects are defined elsewhere. arc_stats and arc_sums are the
 * objects the ARCSTAT*() macros operate on, and ARC_mfu/ARC_mru are the
 * targets of the arc_mfu/arc_mru shorthands.
 */
1045 extern taskq_t *arc_prune_taskq;
1046 extern arc_stats_t arc_stats;
1047 extern arc_sums_t arc_sums;
1048 extern hrtime_t arc_growtime;
1049 extern boolean_t arc_warm;
1050 extern uint_t arc_grow_retry;
1051 extern uint_t arc_no_grow_shift;
1052 extern uint_t arc_shrink_shift;
1053 extern kmutex_t arc_prune_mtx;
1054 extern list_t arc_prune_list;
1055 extern arc_state_t ARC_mfu;
1056 extern arc_state_t ARC_mru;
1057 extern uint_t zfs_arc_pc_percent;
1058 extern uint_t arc_lotsfree_percent;
1059 extern uint64_t zfs_arc_min;
1060 extern uint64_t zfs_arc_max;
/*
 * ARC function prototypes shared through this header; the definitions
 * are not part of this file.
 * NOTE(review): arc_wait_for_eviction() and arc_tuning_update() are
 * declared without parameter names, so the meaning of their arguments
 * has to be taken from the definitions.
 */
1062 extern uint64_t arc_reduce_target_size(uint64_t to_free);
1063 extern boolean_t arc_reclaim_needed(void);
1064 extern void arc_kmem_reap_soon(void);
1065 extern void arc_wait_for_eviction(uint64_t, boolean_t, boolean_t);
/* Low-memory hooks and memory accounting queries. */
1067 extern void arc_lowmem_init(void);
1068 extern void arc_lowmem_fini(void);
1069 extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
1070 extern uint64_t arc_free_memory(void);
1071 extern int64_t arc_available_memory(void);
1072 extern void arc_tuning_update(boolean_t);
1073 extern void arc_register_hotplug(void);
1074 extern void arc_unregister_hotplug(void);
/* Setters whose argument list is supplied by ZFS_MODULE_PARAM_ARGS. */
1076 extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS);
1077 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
1078 extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
1079 extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
/*
 * L2ARC prototypes, grouped by the source file each comment below names
 * as their user.
 */
1081 /* used in zdb.c */
/* Returns a boolean_t; lbp is const, so it is not written through. */
1082 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
1083 const l2arc_log_blkptr_t *lbp);
1085 /* used in vdev_trim.c */
1086 void l2arc_dev_hdr_update(l2arc_dev_t *dev);
/* Returns an l2arc_dev_t pointer for the given vdev_t. */
1087 l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
/*
 * Close the extern "C" linkage block opened under the matching
 * #ifdef __cplusplus near the top of this header, then close the
 * include guard. The closing brace (listing line 1090) was missing,
 * which would leave the linkage block unterminated for C++ consumers.
 */
1089 #ifdef __cplusplus
1090 }
1091 #endif
1093 #endif /* _SYS_ARC_IMPL_H */