4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
30 #include <sys/spa_impl.h>
32 #include <sys/dsl_synctask.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/utsname.h>
36 #include <sys/cmn_err.h>
37 #include <sys/sunddi.h>
43 * Routines to manage the on-disk history log.
45 * The history log is stored as a dmu object containing
46 * <packed record length, record nvlist> tuples.
48 * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
49 * "packed record length" is the packed length of the "record nvlist" stored
50 * as a little endian uint64_t.
52 * The log is implemented as a ring buffer, though the original creation
53 * of the pool ('zpool create') is never overwritten.
55 * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
56 * of 'spa_history' stores the offsets for logging/retrieving history as
57 * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
58 * where the 'zpool create' record is stored. This allows us to never
59 * overwrite the original creation of the pool. 'sh_phys_max_off' is the
60 * physical ending offset in bytes of the log. This tells you the length of
61 * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
62 * is added, 'sh_eof' is incremented by the size of the record.
63 * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
64 * This is where the consumer should start reading from after reading in
65 * the 'zpool create' portion of the log.
67 * 'sh_records_lost' keeps track of how many records have been overwritten
68 * and permanently lost.
/*
 * Translate the logical log offset 'log_off' into a physical offset inside
 * the history object.
 *
 * The area past 'sh_pool_create_len' is treated as a ring whose length is
 * (sh_phys_max_off - sh_pool_create_len): the offset relative to
 * 'sh_pool_create_len' is reduced modulo that length and then rebased onto
 * 'sh_pool_create_len'.
 *
 * NOTE(review): the modulo divides by phys_len, so this presumably relies
 * on sh_phys_max_off being greater than sh_pool_create_len -- confirm.
 */
71 /* convert a logical offset to physical */
73 spa_history_log_to_phys(uint64_t log_off
, spa_history_phys_t
*shpp
)
77 phys_len
= shpp
->sh_phys_max_off
- shpp
->sh_pool_create_len
;
78 return ((log_off
- shpp
->sh_pool_create_len
) % phys_len
79 + shpp
->sh_pool_create_len
);
/*
 * Create the on-disk history object for 'spa' within transaction 'tx'.
 *
 * Allocates a DMU_OT_SPA_HISTORY object (bonus type
 * DMU_OT_SPA_HISTORY_OFFSETS, bonus size sizeof (spa_history_phys_t)) in
 * the meta objset, stores its object number in the pool directory under
 * DMU_POOL_HISTORY, then dirties the bonus buffer and sets
 * 'sh_phys_max_off' to the size limit of the log.
 *
 * spa->spa_history must be 0 on entry (ASSERTed).  Failures of zap_add()
 * and dmu_bonus_hold() are fatal through VERIFY.
 *
 * NOTE(review): the statement that points 'shpp' at the bonus buffer is
 * not visible in this listing; presumably it aliases dbp's data -- confirm.
 */
83 spa_history_create_obj(spa_t
*spa
, dmu_tx_t
*tx
)
86 spa_history_phys_t
*shpp
;
87 objset_t
*mos
= spa
->spa_meta_objset
;
89 ASSERT(spa
->spa_history
== 0);
90 spa
->spa_history
= dmu_object_alloc(mos
, DMU_OT_SPA_HISTORY
,
91 SPA_MAXBLOCKSIZE
, DMU_OT_SPA_HISTORY_OFFSETS
,
92 sizeof (spa_history_phys_t
), tx
);
/* publish the new object number in the pool directory */
94 VERIFY(zap_add(mos
, DMU_POOL_DIRECTORY_OBJECT
,
95 DMU_POOL_HISTORY
, sizeof (uint64_t), 1,
96 &spa
->spa_history
, tx
) == 0);
98 VERIFY(0 == dmu_bonus_hold(mos
, spa
->spa_history
, FTAG
, &dbp
));
99 ASSERT(dbp
->db_size
>= sizeof (spa_history_phys_t
));
102 dmu_buf_will_dirty(dbp
, tx
);
105 * Figure out maximum size of history log. We set it at
106 * 1% of pool size, with a max of 32MB and min of 128KB.
108 shpp
->sh_phys_max_off
= spa_get_dspace(spa
) / 100;
109 shpp
->sh_phys_max_off
= MIN(shpp
->sh_phys_max_off
, 32<<20);
110 shpp
->sh_phys_max_off
= MAX(shpp
->sh_phys_max_off
, 128<<10);
/* drop the hold taken by dmu_bonus_hold() above */
112 dmu_buf_rele(dbp
, FTAG
);
/*
 * Move the logical beginning-of-file past one record.
 *
 * Reads the record's length prefix (sizeof (reclen) bytes) at the physical
 * position of 'sh_bof', then advances 'sh_bof' by that length plus the
 * prefix size and increments 'sh_records_lost'.
 *
 * The prefix may be cut short by the physical end of the log
 * ('sh_phys_max_off'); the missing bytes are then read starting at
 * 'sh_pool_create_len'.
 *
 * A non-zero result from dmu_read() is captured in 'err'.
 * NOTE(review): the return statements are not visible in this listing;
 * the caller treats a non-zero result as failure.
 */
116 * Change 'sh_bof' to the beginning of the next record.
119 spa_history_advance_bof(spa_t
*spa
, spa_history_phys_t
*shpp
)
121 objset_t
*mos
= spa
->spa_meta_objset
;
122 uint64_t firstread
, reclen
, phys_bof
;
123 char buf
[sizeof (reclen
)];
126 phys_bof
= spa_history_log_to_phys(shpp
->sh_bof
, shpp
);
/* firstread: how much of the prefix lies before the physical end */
127 firstread
= MIN(sizeof (reclen
), shpp
->sh_phys_max_off
- phys_bof
);
129 if ((err
= dmu_read(mos
, spa
->spa_history
, phys_bof
, firstread
,
/* prefix was split by the physical end: fetch the rest after wrapping */
132 if (firstread
!= sizeof (reclen
)) {
133 if ((err
= dmu_read(mos
, spa
->spa_history
,
134 shpp
->sh_pool_create_len
, sizeof (reclen
) - firstread
,
135 buf
+ firstread
)) != 0)
/*
 * The length prefix is stored little endian.
 * NOTE(review): 'buf' is a char array accessed through a uint64_t
 * pointer; confirm the alignment is acceptable on all targets.
 */
139 reclen
= LE_64(*((uint64_t *)buf
));
140 shpp
->sh_bof
+= reclen
+ sizeof (reclen
);
141 shpp
->sh_records_lost
++;
/*
 * Write 'len' bytes from 'buf' into the history object at the physical
 * position of the logical EOF ('sh_eof').
 *
 * Before writing, the logical BOF is advanced over old records until the
 * ring area has more than 'len' bytes free; an error from
 * spa_history_advance_bof() is captured in 'err'.  The part of the data
 * that fits before 'sh_phys_max_off' is written first; the rest is written
 * at 'sh_pool_create_len'.
 *
 * The caller must hold spa->spa_history_lock (ASSERTed).
 *
 * NOTE(review): the trailing parameter (used below as 'tx'), the update of
 * 'sh_eof' and the guard around the second write are not visible in this
 * listing -- confirm against the full source.
 */
146 spa_history_write(spa_t
*spa
, void *buf
, uint64_t len
, spa_history_phys_t
*shpp
,
149 uint64_t firstwrite
, phys_eof
;
150 objset_t
*mos
= spa
->spa_meta_objset
;
153 ASSERT(MUTEX_HELD(&spa
->spa_history_lock
));
155 /* see if we need to reset logical BOF */
/* free space = ring length minus the bytes in use (sh_eof - sh_bof) */
156 while (shpp
->sh_phys_max_off
- shpp
->sh_pool_create_len
-
157 (shpp
->sh_eof
- shpp
->sh_bof
) <= len
) {
158 if ((err
= spa_history_advance_bof(spa
, shpp
)) != 0) {
163 phys_eof
= spa_history_log_to_phys(shpp
->sh_eof
, shpp
);
/* firstwrite: the portion that fits before the physical end of the log */
164 firstwrite
= MIN(len
, shpp
->sh_phys_max_off
- phys_eof
);
166 dmu_write(mos
, spa
->spa_history
, phys_eof
, firstwrite
, buf
, tx
);
170 /* write out the rest at the beginning of physical file */
171 dmu_write(mos
, spa
->spa_history
, shpp
->sh_pool_create_len
,
172 len
, (char *)buf
+ firstwrite
, tx
);
/*
 * Kernel builds other than NetBSD: yield the zone name of the current
 * process (curproc->p_zone->zone_name).
 *
 * NOTE(review): presumably the body of spa_history_zone(); the function
 * header and the other branch of this conditional are not visible in this
 * listing -- confirm.
 */
181 #if defined(_KERNEL) && !defined(__NetBSD__)
182 return (curproc
->p_zone
->zone_name
);
/*
 * Append one record to the history log within transaction 'tx'.
 *
 * arg2 is the history_arg_t describing the event; 'cr' supplies the uid
 * stored in the record.  (arg1 is presumably the spa_t -- confirm; its use
 * is not visible in this listing.)
 *
 * Steps visible below:
 *  - create the history object if the pool does not have one yet;
 *  - hold and dirty the history object's bonus buffer;
 *  - build an nvlist record with time, uid, optional zone and host name,
 *    plus either the command string or the internal event fields;
 *  - pack the nvlist XDR-encoded and, under spa_history_lock, write the
 *    little-endian packed length followed by the packed record through
 *    spa_history_write();
 *  - for LOG_CMD_POOL_CREATE, on success, grow 'sh_pool_create_len' by the
 *    bytes written and set 'sh_bof' to it;
 *  - for LOG_INTERNAL, free the history string and the history_arg_t.
 */
189 * Write out a history event.
192 spa_history_log_sync(void *arg1
, void *arg2
, cred_t
*cr
, dmu_tx_t
*tx
)
195 history_arg_t
*hap
= arg2
;
196 const char *history_str
= hap
->ha_history_str
;
197 objset_t
*mos
= spa
->spa_meta_objset
;
199 spa_history_phys_t
*shpp
;
203 char *record_packed
= NULL
;
207 * If we have an older pool that doesn't have a command
208 * history object, create it now.
210 mutex_enter(&spa
->spa_history_lock
);
211 if (!spa
->spa_history
)
212 spa_history_create_obj(spa
, tx
);
213 mutex_exit(&spa
->spa_history_lock
);
216 * Get the offset of where we need to write via the bonus buffer.
217 * Update the offset when the write completes.
219 VERIFY(0 == dmu_bonus_hold(mos
, spa
->spa_history
, FTAG
, &dbp
));
222 dmu_buf_will_dirty(dbp
, tx
);
/* sanity check: the bonus buffer must hold the history offsets */
226 dmu_object_info_t doi
;
227 dmu_object_info_from_db(dbp
, &doi
);
228 ASSERT3U(doi
.doi_bonus_type
, ==, DMU_OT_SPA_HISTORY_OFFSETS
);
/* assemble the record; every nvlist operation is fatal on failure */
232 VERIFY(nvlist_alloc(&nvrecord
, NV_UNIQUE_NAME
, KM_SLEEP
) == 0);
233 VERIFY(nvlist_add_uint64(nvrecord
, ZPOOL_HIST_TIME
,
234 gethrestime_sec()) == 0);
235 VERIFY(nvlist_add_uint64(nvrecord
, ZPOOL_HIST_WHO
,
236 (uint64_t)crgetuid(cr
)) == 0);
/* the zone entry is only added when a zone name was supplied */
237 if (hap
->ha_zone
[0] != '\0')
238 VERIFY(nvlist_add_string(nvrecord
, ZPOOL_HIST_ZONE
,
241 VERIFY(nvlist_add_string(nvrecord
, ZPOOL_HIST_HOST
,
242 utsname
.nodename
) == 0);
244 if (hap
->ha_log_type
== LOG_CMD_POOL_CREATE
||
245 hap
->ha_log_type
== LOG_CMD_NORMAL
) {
246 VERIFY(nvlist_add_string(nvrecord
, ZPOOL_HIST_CMD
,
249 VERIFY(nvlist_add_uint64(nvrecord
, ZPOOL_HIST_INT_EVENT
,
250 hap
->ha_event
) == 0);
251 VERIFY(nvlist_add_uint64(nvrecord
, ZPOOL_HIST_TXG
,
253 VERIFY(nvlist_add_string(nvrecord
, ZPOOL_HIST_INT_STR
,
/* size the packing buffer from nvlist_size(), then pack into it */
257 VERIFY(nvlist_size(nvrecord
, &reclen
, NV_ENCODE_XDR
) == 0);
258 record_packed
= kmem_alloc(reclen
, KM_SLEEP
);
260 VERIFY(nvlist_pack(nvrecord
, &record_packed
, &reclen
,
261 NV_ENCODE_XDR
, KM_SLEEP
) == 0);
263 mutex_enter(&spa
->spa_history_lock
);
/* the pool-create record must be the first thing in the log */
264 if (hap
->ha_log_type
== LOG_CMD_POOL_CREATE
)
265 VERIFY(shpp
->sh_eof
== shpp
->sh_pool_create_len
);
267 /* write out the packed length as little endian */
268 le_len
= LE_64((uint64_t)reclen
);
269 ret
= spa_history_write(spa
, &le_len
, sizeof (le_len
), shpp
, tx
);
271 ret
= spa_history_write(spa
, record_packed
, reclen
, shpp
, tx
);
/* extend the protected pool-create region over what was just written */
273 if (!ret
&& hap
->ha_log_type
== LOG_CMD_POOL_CREATE
) {
274 shpp
->sh_pool_create_len
+= sizeof (le_len
) + reclen
;
275 shpp
->sh_bof
= shpp
->sh_pool_create_len
;
278 mutex_exit(&spa
->spa_history_lock
);
279 nvlist_free(nvrecord
);
280 kmem_free(record_packed
, reclen
);
281 dmu_buf_rele(dbp
, FTAG
);
/* internal events: release the argument block and its string here */
283 if (hap
->ha_log_type
== LOG_INTERNAL
) {
284 kmem_free((void*)hap
->ha_history_str
, HIS_MAX_RECORD_LEN
);
285 kmem_free(hap
, sizeof (history_arg_t
));
/*
 * Log a non-internal history event for 'spa'.
 *
 * 'what' must not be LOG_INTERNAL (ASSERTed).  Fills the argument block
 * 'ha' with the history string, the log type and the value of
 * spa_history_zone() (copied with strlcpy, so truncated to fit 'ha_zone'),
 * then runs spa_history_log_sync() through dsl_sync_task_do() and returns
 * that call's result.
 *
 * NOTE(review): the declaration of 'ha' and the trailing arguments of the
 * dsl_sync_task_do() call are not visible in this listing.
 */
290 * Write out a history event.
293 spa_history_log(spa_t
*spa
, const char *history_str
, history_log_type_t what
)
297 ASSERT(what
!= LOG_INTERNAL
);
299 ha
.ha_history_str
= history_str
;
300 ha
.ha_log_type
= what
;
301 (void) strlcpy(ha
.ha_zone
, spa_history_zone(), sizeof (ha
.ha_zone
));
302 return (dsl_sync_task_do(spa_get_dsl(spa
), NULL
, spa_history_log_sync
,
/*
 * Copy raw history log bytes into 'buf'.
 *
 *   offp - in: logical offset to start reading at;
 *          out: advanced by the number of bytes read.
 *   len  - in: maximum number of bytes to read;
 *          out: number of bytes actually read.
 *   buf  - destination buffer.
 *
 * An offset below 'sh_pool_create_len' reads only from the pool-create
 * region.  Any other offset is first raised to 'sh_bof' if it is lower,
 * then mapped to a physical offset; a read that reaches 'sh_phys_max_off'
 * continues at 'sh_pool_create_len', with that second part counted in
 * 'leftover'.
 *
 * spa_history_lock is held across the offset computation and the
 * dmu_read() calls; the bonus buffer hold is released before returning.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * per the comment below, a pool without a history object yields ENOENT.
 */
307 * Read out the command history.
310 spa_history_get(spa_t
*spa
, uint64_t *offp
, uint64_t *len
, char *buf
)
312 objset_t
*mos
= spa
->spa_meta_objset
;
314 uint64_t read_len
, phys_read_off
, phys_eof
;
315 uint64_t leftover
= 0;
316 spa_history_phys_t
*shpp
;
320 * If the command history doesn't exist (older pool),
321 * that's ok, just return ENOENT.
323 if (!spa
->spa_history
)
326 if ((err
= dmu_bonus_hold(mos
, spa
->spa_history
, FTAG
, &dbp
)) != 0)
/* sanity check: the bonus buffer must hold the history offsets */
332 dmu_object_info_t doi
;
333 dmu_object_info_from_db(dbp
, &doi
);
334 ASSERT3U(doi
.doi_bonus_type
, ==, DMU_OT_SPA_HISTORY_OFFSETS
);
338 mutex_enter(&spa
->spa_history_lock
);
339 phys_eof
= spa_history_log_to_phys(shpp
->sh_eof
, shpp
);
341 if (*offp
< shpp
->sh_pool_create_len
) {
342 /* read in just the zpool create history */
343 phys_read_off
= *offp
;
344 read_len
= MIN(*len
, shpp
->sh_pool_create_len
-
348 * Need to reset passed in offset to BOF if the passed in
349 * offset has since been overwritten.
351 *offp
= MAX(*offp
, shpp
->sh_bof
);
352 phys_read_off
= spa_history_log_to_phys(*offp
, shpp
);
355 * Read up to the minimum of what the user passed down or
356 * the EOF (physical or logical). If we hit physical EOF,
357 * use 'leftover' to read from the physical BOF.
359 if (phys_read_off
<= phys_eof
) {
360 read_len
= MIN(*len
, phys_eof
- phys_read_off
);
363 shpp
->sh_phys_max_off
- phys_read_off
);
364 if (phys_read_off
+ *len
> shpp
->sh_phys_max_off
) {
365 leftover
= MIN(*len
- read_len
,
366 phys_eof
- shpp
->sh_pool_create_len
);
371 /* offset for consumer to use next */
372 *offp
+= read_len
+ leftover
;
374 /* tell the consumer how much you actually read */
375 *len
= read_len
+ leftover
;
378 mutex_exit(&spa
->spa_history_lock
);
379 dmu_buf_rele(dbp
, FTAG
);
/* first part: from the computed physical offset */
383 err
= dmu_read(mos
, spa
->spa_history
, phys_read_off
, read_len
, buf
);
/* second part, only after a successful first read: the wrapped bytes */
384 if (leftover
&& err
== 0) {
385 err
= dmu_read(mos
, spa
->spa_history
, shpp
->sh_pool_create_len
,
386 leftover
, buf
+ read_len
);
388 mutex_exit(&spa
->spa_history_lock
);
390 dmu_buf_rele(dbp
, FTAG
);
/*
 * Log an internal event 'event' for 'spa' with a printf-style message
 * ('fmt' plus variable arguments).
 *
 * Logging is skipped when 'tx' belongs to TXG_INITIAL (see the comment
 * below).  Otherwise a history_arg_t and a HIS_MAX_RECORD_LEN byte string
 * buffer are allocated with KM_SLEEP, the message is formatted into the
 * buffer (vsnprintf bounds it to HIS_MAX_RECORD_LEN), and the argument
 * block is marked LOG_INTERNAL with an empty zone name.
 *
 * If 'tx' is already syncing, spa_history_log_sync() is called directly;
 * otherwise it is queued with dsl_sync_task_do_nowait().  Both allocations
 * are freed by spa_history_log_sync().
 */
395 spa_history_internal_log(history_internal_events_t event
, spa_t
*spa
,
396 dmu_tx_t
*tx
, cred_t
*cr
, const char *fmt
, ...)
403 * If this is part of creating a pool, not everything is
404 * initialized yet, so don't bother logging the internal events.
406 if (tx
->tx_txg
== TXG_INITIAL
)
409 hap
= kmem_alloc(sizeof (history_arg_t
), KM_SLEEP
);
410 str
= kmem_alloc(HIS_MAX_RECORD_LEN
, KM_SLEEP
);
413 (void) vsnprintf(str
, HIS_MAX_RECORD_LEN
, fmt
, adx
);
416 hap
->ha_log_type
= LOG_INTERNAL
;
417 hap
->ha_history_str
= str
;
418 hap
->ha_event
= event
;
/* an empty zone name means no zone entry is added to the record */
419 hap
->ha_zone
[0] = '\0';
421 if (dmu_tx_is_syncing(tx
)) {
422 spa_history_log_sync(spa
, hap
, cr
, tx
);
424 dsl_sync_task_do_nowait(spa_get_dsl(spa
), NULL
,
425 spa_history_log_sync
, spa
, hap
, 0, tx
);
427 /* spa_history_log_sync() will free hap and str */