4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2022 by Pawel Jakub Dawidek
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/cmn_err.h>
33 #include <sys/thread.h>
36 #include <sys/zfs_znode.h>
37 #include <sys/zfs_dir.h>
39 #include <sys/zil_impl.h>
40 #include <sys/byteorder.h>
41 #include <sys/policy.h>
47 #include <sys/zfs_fuid.h>
48 #include <sys/dsl_dataset.h>
51 * These zfs_log_* functions must be called within a dmu tx, in one
52 * of 2 contexts depending on zilog->z_replay:
56 * We need to record the transaction so that if it is committed to
57 * the Intent Log then it can be replayed. An intent log transaction
58 * structure (itx_t) is allocated and all the information necessary to
59 * possibly replay the transaction is saved in it. The itx is then assigned
60 * a sequence number and inserted in the in-memory list anchored in the zilog.
64 * We need to mark the intent log record as replayed in the log header.
65 * This is done in the same transaction as the replay so that they
/*
 * Pick the TX_* record type for a create from two inputs: whether an ACL
 * was supplied (vsecp != NULL) and whether vap carries extended attributes
 * (ATTR_XVATTR set in va_mask).
 * NOTE(review): the dispatch on 'type' between the two groups below, and
 * the value returned when neither input is present, are on lines not
 * visible in this excerpt -- confirm against the full file.
 */
70 zfs_log_create_txtype(zil_create_t type
, vsecattr_t
*vsecp
, vattr_t
*vap
)
72 int isxvattr
= (vap
->va_mask
& ATTR_XVATTR
);
/* First group: results are TX_CREATE_* variants. */
75 if (vsecp
== NULL
&& !isxvattr
)
77 if (vsecp
&& isxvattr
)
78 return (TX_CREATE_ACL_ATTR
);
80 return (TX_CREATE_ACL
);
82 return (TX_CREATE_ATTR
);
/* Second group: same tests, results are TX_MKDIR_* variants. */
84 if (vsecp
== NULL
&& !isxvattr
)
86 if (vsecp
&& isxvattr
)
87 return (TX_MKDIR_ACL_ATTR
);
89 return (TX_MKDIR_ACL
);
91 return (TX_MKDIR_ATTR
);
100 * Build up the log data necessary for logging xvattr_t.
101 * First lr_attr_t is initialized. Following the lr_attr_t
102 * is the mapsize and attribute bitmap copied from the xvattr_t.
103 * Following the bitmap and bitmapsize two 64 bit words are reserved
104 * for the create time which may be set. Following the create time
105 * records a single 64 bit integer which has the bits to set on
106 * replay for the xvattr.
/*
 * Serialize the requested optional attributes of xvap into the log area
 * that starts at lrattr.  As written by the code below, the layout is:
 * lr_attr_masksize, then xva_mapsize 32-bit bitmap words beginning at
 * lr_attr_bitmap, then an lr_attr_end_t holding the attribute bits
 * (lr_attr_attrs), the create time (lr_attr_crtime[2]) and a scanstamp
 * area (lr_attr_scanstamp).
 */
109 zfs_log_xvattr(lr_attr_t
*lrattr
, xvattr_t
*xvap
)
113 xoap
= xva_getxoptattr(xvap
);
116 lrattr
->lr_attr_masksize
= xvap
->xva_mapsize
;
/*
 * Copy the request bitmap word by word; when the loop finishes, bitmap
 * points just past the last word copied.
 */
117 uint32_t *bitmap
= &lrattr
->lr_attr_bitmap
;
118 for (int i
= 0; i
!= xvap
->xva_mapsize
; i
++, bitmap
++)
119 *bitmap
= xvap
->xva_reqattrmap
[i
];
/* The trailer begins right after the bitmap; clear it before filling. */
121 lr_attr_end_t
*end
= (lr_attr_end_t
*)bitmap
;
122 end
->lr_attr_attrs
= 0;
123 end
->lr_attr_crtime
[0] = 0;
124 end
->lr_attr_crtime
[1] = 0;
125 memset(end
->lr_attr_scanstamp
, 0, AV_SCANSTAMP_SZ
);
/*
 * For every requested boolean attribute, OR a value into lr_attr_attrs
 * when the attribute is non-zero.  The value OR'd in for each attribute
 * is on a continuation line not visible in this excerpt.
 */
127 if (XVA_ISSET_REQ(xvap
, XAT_READONLY
))
128 end
->lr_attr_attrs
|= (xoap
->xoa_readonly
== 0) ? 0 :
130 if (XVA_ISSET_REQ(xvap
, XAT_HIDDEN
))
131 end
->lr_attr_attrs
|= (xoap
->xoa_hidden
== 0) ? 0 :
133 if (XVA_ISSET_REQ(xvap
, XAT_SYSTEM
))
134 end
->lr_attr_attrs
|= (xoap
->xoa_system
== 0) ? 0 :
136 if (XVA_ISSET_REQ(xvap
, XAT_ARCHIVE
))
137 end
->lr_attr_attrs
|= (xoap
->xoa_archive
== 0) ? 0 :
139 if (XVA_ISSET_REQ(xvap
, XAT_IMMUTABLE
))
140 end
->lr_attr_attrs
|= (xoap
->xoa_immutable
== 0) ? 0 :
142 if (XVA_ISSET_REQ(xvap
, XAT_NOUNLINK
))
143 end
->lr_attr_attrs
|= (xoap
->xoa_nounlink
== 0) ? 0 :
145 if (XVA_ISSET_REQ(xvap
, XAT_APPENDONLY
))
146 end
->lr_attr_attrs
|= (xoap
->xoa_appendonly
== 0) ? 0 :
148 if (XVA_ISSET_REQ(xvap
, XAT_OPAQUE
))
149 end
->lr_attr_attrs
|= (xoap
->xoa_opaque
== 0) ? 0 :
151 if (XVA_ISSET_REQ(xvap
, XAT_NODUMP
))
152 end
->lr_attr_attrs
|= (xoap
->xoa_nodump
== 0) ? 0 :
154 if (XVA_ISSET_REQ(xvap
, XAT_AV_QUARANTINED
))
155 end
->lr_attr_attrs
|= (xoap
->xoa_av_quarantined
== 0) ? 0 :
157 if (XVA_ISSET_REQ(xvap
, XAT_AV_MODIFIED
))
158 end
->lr_attr_attrs
|= (xoap
->xoa_av_modified
== 0) ? 0 :
160 if (XVA_ISSET_REQ(xvap
, XAT_CREATETIME
))
161 ZFS_TIME_ENCODE(&xoap
->xoa_createtime
, end
->lr_attr_crtime
);
/*
 * lr_attr_scanstamp receives either the AV scanstamp or the project id;
 * the ASSERT records that both are not expected to be requested together.
 */
162 if (XVA_ISSET_REQ(xvap
, XAT_AV_SCANSTAMP
)) {
163 ASSERT(!XVA_ISSET_REQ(xvap
, XAT_PROJID
));
165 memcpy(end
->lr_attr_scanstamp
, xoap
->xoa_av_scanstamp
,
167 } else if (XVA_ISSET_REQ(xvap
, XAT_PROJID
)) {
169 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
170 * at the same time, so we can share the same space.
172 memcpy(end
->lr_attr_scanstamp
, &xoap
->xoa_projid
,
175 if (XVA_ISSET_REQ(xvap
, XAT_REPARSE
))
176 end
->lr_attr_attrs
|= (xoap
->xoa_reparse
== 0) ? 0 :
178 if (XVA_ISSET_REQ(xvap
, XAT_OFFLINE
))
179 end
->lr_attr_attrs
|= (xoap
->xoa_offline
== 0) ? 0 :
181 if (XVA_ISSET_REQ(xvap
, XAT_SPARSE
))
182 end
->lr_attr_attrs
|= (xoap
->xoa_sparse
== 0) ? 0 :
184 if (XVA_ISSET_REQ(xvap
, XAT_PROJINHERIT
))
185 end
->lr_attr_attrs
|= (xoap
->xoa_projinherit
== 0) ? 0 :
/*
 * Store the z_logfuid of every entry on fuidp->z_fuids into consecutive
 * uint64_t slots beginning at 'start'.
 * Callers in this file assign the return value back to their write
 * cursor; NOTE(review): the return statement is not visible in this
 * excerpt, presumably it yields the position after the last slot written.
 */
190 zfs_log_fuid_ids(zfs_fuid_info_t
*fuidp
, void *start
)
193 uint64_t *fuidloc
= start
;
195 /* First copy in the ACE FUIDs */
196 for (zfuid
= list_head(&fuidp
->z_fuids
); zfuid
;
197 zfuid
= list_next(&fuidp
->z_fuids
, zfuid
)) {
198 *fuidloc
++ = zfuid
->z_logfuid
;
/*
 * Append every domain string on fuidp->z_domains at 'start'.  Each string
 * is copied together with its terminating NUL (strlen + 1 bytes) and
 * 'start' is advanced by the same amount, so the strings end up packed
 * back to back.  Nothing is copied when z_domain_str_sz is 0.
 */
205 zfs_log_fuid_domains(zfs_fuid_info_t
*fuidp
, void *start
)
207 zfs_fuid_domain_t
*zdomain
;
209 /* now copy in the domain info, if any */
210 if (fuidp
->z_domain_str_sz
!= 0) {
211 for (zdomain
= list_head(&fuidp
->z_domains
); zdomain
;
212 zdomain
= list_next(&fuidp
->z_domains
, zdomain
)) {
213 memcpy(start
, zdomain
->z_domain
,
214 strlen(zdomain
->z_domain
) + 1);
215 start
= (caddr_t
)start
+
216 strlen(zdomain
->z_domain
) + 1;
223 * If zp is an xattr node, check whether the xattr owner is unlinked.
224 * We don't want to log anything if the owner is unlinked.
/*
 * Two versions of the parent walk appear below: the first iterates with a
 * separate cursor (tzp), the second advances zp itself.  Both follow
 * z_xattr_parent while ZFS_XATTR is set in z_pflags and then read
 * z_unlinked from the node reached.
 * NOTE(review): presumably these are the two sides of a platform #ifdef
 * whose directives are not visible in this excerpt -- confirm.
 */
227 zfs_xattr_owner_unlinked(znode_t
*zp
)
235 * zrele drops the vnode lock which violates the VOP locking contract
236 * on FreeBSD. See comment at the top of zfs_replay.c for more detail.
239 * if zp is XATTR node, keep walking up via z_xattr_parent until we
242 while (tzp
->z_pflags
& ZFS_XATTR
) {
/*
 * NOTE(review): this assertion reads zp while the loop iterates tzp;
 * confirm whether tzp->z_xattr_parent was intended.
 */
243 ASSERT3U(zp
->z_xattr_parent
, !=, 0);
244 if (zfs_zget(ZTOZSB(tzp
), tzp
->z_xattr_parent
, &dzp
) != 0) {
252 unlinked
= tzp
->z_unlinked
;
259 * if zp is XATTR node, keep walking up via z_xattr_parent until we
262 while (zp
->z_pflags
& ZFS_XATTR
) {
263 ASSERT3U(zp
->z_xattr_parent
, !=, 0);
264 if (zfs_zget(ZTOZSB(zp
), zp
->z_xattr_parent
, &dzp
) != 0) {
271 unlinked
= zp
->z_unlinked
;
279 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
280 * TX_MKXATTR transactions.
282 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
283 * domain information appended prior to the name. In this case the
284 * uid/gid in the log record will be a log centric FUID.
286 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
287 * may contain attributes, ACL and optional fuid information.
289 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
290 * an ACL and normal users/groups in the ACEs.
292 * There may be an optional xvattr attribute information similar
293 * to zfs_log_setattr.
295 * Also, after the file name "domain" strings may be appended.
/*
 * Build the itx for a create-type transaction and hand it to
 * zil_itx_assign().  After the fixed record header the variable part is
 * written in this order by the code below:
 *   1. xvattr data (xvatsize bytes) when ATTR_XVATTR is set in va_mask,
 *   2. the ACL entries, advanced by ZIL_ACE_LENGTH(aclsize),
 *   3. FUID ids, then FUID domain strings,
 *   4. the NUL-terminated file name (namesize bytes).
 * The test at the top covers replay and the case where dzp's xattr owner
 * is unlinked.
 */
298 zfs_log_create(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
299 znode_t
*dzp
, znode_t
*zp
, const char *name
, vsecattr_t
*vsecp
,
300 zfs_fuid_info_t
*fuidp
, vattr_t
*vap
)
304 lr_acl_create_t
*lracl
= NULL
;
309 xvattr_t
*xvap
= (xvattr_t
*)vap
;
310 size_t namesize
= strlen(name
) + 1;
313 if (zil_replaying(zilog
, tx
) || zfs_xattr_owner_unlinked(dzp
))
317 * If we have FUIDs present then add in space for
318 * domains and ACE fuid's if any.
321 fuidsz
+= fuidp
->z_domain_str_sz
;
322 fuidsz
+= fuidp
->z_fuid_cnt
* sizeof (uint64_t);
325 if (vap
->va_mask
& ATTR_XVATTR
)
326 xvatsize
= ZIL_XVAT_SIZE(xvap
->xva_mapsize
);
/*
 * The five plain types use lr_create_t; every other type is sized and
 * laid out as lr_acl_create_t with extra room for the ACEs.
 */
328 if ((int)txtype
== TX_CREATE_ATTR
|| (int)txtype
== TX_MKDIR_ATTR
||
329 (int)txtype
== TX_CREATE
|| (int)txtype
== TX_MKDIR
||
330 (int)txtype
== TX_MKXATTR
) {
331 txsize
= sizeof (lr_create_t
) + namesize
+ fuidsz
+ xvatsize
;
332 itx
= zil_itx_create(txtype
, txsize
);
333 lr_create_t
*lrc
= (lr_create_t
*)&itx
->itx_lr
;
334 lrdata
= &lrc
->lr_data
[0];
337 sizeof (lr_acl_create_t
) + namesize
+ fuidsz
+
338 ZIL_ACE_LENGTH(aclsize
) + xvatsize
;
339 itx
= zil_itx_create(txtype
, txsize
);
340 lracl
= (lr_acl_create_t
*)&itx
->itx_lr
;
341 lrdata
= &lracl
->lr_data
[0];
345 lr
= (_lr_create_t
*)&itx
->itx_lr
;
346 lr
->lr_doid
= dzp
->z_id
;
347 lr
->lr_foid
= zp
->z_id
;
348 /* Store dnode slot count in 8 bits above object id. */
349 LR_FOID_SET_SLOTS(lr
->lr_foid
, zp
->z_dnodesize
>> DNODE_SHIFT
);
350 lr
->lr_mode
= zp
->z_mode
;
/*
 * Non-ephemeral uid/gid values are logged directly; for ephemeral ones
 * the owner/group FUID recorded in fuidp is logged instead.
 */
351 if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp
)))) {
352 lr
->lr_uid
= (uint64_t)KUID_TO_SUID(ZTOUID(zp
));
354 lr
->lr_uid
= fuidp
->z_fuid_owner
;
356 if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp
)))) {
357 lr
->lr_gid
= (uint64_t)KGID_TO_SGID(ZTOGID(zp
));
359 lr
->lr_gid
= fuidp
->z_fuid_group
;
361 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &lr
->lr_gen
,
363 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(zp
)),
364 lr
->lr_crtime
, sizeof (uint64_t) * 2);
366 if (sa_lookup(zp
->z_sa_hdl
, SA_ZPL_RDEV(ZTOZSB(zp
)), &lr
->lr_rdev
,
367 sizeof (lr
->lr_rdev
)) != 0)
371 * Fill in xvattr info if any
373 if (vap
->va_mask
& ATTR_XVATTR
) {
374 zfs_log_xvattr((lr_attr_t
*)lrdata
, xvap
);
375 lrdata
= &lrdata
[xvatsize
];
378 /* Now fill in any ACL info */
381 ASSERT3P(lracl
, !=, NULL
);
382 lracl
->lr_aclcnt
= vsecp
->vsa_aclcnt
;
383 lracl
->lr_acl_bytes
= aclsize
;
384 lracl
->lr_domcnt
= fuidp
? fuidp
->z_domain_cnt
: 0;
385 lracl
->lr_fuidcnt
= fuidp
? fuidp
->z_fuid_cnt
: 0;
/*
 * NOTE(review): this tests vsa_aclflags against VSA_ACE_ACLFLAGS, while
 * zfs_log_acl() tests vsa_mask against the same constant -- confirm
 * which field is intended here.
 */
386 if (vsecp
->vsa_aclflags
& VSA_ACE_ACLFLAGS
)
387 lracl
->lr_acl_flags
= (uint64_t)vsecp
->vsa_aclflags
;
389 lracl
->lr_acl_flags
= 0;
391 memcpy(lrdata
, vsecp
->vsa_aclentp
, aclsize
);
392 lrdata
= &lrdata
[ZIL_ACE_LENGTH(aclsize
)];
395 /* drop in FUID info */
397 lrdata
= zfs_log_fuid_ids(fuidp
, lrdata
);
398 lrdata
= zfs_log_fuid_domains(fuidp
, lrdata
);
401 * Now place file name in log record
403 memcpy(lrdata
, name
, namesize
);
405 zil_itx_assign(zilog
, itx
, tx
);
409 * Handles both TX_REMOVE and TX_RMDIR transactions.
/*
 * Log the removal of 'name' from directory dzp.  The record is an
 * lr_remove_t (lr_doid = directory id) followed by the NUL-terminated
 * name.  The test at the top covers replay and the case where dzp's xattr
 * owner is unlinked.
 * NOTE(review): the zil_remove_async() call is presumably guarded by
 * 'unlinked' on a line not visible in this excerpt; the ASSERT shows that
 * only TX_REMOVE (TX_CI masked off) is expected on that path.
 */
412 zfs_log_remove(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
413 znode_t
*dzp
, const char *name
, uint64_t foid
, boolean_t unlinked
)
417 size_t namesize
= strlen(name
) + 1;
419 if (zil_replaying(zilog
, tx
) || zfs_xattr_owner_unlinked(dzp
))
422 itx
= zil_itx_create(txtype
, sizeof (*lr
) + namesize
);
423 lr
= (lr_remove_t
*)&itx
->itx_lr
;
424 lr
->lr_doid
= dzp
->z_id
;
425 memcpy(&lr
->lr_data
[0], name
, namesize
);
430 * Object ids can be re-instantiated in the next txg so
431 * remove any async transactions to avoid future leaks.
432 * This can happen if a fsync occurs on the re-instantiated
433 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
434 * the new file data and flushes a write record for the old object.
437 ASSERT((txtype
& ~TX_CI
) == TX_REMOVE
);
438 zil_remove_async(zilog
, foid
);
440 zil_itx_assign(zilog
, itx
, tx
);
444 * Handles TX_LINK transactions.
/*
 * Log a link of zp into directory dzp under 'name'.  The record is an
 * lr_link_t with lr_doid set to the directory id and lr_link_obj set to
 * the linked object's id, followed by the NUL-terminated name.
 */
447 zfs_log_link(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
448 znode_t
*dzp
, znode_t
*zp
, const char *name
)
452 size_t namesize
= strlen(name
) + 1;
454 if (zil_replaying(zilog
, tx
))
457 itx
= zil_itx_create(txtype
, sizeof (*lr
) + namesize
);
458 lr
= (lr_link_t
*)&itx
->itx_lr
;
459 lr
->lr_doid
= dzp
->z_id
;
460 lr
->lr_link_obj
= zp
->z_id
;
461 memcpy(&lr
->lr_data
[0], name
, namesize
);
463 zil_itx_assign(zilog
, itx
, tx
);
467 * Handles TX_SYMLINK transactions.
/*
 * Log the creation of symlink zp in directory dzp.  The record is an
 * lr_create_t followed by two NUL-terminated strings: 'name' at
 * lr_data[0] and the link target at lr_data[namesize].
 * uid and gid are stored straight from the znode; gen and crtime are read
 * with sa_lookup() and the lookup results are deliberately ignored.
 */
470 zfs_log_symlink(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
471 znode_t
*dzp
, znode_t
*zp
, const char *name
, const char *link
)
476 size_t namesize
= strlen(name
) + 1;
477 size_t linksize
= strlen(link
) + 1;
479 if (zil_replaying(zilog
, tx
))
482 itx
= zil_itx_create(txtype
, sizeof (*lrc
) + namesize
+ linksize
);
483 lrc
= (lr_create_t
*)&itx
->itx_lr
;
484 lr
= &lrc
->lr_create
;
485 lr
->lr_doid
= dzp
->z_id
;
486 lr
->lr_foid
= zp
->z_id
;
487 lr
->lr_uid
= KUID_TO_SUID(ZTOUID(zp
));
488 lr
->lr_gid
= KGID_TO_SGID(ZTOGID(zp
));
489 lr
->lr_mode
= zp
->z_mode
;
490 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &lr
->lr_gen
,
492 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(zp
)),
493 lr
->lr_crtime
, sizeof (uint64_t) * 2);
494 memcpy(&lrc
->lr_data
[0], name
, namesize
);
495 memcpy(&lrc
->lr_data
[namesize
], link
, linksize
);
497 zil_itx_assign(zilog
, itx
, tx
);
/*
 * Shared body for rename records.  Stores the source and target directory
 * ids (lr_sdoid, lr_tdoid), then the source and destination names back to
 * back, each with its terminating NUL.  itx_oid is set to the id of the
 * object being renamed (szp).
 */
501 do_zfs_log_rename(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
, znode_t
*sdzp
,
502 const char *sname
, znode_t
*tdzp
, const char *dname
, znode_t
*szp
)
507 size_t snamesize
= strlen(sname
) + 1;
508 size_t dnamesize
= strlen(dname
) + 1;
510 if (zil_replaying(zilog
, tx
))
513 itx
= zil_itx_create(txtype
, sizeof (*lr
) + snamesize
+ dnamesize
);
514 lrr
= (lr_rename_t
*)&itx
->itx_lr
;
515 lr
= &lrr
->lr_rename
;
516 lr
->lr_sdoid
= sdzp
->z_id
;
517 lr
->lr_tdoid
= tdzp
->z_id
;
518 memcpy(&lrr
->lr_data
[0], sname
, snamesize
);
519 memcpy(&lrr
->lr_data
[snamesize
], dname
, dnamesize
);
520 itx
->itx_oid
= szp
->z_id
;
522 zil_itx_assign(zilog
, itx
, tx
);
526 * Handles TX_RENAME transactions.
/*
 * Delegates to do_zfs_log_rename() with the same argument list.
 * NOTE(review): any adjustment of txtype before the call would be on a
 * line not visible in this excerpt -- confirm against the full file.
 */
529 zfs_log_rename(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
, znode_t
*sdzp
,
530 const char *sname
, znode_t
*tdzp
, const char *dname
, znode_t
*szp
)
533 do_zfs_log_rename(zilog
, tx
, txtype
, sdzp
, sname
, tdzp
, dname
, szp
);
537 * Handles TX_RENAME_EXCHANGE transactions.
/*
 * ORs TX_RENAME_EXCHANGE into txtype and then delegates to
 * do_zfs_log_rename() with the remaining arguments unchanged.
 */
540 zfs_log_rename_exchange(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
541 znode_t
*sdzp
, const char *sname
, znode_t
*tdzp
, const char *dname
,
544 txtype
|= TX_RENAME_EXCHANGE
;
545 do_zfs_log_rename(zilog
, tx
, txtype
, sdzp
, sname
, tdzp
, dname
, szp
);
549 * Handles TX_RENAME_WHITEOUT transactions.
551 * Unfortunately we cannot reuse do_zfs_log_rename because we need to call
552 * zfs_mknode() on replay which requires stashing bits as with TX_CREATE.
/*
 * Log a rename that also creates a whiteout node (wzp).  TX_RENAME_WHITEOUT
 * is OR'd into txtype.  The record is an lr_rename_whiteout_t holding:
 *   - the rename part: source/target directory ids in lr_rename,
 *   - the whiteout part: object id with dnode slot count, gen, crtime,
 *     mode, uid, gid and rdev of wzp (the lr_w* fields),
 * followed by sname and dname back to back, each with its terminating NUL.
 * itx_oid is set to the id of the renamed object (szp).  The sa_lookup()
 * results are deliberately ignored.
 */
555 zfs_log_rename_whiteout(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
556 znode_t
*sdzp
, const char *sname
, znode_t
*tdzp
, const char *dname
,
557 znode_t
*szp
, znode_t
*wzp
)
560 lr_rename_whiteout_t
*lr
;
561 size_t snamesize
= strlen(sname
) + 1;
562 size_t dnamesize
= strlen(dname
) + 1;
564 if (zil_replaying(zilog
, tx
))
567 txtype
|= TX_RENAME_WHITEOUT
;
568 itx
= zil_itx_create(txtype
, sizeof (*lr
) + snamesize
+ dnamesize
);
569 lr
= (lr_rename_whiteout_t
*)&itx
->itx_lr
;
570 lr
->lr_rename
.lr_sdoid
= sdzp
->z_id
;
571 lr
->lr_rename
.lr_tdoid
= tdzp
->z_id
;
574 * RENAME_WHITEOUT will create an entry at the source znode, so we need
575 * to store the same data that the equivalent call to zfs_log_create()
578 lr
->lr_wfoid
= wzp
->z_id
;
579 LR_FOID_SET_SLOTS(lr
->lr_wfoid
, wzp
->z_dnodesize
>> DNODE_SHIFT
);
580 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(wzp
)), &lr
->lr_wgen
,
582 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(wzp
)),
583 lr
->lr_wcrtime
, sizeof (uint64_t) * 2);
584 lr
->lr_wmode
= wzp
->z_mode
;
585 lr
->lr_wuid
= (uint64_t)KUID_TO_SUID(ZTOUID(wzp
));
586 lr
->lr_wgid
= (uint64_t)KGID_TO_SGID(ZTOGID(wzp
));
589 * This rdev will always be makdevice(0, 0) but because the ZIL log and
590 * replay code needs to be platform independent (and there is no
591 * platform independent makdev()) we need to copy the one created
592 * during the rename operation.
594 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_RDEV(ZTOZSB(wzp
)), &lr
->lr_wrdev
,
595 sizeof (lr
->lr_wrdev
));
597 memcpy(&lr
->lr_data
[0], sname
, snamesize
);
598 memcpy(&lr
->lr_data
[snamesize
], dname
, dnamesize
);
599 itx
->itx_oid
= szp
->z_id
;
601 zil_itx_assign(zilog
, itx
, tx
);
605 * zfs_log_write() handles TX_WRITE transactions. The specified callback is
606 * called as soon as the write is on stable storage (be it via a DMU sync or a
/*
 * Size threshold in bytes (default 32768).  zfs_log_write() selects
 * WR_INDIRECT for a write of at least this size when the pool has no
 * slogs.  Exported as a module parameter at the end of this file.
 */
609 static int64_t zfs_immediate_write_sz
= 32768;
/*
 * Log a write of 'resid' bytes at offset 'off' in zp as TX_WRITE itx(s).
 * 'callback' (with 'callback_data') is attached to the itx; if nothing is
 * logged it is invoked directly instead.
 * NOTE(review): the loop that emits several itxs for one call and the
 * remaining lr_write_t field assignments are on lines not visible in this
 * excerpt.
 */
612 zfs_log_write(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
613 znode_t
*zp
, offset_t off
, ssize_t resid
, boolean_t commit
,
614 boolean_t o_direct
, zil_callback_t callback
, void *callback_data
)
616 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)sa_get_db(zp
->z_sa_hdl
);
617 uint32_t blocksize
= zp
->z_blksz
;
618 itx_wr_state_t write_state
;
620 ssize_t size
= resid
;
/*
 * Replay, an unlinked file, or an unlinked xattr owner: do not log, but
 * still run the caller's callback when one was given.
 */
622 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
||
623 zfs_xattr_owner_unlinked(zp
)) {
624 if (callback
!= NULL
)
625 callback(callback_data
);
/*
 * Initial write state: WR_INDIRECT for throughput logbias or o_direct,
 * and for writes of at least zfs_immediate_write_sz when the pool has no
 * slogs; otherwise WR_COPIED or WR_NEED_COPY (the test choosing between
 * those two is on a line not visible here).
 */
629 if (zilog
->zl_logbias
== ZFS_LOGBIAS_THROUGHPUT
|| o_direct
)
630 write_state
= WR_INDIRECT
;
631 else if (!spa_has_slogs(zilog
->zl_spa
) &&
632 resid
>= zfs_immediate_write_sz
)
633 write_state
= WR_INDIRECT
;
635 write_state
= WR_COPIED
;
637 write_state
= WR_NEED_COPY
;
639 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &gen
,
645 itx_wr_state_t wr_state
= write_state
;
649 * A WR_COPIED record must fit entirely in one log block.
650 * Large writes can use WR_NEED_COPY, which the ZIL will
651 * split into multiple records across several log blocks
654 if (wr_state
== WR_COPIED
&&
655 resid
> zil_max_copied_data(zilog
))
656 wr_state
= WR_NEED_COPY
;
657 else if (wr_state
== WR_INDIRECT
)
658 len
= MIN(blocksize
- P2PHASE(off
, blocksize
), resid
);
660 itx
= zil_itx_create(txtype
, sizeof (*lr
) +
661 (wr_state
== WR_COPIED
? len
: 0));
662 lr
= (lr_write_t
*)&itx
->itx_lr
;
665 * For WR_COPIED records, copy the data into the lr_write_t.
667 if (wr_state
== WR_COPIED
) {
670 err
= dmu_read_by_dnode(DB_DNODE(db
), off
, len
,
671 &lr
->lr_data
[0], DMU_READ_NO_PREFETCH
);
/*
 * Fallback (presumably taken when the read above fails -- the test is
 * not visible here): discard the itx and recreate it without inline
 * data as WR_NEED_COPY.
 */
674 zil_itx_destroy(itx
);
675 itx
= zil_itx_create(txtype
, sizeof (*lr
));
676 lr
= (lr_write_t
*)&itx
->itx_lr
;
677 wr_state
= WR_NEED_COPY
;
681 itx
->itx_wr_state
= wr_state
;
682 lr
->lr_foid
= zp
->z_id
;
686 BP_ZERO(&lr
->lr_blkptr
);
688 itx
->itx_private
= ZTOZSB(zp
);
689 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
692 itx
->itx_callback
= callback
;
693 itx
->itx_callback_data
= callback_data
;
694 zil_itx_assign(zilog
, itx
, tx
);
/*
 * The original request size is reported to dsl_pool_wrlog_count() only
 * when the initially chosen state was WR_COPIED or WR_NEED_COPY.
 */
700 if (write_state
== WR_COPIED
|| write_state
== WR_NEED_COPY
) {
701 dsl_pool_wrlog_count(zilog
->zl_dmu_pool
, size
, tx
->tx_txg
);
706 * Handles TX_TRUNCATE transactions.
/*
 * Log a truncate of zp.  The record is a fixed-size lr_truncate_t with
 * lr_foid set to the object id; itx_sync is set when the znode has a
 * non-zero z_sync_cnt.  The test at the top covers replay, an unlinked
 * file, and an unlinked xattr owner.
 * NOTE(review): the assignments that store 'off' and 'len' are on lines
 * not visible in this excerpt.
 */
709 zfs_log_truncate(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
710 znode_t
*zp
, uint64_t off
, uint64_t len
)
715 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
||
716 zfs_xattr_owner_unlinked(zp
))
719 itx
= zil_itx_create(txtype
, sizeof (*lr
));
720 lr
= (lr_truncate_t
*)&itx
->itx_lr
;
721 lr
->lr_foid
= zp
->z_id
;
725 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
726 zil_itx_assign(zilog
, itx
, tx
);
730 * Handles TX_SETATTR transactions.
/*
 * Log an attribute change on zp.  The record is an lr_setattr_t followed
 * by optional xvattr data (ZIL_XVAT_SIZE(xva_mapsize) bytes, present when
 * ATTR_XVATTR is set in va_mask) and then the FUID domain strings.
 * mask_applied is stored in lr_mask.  For uid/gid, the FUID from fuidp is
 * logged when the corresponding ATTR_UID/ATTR_GID bit is in mask_applied
 * and the value is ephemeral; otherwise the value from vap is logged.
 */
733 zfs_log_setattr(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
734 znode_t
*zp
, vattr_t
*vap
, uint_t mask_applied
, zfs_fuid_info_t
*fuidp
)
738 xvattr_t
*xvap
= (xvattr_t
*)vap
;
739 size_t recsize
= sizeof (lr_setattr_t
);
742 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
746 * If XVATTR set, then log record size needs to allow
747 * for lr_attr_t + xvattr mask, mapsize and create time
748 * plus actual attribute values
750 if (vap
->va_mask
& ATTR_XVATTR
)
751 recsize
= sizeof (*lr
) + ZIL_XVAT_SIZE(xvap
->xva_mapsize
);
754 recsize
+= fuidp
->z_domain_str_sz
;
756 itx
= zil_itx_create(txtype
, recsize
);
757 lr
= (lr_setattr_t
*)&itx
->itx_lr
;
758 lr
->lr_foid
= zp
->z_id
;
759 lr
->lr_mask
= (uint64_t)mask_applied
;
760 lr
->lr_mode
= (uint64_t)vap
->va_mode
;
761 if ((mask_applied
& ATTR_UID
) && IS_EPHEMERAL(vap
->va_uid
))
762 lr
->lr_uid
= fuidp
->z_fuid_owner
;
764 lr
->lr_uid
= (uint64_t)vap
->va_uid
;
766 if ((mask_applied
& ATTR_GID
) && IS_EPHEMERAL(vap
->va_gid
))
767 lr
->lr_gid
= fuidp
->z_fuid_group
;
769 lr
->lr_gid
= (uint64_t)vap
->va_gid
;
771 lr
->lr_size
= (uint64_t)vap
->va_size
;
772 ZFS_TIME_ENCODE(&vap
->va_atime
, lr
->lr_atime
);
773 ZFS_TIME_ENCODE(&vap
->va_mtime
, lr
->lr_mtime
);
/* 'start' is the write cursor into the variable-length tail. */
774 start
= &lr
->lr_data
[0];
775 if (vap
->va_mask
& ATTR_XVATTR
) {
776 zfs_log_xvattr((lr_attr_t
*)start
, xvap
);
777 start
= &lr
->lr_data
[ZIL_XVAT_SIZE(xvap
->xva_mapsize
)];
781 * Now stick on domain information if any on end
785 (void) zfs_log_fuid_domains(fuidp
, start
);
787 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
788 zil_itx_assign(zilog
, itx
, tx
);
792 * Handles TX_SETSAXATTR transactions.
/*
 * Log setting an xattr on zp.  The record is an lr_setsaxattr_t followed
 * by the NUL-terminated name (namelen bytes) and then 'size' bytes of
 * value starting at lr_data[namelen].
 * NOTE(review): any NULL checks on 'name' / 'value' and the assignment of
 * the stored size are on lines not visible in this excerpt.
 */
795 zfs_log_setsaxattr(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
796 znode_t
*zp
, const char *name
, const void *value
, size_t size
)
800 size_t recsize
= sizeof (lr_setsaxattr_t
);
803 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
806 namelen
= strlen(name
) + 1;
807 recsize
+= (namelen
+ size
);
808 itx
= zil_itx_create(txtype
, recsize
);
809 lr
= (lr_setsaxattr_t
*)&itx
->itx_lr
;
810 lr
->lr_foid
= zp
->z_id
;
811 memcpy(&lr
->lr_data
[0], name
, namelen
);
813 memcpy(&lr
->lr_data
[namelen
], value
, size
);
819 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
820 zil_itx_assign(zilog
, itx
, tx
);
824 * Handles TX_ACL transactions.
/*
 * Log an ACL change on zp.  The record type is chosen from the
 * filesystem's z_version relative to ZPL_VERSION_FUID, and the two types
 * are sized differently: TX_ACL uses sizeof (lr_acl_t) with the ACEs
 * padded to ZIL_ACE_LENGTH(aclbytes), TX_ACL_V0 uses sizeof (lr_acl_v0_t)
 * with the raw aclbytes.  Both sizes add room for the FUID ids and domain
 * strings when fuidp is non-NULL.
 * For TX_ACL the header also records acl byte count, domain/fuid counts
 * and acl flags (the flags only when VSA_ACE_ACLFLAGS is set in vsa_mask,
 * else 0).
 */
827 zfs_log_acl(zilog_t
*zilog
, dmu_tx_t
*tx
, znode_t
*zp
,
828 vsecattr_t
*vsecp
, zfs_fuid_info_t
*fuidp
)
836 size_t aclbytes
= vsecp
->vsa_aclentsz
;
838 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
841 txtype
= (ZTOZSB(zp
)->z_version
< ZPL_VERSION_FUID
) ?
844 if (txtype
== TX_ACL
)
845 lrsize
= sizeof (*lr
);
847 lrsize
= sizeof (*lrv0
);
850 ((txtype
== TX_ACL
) ? ZIL_ACE_LENGTH(aclbytes
) : aclbytes
) +
851 (fuidp
? fuidp
->z_domain_str_sz
: 0) +
852 sizeof (uint64_t) * (fuidp
? fuidp
->z_fuid_cnt
: 0);
854 itx
= zil_itx_create(txtype
, txsize
);
856 lr
= (lr_acl_t
*)&itx
->itx_lr
;
857 lr
->lr_foid
= zp
->z_id
;
858 if (txtype
== TX_ACL
) {
859 lr
->lr_acl_bytes
= aclbytes
;
860 lr
->lr_domcnt
= fuidp
? fuidp
->z_domain_cnt
: 0;
861 lr
->lr_fuidcnt
= fuidp
? fuidp
->z_fuid_cnt
: 0;
862 if (vsecp
->vsa_mask
& VSA_ACE_ACLFLAGS
)
863 lr
->lr_acl_flags
= (uint64_t)vsecp
->vsa_aclflags
;
865 lr
->lr_acl_flags
= 0;
867 lr
->lr_aclcnt
= (uint64_t)vsecp
->vsa_aclcnt
;
/*
 * V0 records carry only the raw ACEs; the other branch pads the ACEs to
 * ZIL_ACE_LENGTH and appends FUID ids followed by domain strings.
 */
869 if (txtype
== TX_ACL_V0
) {
870 lrv0
= (lr_acl_v0_t
*)lr
;
871 memcpy(&lrv0
->lr_data
[0], vsecp
->vsa_aclentp
, aclbytes
);
873 uint8_t *start
= &lr
->lr_data
[0];
875 memcpy(start
, vsecp
->vsa_aclentp
, aclbytes
);
877 start
= &lr
->lr_data
[ZIL_ACE_LENGTH(aclbytes
)];
880 start
= zfs_log_fuid_ids(fuidp
, start
);
881 (void) zfs_log_fuid_domains(fuidp
, start
);
885 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
886 zil_itx_assign(zilog
, itx
, tx
);
890 * Handles TX_CLONE_RANGE transactions.
/*
 * Log a block-clone of a range of zp.  Each record is an lr_clone_range_t
 * followed by partnbps block pointers copied from bps.  partnbps is
 * limited to max_log_data / sizeof (bps[0]); partlen is the byte length
 * that many blocks cover (partnbps * blksz), clamped to len.
 * NOTE(review): the loop header and the statements that advance
 * bps/nbps/off/len between records are on lines not visible in this
 * excerpt; the trailing ASSERTs show nbps and len are expected to stay
 * >= the part just logged.
 */
893 zfs_log_clone_range(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
, znode_t
*zp
,
894 uint64_t off
, uint64_t len
, uint64_t blksz
, const blkptr_t
*bps
,
898 lr_clone_range_t
*lr
;
899 uint64_t partlen
, max_log_data
;
902 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
905 max_log_data
= zil_max_log_data(zilog
, sizeof (lr_clone_range_t
));
908 partnbps
= MIN(nbps
, max_log_data
/ sizeof (bps
[0]));
909 partlen
= partnbps
* blksz
;
910 ASSERT3U(partlen
, <, len
+ blksz
);
911 partlen
= MIN(partlen
, len
);
913 itx
= zil_itx_create(txtype
,
914 sizeof (*lr
) + sizeof (bps
[0]) * partnbps
);
915 lr
= (lr_clone_range_t
*)&itx
->itx_lr
;
916 lr
->lr_foid
= zp
->z_id
;
918 lr
->lr_length
= partlen
;
919 lr
->lr_blksz
= blksz
;
920 lr
->lr_nbps
= partnbps
;
921 memcpy(lr
->lr_bps
, bps
, sizeof (bps
[0]) * partnbps
);
923 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
925 zil_itx_assign(zilog
, itx
, tx
);
928 ASSERT3U(nbps
, >=, partnbps
);
931 ASSERT3U(len
, >=, partlen
);
/*
 * Register zfs_immediate_write_sz (prefix "zfs_" + "immediate_write_sz")
 * as a module parameter of kind S64 with ZMOD_RW access.
 */
936 ZFS_MODULE_PARAM(zfs
, zfs_
, immediate_write_sz
, S64
, ZMOD_RW
,
937 "Largest data block to write to zil");