4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2022 by Pawel Jakub Dawidek
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/cmn_err.h>
33 #include <sys/thread.h>
36 #include <sys/zfs_znode.h>
37 #include <sys/zfs_dir.h>
39 #include <sys/zil_impl.h>
40 #include <sys/byteorder.h>
41 #include <sys/policy.h>
47 #include <sys/zfs_fuid.h>
48 #include <sys/dsl_dataset.h>
/*
 * These zfs_log_* functions must be called within a dmu tx, in one
 * of 2 contexts depending on zilog->z_replay:
 *
 * Non replay mode
 * ---------------
 * We need to record the transaction so that if it is committed to
 * the Intent Log then it can be replayed.  An intent log transaction
 * structure (itx_t) is allocated and all the information necessary to
 * possibly replay the transaction is saved in it. The itx is then assigned
 * a sequence number and inserted in the in-memory list anchored in the zilog.
 *
 * Replay mode
 * -----------
 * We need to mark the intent log record as replayed in the log header.
 * This is done in the same transaction as the replay so that they
 * commit atomically.
 */
70 zfs_log_create_txtype(zil_create_t type
, vsecattr_t
*vsecp
, vattr_t
*vap
)
72 int isxvattr
= (vap
->va_mask
& ATTR_XVATTR
);
75 if (vsecp
== NULL
&& !isxvattr
)
77 if (vsecp
&& isxvattr
)
78 return (TX_CREATE_ACL_ATTR
);
80 return (TX_CREATE_ACL
);
82 return (TX_CREATE_ATTR
);
84 if (vsecp
== NULL
&& !isxvattr
)
86 if (vsecp
&& isxvattr
)
87 return (TX_MKDIR_ACL_ATTR
);
89 return (TX_MKDIR_ACL
);
91 return (TX_MKDIR_ATTR
);
100 * build up the log data necessary for logging xvattr_t
101 * First lr_attr_t is initialized. following the lr_attr_t
102 * is the mapsize and attribute bitmap copied from the xvattr_t.
103 * Following the bitmap and bitmapsize two 64 bit words are reserved
104 * for the create time which may be set. Following the create time
105 * records a single 64 bit integer which has the bits to set on
106 * replay for the xvattr.
109 zfs_log_xvattr(lr_attr_t
*lrattr
, xvattr_t
*xvap
)
113 xoap
= xva_getxoptattr(xvap
);
116 lrattr
->lr_attr_masksize
= xvap
->xva_mapsize
;
117 uint32_t *bitmap
= &lrattr
->lr_attr_bitmap
;
118 for (int i
= 0; i
!= xvap
->xva_mapsize
; i
++, bitmap
++)
119 *bitmap
= xvap
->xva_reqattrmap
[i
];
121 lr_attr_end_t
*end
= (lr_attr_end_t
*)bitmap
;
122 end
->lr_attr_attrs
= 0;
123 end
->lr_attr_crtime
[0] = 0;
124 end
->lr_attr_crtime
[1] = 0;
125 memset(end
->lr_attr_scanstamp
, 0, AV_SCANSTAMP_SZ
);
127 if (XVA_ISSET_REQ(xvap
, XAT_READONLY
))
128 end
->lr_attr_attrs
|= (xoap
->xoa_readonly
== 0) ? 0 :
130 if (XVA_ISSET_REQ(xvap
, XAT_HIDDEN
))
131 end
->lr_attr_attrs
|= (xoap
->xoa_hidden
== 0) ? 0 :
133 if (XVA_ISSET_REQ(xvap
, XAT_SYSTEM
))
134 end
->lr_attr_attrs
|= (xoap
->xoa_system
== 0) ? 0 :
136 if (XVA_ISSET_REQ(xvap
, XAT_ARCHIVE
))
137 end
->lr_attr_attrs
|= (xoap
->xoa_archive
== 0) ? 0 :
139 if (XVA_ISSET_REQ(xvap
, XAT_IMMUTABLE
))
140 end
->lr_attr_attrs
|= (xoap
->xoa_immutable
== 0) ? 0 :
142 if (XVA_ISSET_REQ(xvap
, XAT_NOUNLINK
))
143 end
->lr_attr_attrs
|= (xoap
->xoa_nounlink
== 0) ? 0 :
145 if (XVA_ISSET_REQ(xvap
, XAT_APPENDONLY
))
146 end
->lr_attr_attrs
|= (xoap
->xoa_appendonly
== 0) ? 0 :
148 if (XVA_ISSET_REQ(xvap
, XAT_OPAQUE
))
149 end
->lr_attr_attrs
|= (xoap
->xoa_opaque
== 0) ? 0 :
151 if (XVA_ISSET_REQ(xvap
, XAT_NODUMP
))
152 end
->lr_attr_attrs
|= (xoap
->xoa_nodump
== 0) ? 0 :
154 if (XVA_ISSET_REQ(xvap
, XAT_AV_QUARANTINED
))
155 end
->lr_attr_attrs
|= (xoap
->xoa_av_quarantined
== 0) ? 0 :
157 if (XVA_ISSET_REQ(xvap
, XAT_AV_MODIFIED
))
158 end
->lr_attr_attrs
|= (xoap
->xoa_av_modified
== 0) ? 0 :
160 if (XVA_ISSET_REQ(xvap
, XAT_CREATETIME
))
161 ZFS_TIME_ENCODE(&xoap
->xoa_createtime
, end
->lr_attr_crtime
);
162 if (XVA_ISSET_REQ(xvap
, XAT_AV_SCANSTAMP
)) {
163 ASSERT(!XVA_ISSET_REQ(xvap
, XAT_PROJID
));
165 memcpy(end
->lr_attr_scanstamp
, xoap
->xoa_av_scanstamp
,
167 } else if (XVA_ISSET_REQ(xvap
, XAT_PROJID
)) {
169 * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
170 * at the same time, so we can share the same space.
172 memcpy(end
->lr_attr_scanstamp
, &xoap
->xoa_projid
,
175 if (XVA_ISSET_REQ(xvap
, XAT_REPARSE
))
176 end
->lr_attr_attrs
|= (xoap
->xoa_reparse
== 0) ? 0 :
178 if (XVA_ISSET_REQ(xvap
, XAT_OFFLINE
))
179 end
->lr_attr_attrs
|= (xoap
->xoa_offline
== 0) ? 0 :
181 if (XVA_ISSET_REQ(xvap
, XAT_SPARSE
))
182 end
->lr_attr_attrs
|= (xoap
->xoa_sparse
== 0) ? 0 :
184 if (XVA_ISSET_REQ(xvap
, XAT_PROJINHERIT
))
185 end
->lr_attr_attrs
|= (xoap
->xoa_projinherit
== 0) ? 0 :
190 zfs_log_fuid_ids(zfs_fuid_info_t
*fuidp
, void *start
)
193 uint64_t *fuidloc
= start
;
195 /* First copy in the ACE FUIDs */
196 for (zfuid
= list_head(&fuidp
->z_fuids
); zfuid
;
197 zfuid
= list_next(&fuidp
->z_fuids
, zfuid
)) {
198 *fuidloc
++ = zfuid
->z_logfuid
;
205 zfs_log_fuid_domains(zfs_fuid_info_t
*fuidp
, void *start
)
207 zfs_fuid_domain_t
*zdomain
;
209 /* now copy in the domain info, if any */
210 if (fuidp
->z_domain_str_sz
!= 0) {
211 for (zdomain
= list_head(&fuidp
->z_domains
); zdomain
;
212 zdomain
= list_next(&fuidp
->z_domains
, zdomain
)) {
213 memcpy(start
, zdomain
->z_domain
,
214 strlen(zdomain
->z_domain
) + 1);
215 start
= (caddr_t
)start
+
216 strlen(zdomain
->z_domain
) + 1;
223 * If zp is an xattr node, check whether the xattr owner is unlinked.
224 * We don't want to log anything if the owner is unlinked.
227 zfs_xattr_owner_unlinked(znode_t
*zp
)
235 * zrele drops the vnode lock which violates the VOP locking contract
236 * on FreeBSD. See comment at the top of zfs_replay.c for more detail.
239 * if zp is XATTR node, keep walking up via z_xattr_parent until we
242 while (tzp
->z_pflags
& ZFS_XATTR
) {
243 ASSERT3U(zp
->z_xattr_parent
, !=, 0);
244 if (zfs_zget(ZTOZSB(tzp
), tzp
->z_xattr_parent
, &dzp
) != 0) {
252 unlinked
= tzp
->z_unlinked
;
259 * if zp is XATTR node, keep walking up via z_xattr_parent until we
262 while (zp
->z_pflags
& ZFS_XATTR
) {
263 ASSERT3U(zp
->z_xattr_parent
, !=, 0);
264 if (zfs_zget(ZTOZSB(zp
), zp
->z_xattr_parent
, &dzp
) != 0) {
271 unlinked
= zp
->z_unlinked
;
279 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
280 * TK_MKXATTR transactions.
282 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
283 * domain information appended prior to the name. In this case the
284 * uid/gid in the log record will be a log centric FUID.
286 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
287 * may contain attributes, ACL and optional fuid information.
289 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
290 * and ACL and normal users/groups in the ACEs.
292 * There may be an optional xvattr attribute information similar
293 * to zfs_log_setattr.
295 * Also, after the file name "domain" strings may be appended.
298 zfs_log_create(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
299 znode_t
*dzp
, znode_t
*zp
, const char *name
, vsecattr_t
*vsecp
,
300 zfs_fuid_info_t
*fuidp
, vattr_t
*vap
)
304 lr_acl_create_t
*lracl
;
308 xvattr_t
*xvap
= (xvattr_t
*)vap
;
311 size_t namesize
= strlen(name
) + 1;
314 if (zil_replaying(zilog
, tx
) || zfs_xattr_owner_unlinked(dzp
))
318 * If we have FUIDs present then add in space for
319 * domains and ACE fuid's if any.
322 fuidsz
+= fuidp
->z_domain_str_sz
;
323 fuidsz
+= fuidp
->z_fuid_cnt
* sizeof (uint64_t);
326 if (vap
->va_mask
& ATTR_XVATTR
)
327 xvatsize
= ZIL_XVAT_SIZE(xvap
->xva_mapsize
);
329 if ((int)txtype
== TX_CREATE_ATTR
|| (int)txtype
== TX_MKDIR_ATTR
||
330 (int)txtype
== TX_CREATE
|| (int)txtype
== TX_MKDIR
||
331 (int)txtype
== TX_MKXATTR
) {
332 txsize
= sizeof (*lr
) + namesize
+ fuidsz
+ xvatsize
;
333 lrsize
= sizeof (*lr
);
336 sizeof (lr_acl_create_t
) + namesize
+ fuidsz
+
337 ZIL_ACE_LENGTH(aclsize
) + xvatsize
;
338 lrsize
= sizeof (lr_acl_create_t
);
341 itx
= zil_itx_create(txtype
, txsize
);
343 lr
= (lr_create_t
*)&itx
->itx_lr
;
344 lr
->lr_doid
= dzp
->z_id
;
345 lr
->lr_foid
= zp
->z_id
;
346 /* Store dnode slot count in 8 bits above object id. */
347 LR_FOID_SET_SLOTS(lr
->lr_foid
, zp
->z_dnodesize
>> DNODE_SHIFT
);
348 lr
->lr_mode
= zp
->z_mode
;
349 if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp
)))) {
350 lr
->lr_uid
= (uint64_t)KUID_TO_SUID(ZTOUID(zp
));
352 lr
->lr_uid
= fuidp
->z_fuid_owner
;
354 if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp
)))) {
355 lr
->lr_gid
= (uint64_t)KGID_TO_SGID(ZTOGID(zp
));
357 lr
->lr_gid
= fuidp
->z_fuid_group
;
359 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &lr
->lr_gen
,
361 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(zp
)),
362 lr
->lr_crtime
, sizeof (uint64_t) * 2);
364 if (sa_lookup(zp
->z_sa_hdl
, SA_ZPL_RDEV(ZTOZSB(zp
)), &lr
->lr_rdev
,
365 sizeof (lr
->lr_rdev
)) != 0)
369 * Fill in xvattr info if any
371 if (vap
->va_mask
& ATTR_XVATTR
) {
372 zfs_log_xvattr((lr_attr_t
*)((caddr_t
)lr
+ lrsize
), xvap
);
373 end
= (caddr_t
)lr
+ lrsize
+ xvatsize
;
375 end
= (caddr_t
)lr
+ lrsize
;
378 /* Now fill in any ACL info */
381 lracl
= (lr_acl_create_t
*)&itx
->itx_lr
;
382 lracl
->lr_aclcnt
= vsecp
->vsa_aclcnt
;
383 lracl
->lr_acl_bytes
= aclsize
;
384 lracl
->lr_domcnt
= fuidp
? fuidp
->z_domain_cnt
: 0;
385 lracl
->lr_fuidcnt
= fuidp
? fuidp
->z_fuid_cnt
: 0;
386 if (vsecp
->vsa_aclflags
& VSA_ACE_ACLFLAGS
)
387 lracl
->lr_acl_flags
= (uint64_t)vsecp
->vsa_aclflags
;
389 lracl
->lr_acl_flags
= 0;
391 memcpy(end
, vsecp
->vsa_aclentp
, aclsize
);
392 end
= (caddr_t
)end
+ ZIL_ACE_LENGTH(aclsize
);
395 /* drop in FUID info */
397 end
= zfs_log_fuid_ids(fuidp
, end
);
398 end
= zfs_log_fuid_domains(fuidp
, end
);
401 * Now place file name in log record
403 memcpy(end
, name
, namesize
);
405 zil_itx_assign(zilog
, itx
, tx
);
409 * Handles both TX_REMOVE and TX_RMDIR transactions.
412 zfs_log_remove(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
413 znode_t
*dzp
, const char *name
, uint64_t foid
, boolean_t unlinked
)
417 size_t namesize
= strlen(name
) + 1;
419 if (zil_replaying(zilog
, tx
) || zfs_xattr_owner_unlinked(dzp
))
422 itx
= zil_itx_create(txtype
, sizeof (*lr
) + namesize
);
423 lr
= (lr_remove_t
*)&itx
->itx_lr
;
424 lr
->lr_doid
= dzp
->z_id
;
425 memcpy(lr
+ 1, name
, namesize
);
430 * Object ids can be re-instantiated in the next txg so
431 * remove any async transactions to avoid future leaks.
432 * This can happen if a fsync occurs on the re-instantiated
433 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
434 * the new file data and flushes a write record for the old object.
437 ASSERT((txtype
& ~TX_CI
) == TX_REMOVE
);
438 zil_remove_async(zilog
, foid
);
440 zil_itx_assign(zilog
, itx
, tx
);
444 * Handles TX_LINK transactions.
447 zfs_log_link(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
448 znode_t
*dzp
, znode_t
*zp
, const char *name
)
452 size_t namesize
= strlen(name
) + 1;
454 if (zil_replaying(zilog
, tx
))
457 itx
= zil_itx_create(txtype
, sizeof (*lr
) + namesize
);
458 lr
= (lr_link_t
*)&itx
->itx_lr
;
459 lr
->lr_doid
= dzp
->z_id
;
460 lr
->lr_link_obj
= zp
->z_id
;
461 memcpy(lr
+ 1, name
, namesize
);
463 zil_itx_assign(zilog
, itx
, tx
);
467 * Handles TX_SYMLINK transactions.
470 zfs_log_symlink(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
471 znode_t
*dzp
, znode_t
*zp
, const char *name
, const char *link
)
475 size_t namesize
= strlen(name
) + 1;
476 size_t linksize
= strlen(link
) + 1;
478 if (zil_replaying(zilog
, tx
))
481 itx
= zil_itx_create(txtype
, sizeof (*lr
) + namesize
+ linksize
);
482 lr
= (lr_create_t
*)&itx
->itx_lr
;
483 lr
->lr_doid
= dzp
->z_id
;
484 lr
->lr_foid
= zp
->z_id
;
485 lr
->lr_uid
= KUID_TO_SUID(ZTOUID(zp
));
486 lr
->lr_gid
= KGID_TO_SGID(ZTOGID(zp
));
487 lr
->lr_mode
= zp
->z_mode
;
488 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &lr
->lr_gen
,
490 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(zp
)),
491 lr
->lr_crtime
, sizeof (uint64_t) * 2);
492 memcpy((char *)(lr
+ 1), name
, namesize
);
493 memcpy((char *)(lr
+ 1) + namesize
, link
, linksize
);
495 zil_itx_assign(zilog
, itx
, tx
);
499 do_zfs_log_rename(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
, znode_t
*sdzp
,
500 const char *sname
, znode_t
*tdzp
, const char *dname
, znode_t
*szp
)
504 size_t snamesize
= strlen(sname
) + 1;
505 size_t dnamesize
= strlen(dname
) + 1;
507 if (zil_replaying(zilog
, tx
))
510 itx
= zil_itx_create(txtype
, sizeof (*lr
) + snamesize
+ dnamesize
);
511 lr
= (lr_rename_t
*)&itx
->itx_lr
;
512 lr
->lr_sdoid
= sdzp
->z_id
;
513 lr
->lr_tdoid
= tdzp
->z_id
;
514 memcpy((char *)(lr
+ 1), sname
, snamesize
);
515 memcpy((char *)(lr
+ 1) + snamesize
, dname
, dnamesize
);
516 itx
->itx_oid
= szp
->z_id
;
518 zil_itx_assign(zilog
, itx
, tx
);
522 * Handles TX_RENAME transactions.
525 zfs_log_rename(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
, znode_t
*sdzp
,
526 const char *sname
, znode_t
*tdzp
, const char *dname
, znode_t
*szp
)
529 do_zfs_log_rename(zilog
, tx
, txtype
, sdzp
, sname
, tdzp
, dname
, szp
);
533 * Handles TX_RENAME_EXCHANGE transactions.
536 zfs_log_rename_exchange(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
537 znode_t
*sdzp
, const char *sname
, znode_t
*tdzp
, const char *dname
,
540 txtype
|= TX_RENAME_EXCHANGE
;
541 do_zfs_log_rename(zilog
, tx
, txtype
, sdzp
, sname
, tdzp
, dname
, szp
);
545 * Handles TX_RENAME_WHITEOUT transactions.
547 * Unfortunately we cannot reuse do_zfs_log_rename because we we need to call
548 * zfs_mknode() on replay which requires stashing bits as with TX_CREATE.
551 zfs_log_rename_whiteout(zilog_t
*zilog
, dmu_tx_t
*tx
, uint64_t txtype
,
552 znode_t
*sdzp
, const char *sname
, znode_t
*tdzp
, const char *dname
,
553 znode_t
*szp
, znode_t
*wzp
)
556 lr_rename_whiteout_t
*lr
;
557 size_t snamesize
= strlen(sname
) + 1;
558 size_t dnamesize
= strlen(dname
) + 1;
560 if (zil_replaying(zilog
, tx
))
563 txtype
|= TX_RENAME_WHITEOUT
;
564 itx
= zil_itx_create(txtype
, sizeof (*lr
) + snamesize
+ dnamesize
);
565 lr
= (lr_rename_whiteout_t
*)&itx
->itx_lr
;
566 lr
->lr_rename
.lr_sdoid
= sdzp
->z_id
;
567 lr
->lr_rename
.lr_tdoid
= tdzp
->z_id
;
570 * RENAME_WHITEOUT will create an entry at the source znode, so we need
571 * to store the same data that the equivalent call to zfs_log_create()
574 lr
->lr_wfoid
= wzp
->z_id
;
575 LR_FOID_SET_SLOTS(lr
->lr_wfoid
, wzp
->z_dnodesize
>> DNODE_SHIFT
);
576 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(wzp
)), &lr
->lr_wgen
,
578 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_CRTIME(ZTOZSB(wzp
)),
579 lr
->lr_wcrtime
, sizeof (uint64_t) * 2);
580 lr
->lr_wmode
= wzp
->z_mode
;
581 lr
->lr_wuid
= (uint64_t)KUID_TO_SUID(ZTOUID(wzp
));
582 lr
->lr_wgid
= (uint64_t)KGID_TO_SGID(ZTOGID(wzp
));
585 * This rdev will always be makdevice(0, 0) but because the ZIL log and
586 * replay code needs to be platform independent (and there is no
587 * platform independent makdev()) we need to copy the one created
588 * during the rename operation.
590 (void) sa_lookup(wzp
->z_sa_hdl
, SA_ZPL_RDEV(ZTOZSB(wzp
)), &lr
->lr_wrdev
,
591 sizeof (lr
->lr_wrdev
));
593 memcpy((char *)(lr
+ 1), sname
, snamesize
);
594 memcpy((char *)(lr
+ 1) + snamesize
, dname
, dnamesize
);
595 itx
->itx_oid
= szp
->z_id
;
597 zil_itx_assign(zilog
, itx
, tx
);
601 * zfs_log_write() handles TX_WRITE transactions. The specified callback is
602 * called as soon as the write is on stable storage (be it via a DMU sync or a
605 static int64_t zfs_immediate_write_sz
= 32768;
608 zfs_log_write(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
609 znode_t
*zp
, offset_t off
, ssize_t resid
, int ioflag
,
610 zil_callback_t callback
, void *callback_data
)
612 dmu_buf_impl_t
*db
= (dmu_buf_impl_t
*)sa_get_db(zp
->z_sa_hdl
);
613 uint32_t blocksize
= zp
->z_blksz
;
614 itx_wr_state_t write_state
;
617 ssize_t size
= resid
;
619 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
||
620 zfs_xattr_owner_unlinked(zp
)) {
621 if (callback
!= NULL
)
622 callback(callback_data
);
626 if (zilog
->zl_logbias
== ZFS_LOGBIAS_THROUGHPUT
)
627 write_state
= WR_INDIRECT
;
628 else if (!spa_has_slogs(zilog
->zl_spa
) &&
629 resid
>= zfs_immediate_write_sz
)
630 write_state
= WR_INDIRECT
;
631 else if (ioflag
& (O_SYNC
| O_DSYNC
))
632 write_state
= WR_COPIED
;
634 write_state
= WR_NEED_COPY
;
636 if ((fsync_cnt
= (uintptr_t)tsd_get(zfs_fsyncer_key
)) != 0) {
637 (void) tsd_set(zfs_fsyncer_key
, (void *)(fsync_cnt
- 1));
640 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(ZTOZSB(zp
)), &gen
,
646 itx_wr_state_t wr_state
= write_state
;
650 * A WR_COPIED record must fit entirely in one log block.
651 * Large writes can use WR_NEED_COPY, which the ZIL will
652 * split into multiple records across several log blocks
655 if (wr_state
== WR_COPIED
&&
656 resid
> zil_max_copied_data(zilog
))
657 wr_state
= WR_NEED_COPY
;
658 else if (wr_state
== WR_INDIRECT
)
659 len
= MIN(blocksize
- P2PHASE(off
, blocksize
), resid
);
661 itx
= zil_itx_create(txtype
, sizeof (*lr
) +
662 (wr_state
== WR_COPIED
? len
: 0));
663 lr
= (lr_write_t
*)&itx
->itx_lr
;
666 * For WR_COPIED records, copy the data into the lr_write_t.
668 if (wr_state
== WR_COPIED
) {
671 err
= dmu_read_by_dnode(DB_DNODE(db
), off
, len
, lr
+ 1,
672 DMU_READ_NO_PREFETCH
);
674 zil_itx_destroy(itx
);
675 itx
= zil_itx_create(txtype
, sizeof (*lr
));
676 lr
= (lr_write_t
*)&itx
->itx_lr
;
677 wr_state
= WR_NEED_COPY
;
682 itx
->itx_wr_state
= wr_state
;
683 lr
->lr_foid
= zp
->z_id
;
687 BP_ZERO(&lr
->lr_blkptr
);
689 itx
->itx_private
= ZTOZSB(zp
);
692 if (!(ioflag
& (O_SYNC
| O_DSYNC
)) && (zp
->z_sync_cnt
== 0) &&
694 itx
->itx_sync
= B_FALSE
;
696 itx
->itx_callback
= callback
;
697 itx
->itx_callback_data
= callback_data
;
698 zil_itx_assign(zilog
, itx
, tx
);
704 if (write_state
== WR_COPIED
|| write_state
== WR_NEED_COPY
) {
705 dsl_pool_wrlog_count(zilog
->zl_dmu_pool
, size
, tx
->tx_txg
);
710 * Handles TX_TRUNCATE transactions.
713 zfs_log_truncate(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
714 znode_t
*zp
, uint64_t off
, uint64_t len
)
719 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
||
720 zfs_xattr_owner_unlinked(zp
))
723 itx
= zil_itx_create(txtype
, sizeof (*lr
));
724 lr
= (lr_truncate_t
*)&itx
->itx_lr
;
725 lr
->lr_foid
= zp
->z_id
;
729 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
730 zil_itx_assign(zilog
, itx
, tx
);
734 * Handles TX_SETATTR transactions.
737 zfs_log_setattr(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
738 znode_t
*zp
, vattr_t
*vap
, uint_t mask_applied
, zfs_fuid_info_t
*fuidp
)
742 xvattr_t
*xvap
= (xvattr_t
*)vap
;
743 size_t recsize
= sizeof (lr_setattr_t
);
746 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
750 * If XVATTR set, then log record size needs to allow
751 * for lr_attr_t + xvattr mask, mapsize and create time
752 * plus actual attribute values
754 if (vap
->va_mask
& ATTR_XVATTR
)
755 recsize
= sizeof (*lr
) + ZIL_XVAT_SIZE(xvap
->xva_mapsize
);
758 recsize
+= fuidp
->z_domain_str_sz
;
760 itx
= zil_itx_create(txtype
, recsize
);
761 lr
= (lr_setattr_t
*)&itx
->itx_lr
;
762 lr
->lr_foid
= zp
->z_id
;
763 lr
->lr_mask
= (uint64_t)mask_applied
;
764 lr
->lr_mode
= (uint64_t)vap
->va_mode
;
765 if ((mask_applied
& ATTR_UID
) && IS_EPHEMERAL(vap
->va_uid
))
766 lr
->lr_uid
= fuidp
->z_fuid_owner
;
768 lr
->lr_uid
= (uint64_t)vap
->va_uid
;
770 if ((mask_applied
& ATTR_GID
) && IS_EPHEMERAL(vap
->va_gid
))
771 lr
->lr_gid
= fuidp
->z_fuid_group
;
773 lr
->lr_gid
= (uint64_t)vap
->va_gid
;
775 lr
->lr_size
= (uint64_t)vap
->va_size
;
776 ZFS_TIME_ENCODE(&vap
->va_atime
, lr
->lr_atime
);
777 ZFS_TIME_ENCODE(&vap
->va_mtime
, lr
->lr_mtime
);
778 start
= (lr_setattr_t
*)(lr
+ 1);
779 if (vap
->va_mask
& ATTR_XVATTR
) {
780 zfs_log_xvattr((lr_attr_t
*)start
, xvap
);
781 start
= (caddr_t
)start
+ ZIL_XVAT_SIZE(xvap
->xva_mapsize
);
785 * Now stick on domain information if any on end
789 (void) zfs_log_fuid_domains(fuidp
, start
);
791 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
792 zil_itx_assign(zilog
, itx
, tx
);
796 * Handles TX_SETSAXATTR transactions.
799 zfs_log_setsaxattr(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
,
800 znode_t
*zp
, const char *name
, const void *value
, size_t size
)
804 size_t recsize
= sizeof (lr_setsaxattr_t
);
808 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
811 namelen
= strlen(name
) + 1;
812 recsize
+= (namelen
+ size
);
813 itx
= zil_itx_create(txtype
, recsize
);
814 lr
= (lr_setsaxattr_t
*)&itx
->itx_lr
;
815 lr
->lr_foid
= zp
->z_id
;
816 xattrstart
= (char *)(lr
+ 1);
817 memcpy(xattrstart
, name
, namelen
);
819 memcpy((char *)xattrstart
+ namelen
, value
, size
);
825 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
826 zil_itx_assign(zilog
, itx
, tx
);
830 * Handles TX_ACL transactions.
833 zfs_log_acl(zilog_t
*zilog
, dmu_tx_t
*tx
, znode_t
*zp
,
834 vsecattr_t
*vsecp
, zfs_fuid_info_t
*fuidp
)
842 size_t aclbytes
= vsecp
->vsa_aclentsz
;
844 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
847 txtype
= (ZTOZSB(zp
)->z_version
< ZPL_VERSION_FUID
) ?
850 if (txtype
== TX_ACL
)
851 lrsize
= sizeof (*lr
);
853 lrsize
= sizeof (*lrv0
);
856 ((txtype
== TX_ACL
) ? ZIL_ACE_LENGTH(aclbytes
) : aclbytes
) +
857 (fuidp
? fuidp
->z_domain_str_sz
: 0) +
858 sizeof (uint64_t) * (fuidp
? fuidp
->z_fuid_cnt
: 0);
860 itx
= zil_itx_create(txtype
, txsize
);
862 lr
= (lr_acl_t
*)&itx
->itx_lr
;
863 lr
->lr_foid
= zp
->z_id
;
864 if (txtype
== TX_ACL
) {
865 lr
->lr_acl_bytes
= aclbytes
;
866 lr
->lr_domcnt
= fuidp
? fuidp
->z_domain_cnt
: 0;
867 lr
->lr_fuidcnt
= fuidp
? fuidp
->z_fuid_cnt
: 0;
868 if (vsecp
->vsa_mask
& VSA_ACE_ACLFLAGS
)
869 lr
->lr_acl_flags
= (uint64_t)vsecp
->vsa_aclflags
;
871 lr
->lr_acl_flags
= 0;
873 lr
->lr_aclcnt
= (uint64_t)vsecp
->vsa_aclcnt
;
875 if (txtype
== TX_ACL_V0
) {
876 lrv0
= (lr_acl_v0_t
*)lr
;
877 memcpy(lrv0
+ 1, vsecp
->vsa_aclentp
, aclbytes
);
879 void *start
= (ace_t
*)(lr
+ 1);
881 memcpy(start
, vsecp
->vsa_aclentp
, aclbytes
);
883 start
= (caddr_t
)start
+ ZIL_ACE_LENGTH(aclbytes
);
886 start
= zfs_log_fuid_ids(fuidp
, start
);
887 (void) zfs_log_fuid_domains(fuidp
, start
);
891 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
892 zil_itx_assign(zilog
, itx
, tx
);
896 * Handles TX_CLONE_RANGE transactions.
899 zfs_log_clone_range(zilog_t
*zilog
, dmu_tx_t
*tx
, int txtype
, znode_t
*zp
,
900 uint64_t off
, uint64_t len
, uint64_t blksz
, const blkptr_t
*bps
,
904 lr_clone_range_t
*lr
;
905 uint64_t partlen
, max_log_data
;
908 if (zil_replaying(zilog
, tx
) || zp
->z_unlinked
)
911 max_log_data
= zil_max_log_data(zilog
, sizeof (lr_clone_range_t
));
914 partnbps
= MIN(nbps
, max_log_data
/ sizeof (bps
[0]));
916 for (i
= 0; i
< partnbps
; i
++) {
917 partlen
+= BP_GET_LSIZE(&bps
[i
]);
919 partlen
= MIN(partlen
, len
);
921 itx
= zil_itx_create(txtype
,
922 sizeof (*lr
) + sizeof (bps
[0]) * partnbps
);
923 lr
= (lr_clone_range_t
*)&itx
->itx_lr
;
924 lr
->lr_foid
= zp
->z_id
;
926 lr
->lr_length
= partlen
;
927 lr
->lr_blksz
= blksz
;
928 lr
->lr_nbps
= partnbps
;
929 memcpy(lr
->lr_bps
, bps
, sizeof (bps
[0]) * partnbps
);
931 itx
->itx_sync
= (zp
->z_sync_cnt
!= 0);
933 zil_itx_assign(zilog
, itx
, tx
);
936 ASSERT3U(nbps
, >=, partnbps
);
939 ASSERT3U(len
, >=, partlen
);
/*
 * Tunable consulted by zfs_log_write(): when the pool has no slog devices,
 * writes of at least this many bytes are logged as WR_INDIRECT.
 */
ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW,
	"Largest data block to write to zil");