Linux 6.14-rc1
[linux.git] / fs / xfs / xfs_inode_item_recover.c
blobf3bfb814378c066b26f16e3803de13515b750fd3
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_trace.h"
17 #include "xfs_trans_priv.h"
18 #include "xfs_buf_item.h"
19 #include "xfs_log.h"
20 #include "xfs_error.h"
21 #include "xfs_log_priv.h"
22 #include "xfs_log_recover.h"
23 #include "xfs_icache.h"
24 #include "xfs_bmap_btree.h"
25 #include "xfs_rtrmap_btree.h"
26 #include "xfs_rtrefcount_btree.h"
28 STATIC void
29 xlog_recover_inode_ra_pass2(
30 struct xlog *log,
31 struct xlog_recover_item *item)
33 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
34 struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
36 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
37 &xfs_inode_buf_ra_ops);
38 } else {
39 struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
41 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
42 &xfs_inode_buf_ra_ops);
47 * Inode fork owner changes
49 * If we have been told that we have to reparent the inode fork, it's because an
50 * extent swap operation on a CRC enabled filesystem has been done and we are
51 * replaying it. We need to walk the BMBT of the appropriate fork and change the
52 * owners of it.
54 * The complexity here is that we don't have an inode context to work with, so
55 * after we've replayed the inode we need to instantiate one. This is where the
56 * fun begins.
58 * We are in the middle of log recovery, so we can't run transactions. That
59 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
60 * that will result in the corresponding iput() running the inode through
61 * xfs_inactive(). If we've just replayed an inode core that changes the link
62 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
63 * transactions (bad!).
65 * So, to avoid this, we instantiate an inode directly from the inode core we've
66 * just recovered. We have the buffer still locked, and all we really need to
67 * instantiate is the inode core and the forks being modified. We can do this
68 * manually, then run the inode btree owner change, and then tear down the
69 * xfs_inode without having to run any transactions at all.
71 * Also, because we don't have a transaction context available here but need to
72 * gather all the buffers we modify for writeback so we pass the buffer_list
73 * instead for the operation to use.
76 STATIC int
77 xfs_recover_inode_owner_change(
78 struct xfs_mount *mp,
79 struct xfs_dinode *dip,
80 struct xfs_inode_log_format *in_f,
81 struct list_head *buffer_list)
83 struct xfs_inode *ip;
84 int error;
86 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
88 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
89 if (!ip)
90 return -ENOMEM;
92 /* instantiate the inode */
93 ASSERT(dip->di_version >= 3);
95 error = xfs_inode_from_disk(ip, dip);
96 if (error)
97 goto out_free_ip;
99 if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
100 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
101 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
102 ip->i_ino, buffer_list);
103 if (error)
104 goto out_free_ip;
107 if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
108 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
109 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
110 ip->i_ino, buffer_list);
111 if (error)
112 goto out_free_ip;
115 out_free_ip:
116 xfs_inode_free(ip);
117 return error;
120 static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
122 return ld->di_version >= 3 &&
123 (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
126 /* Convert a log timestamp to an ondisk timestamp. */
127 static inline xfs_timestamp_t
128 xfs_log_dinode_to_disk_ts(
129 struct xfs_log_dinode *from,
130 const xfs_log_timestamp_t its)
132 struct xfs_legacy_timestamp *lts;
133 struct xfs_log_legacy_timestamp *lits;
134 xfs_timestamp_t ts;
136 if (xfs_log_dinode_has_bigtime(from))
137 return cpu_to_be64(its);
139 lts = (struct xfs_legacy_timestamp *)&ts;
140 lits = (struct xfs_log_legacy_timestamp *)&its;
141 lts->t_sec = cpu_to_be32(lits->t_sec);
142 lts->t_nsec = cpu_to_be32(lits->t_nsec);
144 return ts;
147 static inline bool xfs_log_dinode_has_large_extent_counts(
148 const struct xfs_log_dinode *ld)
150 return ld->di_version >= 3 &&
151 (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
154 static inline void
155 xfs_log_dinode_to_disk_iext_counters(
156 struct xfs_log_dinode *from,
157 struct xfs_dinode *to)
159 if (xfs_log_dinode_has_large_extent_counts(from)) {
160 to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
161 to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
162 to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
163 } else {
164 to->di_nextents = cpu_to_be32(from->di_nextents);
165 to->di_anextents = cpu_to_be16(from->di_anextents);
170 STATIC void
171 xfs_log_dinode_to_disk(
172 struct xfs_log_dinode *from,
173 struct xfs_dinode *to,
174 xfs_lsn_t lsn)
176 to->di_magic = cpu_to_be16(from->di_magic);
177 to->di_mode = cpu_to_be16(from->di_mode);
178 to->di_version = from->di_version;
179 to->di_format = from->di_format;
180 to->di_metatype = cpu_to_be16(from->di_metatype);
181 to->di_uid = cpu_to_be32(from->di_uid);
182 to->di_gid = cpu_to_be32(from->di_gid);
183 to->di_nlink = cpu_to_be32(from->di_nlink);
184 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
185 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
187 to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
188 to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
189 to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
191 to->di_size = cpu_to_be64(from->di_size);
192 to->di_nblocks = cpu_to_be64(from->di_nblocks);
193 to->di_extsize = cpu_to_be32(from->di_extsize);
194 to->di_forkoff = from->di_forkoff;
195 to->di_aformat = from->di_aformat;
196 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
197 to->di_dmstate = cpu_to_be16(from->di_dmstate);
198 to->di_flags = cpu_to_be16(from->di_flags);
199 to->di_gen = cpu_to_be32(from->di_gen);
201 if (from->di_version == 3) {
202 to->di_changecount = cpu_to_be64(from->di_changecount);
203 to->di_crtime = xfs_log_dinode_to_disk_ts(from,
204 from->di_crtime);
205 to->di_flags2 = cpu_to_be64(from->di_flags2);
206 to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
207 to->di_ino = cpu_to_be64(from->di_ino);
208 to->di_lsn = cpu_to_be64(lsn);
209 memset(to->di_pad2, 0, sizeof(to->di_pad2));
210 uuid_copy(&to->di_uuid, &from->di_uuid);
211 to->di_v3_pad = 0;
212 } else {
213 to->di_flushiter = cpu_to_be16(from->di_flushiter);
214 memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
217 xfs_log_dinode_to_disk_iext_counters(from, to);
220 STATIC int
221 xlog_dinode_verify_extent_counts(
222 struct xfs_mount *mp,
223 struct xfs_log_dinode *ldip)
225 xfs_extnum_t nextents;
226 xfs_aextnum_t anextents;
228 if (xfs_log_dinode_has_large_extent_counts(ldip)) {
229 if (!xfs_has_large_extent_counts(mp) ||
230 (ldip->di_nrext64_pad != 0)) {
231 XFS_CORRUPTION_ERROR(
232 "Bad log dinode large extent count format",
233 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
234 xfs_alert(mp,
235 "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
236 ldip->di_ino, xfs_has_large_extent_counts(mp),
237 ldip->di_nrext64_pad);
238 return -EFSCORRUPTED;
241 nextents = ldip->di_big_nextents;
242 anextents = ldip->di_big_anextents;
243 } else {
244 if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
245 XFS_CORRUPTION_ERROR(
246 "Bad log dinode di_v3_pad",
247 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
248 xfs_alert(mp,
249 "Bad inode 0x%llx, di_v3_pad 0x%llx",
250 ldip->di_ino, ldip->di_v3_pad);
251 return -EFSCORRUPTED;
254 nextents = ldip->di_nextents;
255 anextents = ldip->di_anextents;
258 if (unlikely(nextents + anextents > ldip->di_nblocks)) {
259 XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
260 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
261 xfs_alert(mp,
262 "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
263 ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
264 anextents, ldip->di_nblocks);
265 return -EFSCORRUPTED;
268 return 0;
271 static inline int
272 xlog_recover_inode_dbroot(
273 struct xfs_mount *mp,
274 void *src,
275 unsigned int len,
276 struct xfs_dinode *dip)
278 void *dfork = XFS_DFORK_DPTR(dip);
279 unsigned int dsize = XFS_DFORK_DSIZE(dip, mp);
281 switch (dip->di_format) {
282 case XFS_DINODE_FMT_BTREE:
283 xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize);
284 break;
285 case XFS_DINODE_FMT_META_BTREE:
286 switch (be16_to_cpu(dip->di_metatype)) {
287 case XFS_METAFILE_RTRMAP:
288 xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize);
289 return 0;
290 case XFS_METAFILE_RTREFCOUNT:
291 xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize);
292 return 0;
293 default:
294 ASSERT(0);
295 return -EFSCORRUPTED;
297 break;
298 default:
299 ASSERT(0);
300 return -EFSCORRUPTED;
303 return 0;
306 STATIC int
307 xlog_recover_inode_commit_pass2(
308 struct xlog *log,
309 struct list_head *buffer_list,
310 struct xlog_recover_item *item,
311 xfs_lsn_t current_lsn)
313 struct xfs_inode_log_format *in_f;
314 struct xfs_mount *mp = log->l_mp;
315 struct xfs_buf *bp;
316 struct xfs_dinode *dip;
317 int len;
318 char *src;
319 char *dest;
320 int error;
321 int attr_index;
322 uint fields;
323 struct xfs_log_dinode *ldip;
324 uint isize;
325 int need_free = 0;
326 xfs_failaddr_t fa;
328 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
329 in_f = item->ri_buf[0].i_addr;
330 } else {
331 in_f = kmalloc(sizeof(struct xfs_inode_log_format),
332 GFP_KERNEL | __GFP_NOFAIL);
333 need_free = 1;
334 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
335 if (error)
336 goto error;
340 * Inode buffers can be freed, look out for it,
341 * and do not replay the inode.
343 if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
344 error = 0;
345 trace_xfs_log_recover_inode_cancel(log, in_f);
346 goto error;
348 trace_xfs_log_recover_inode_recover(log, in_f);
350 error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
351 0, &bp, &xfs_inode_buf_ops);
352 if (error)
353 goto error;
354 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
355 dip = xfs_buf_offset(bp, in_f->ilf_boffset);
358 * Make sure the place we're flushing out to really looks
359 * like an inode!
361 if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
362 xfs_alert(mp,
363 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld",
364 __func__, dip, bp, in_f->ilf_ino);
365 error = -EFSCORRUPTED;
366 goto out_release;
368 ldip = item->ri_buf[1].i_addr;
369 if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
370 xfs_alert(mp,
371 "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
372 __func__, item, in_f->ilf_ino);
373 error = -EFSCORRUPTED;
374 goto out_release;
378 * If the inode has an LSN in it, recover the inode only if the on-disk
379 * inode's LSN is older than the lsn of the transaction we are
380 * replaying. We can have multiple checkpoints with the same start LSN,
381 * so the current LSN being equal to the on-disk LSN doesn't necessarily
382 * mean that the on-disk inode is more recent than the change being
383 * replayed.
385 * We must check the current_lsn against the on-disk inode
386 * here because the we can't trust the log dinode to contain a valid LSN
387 * (see comment below before replaying the log dinode for details).
389 * Note: we still need to replay an owner change even though the inode
390 * is more recent than the transaction as there is no guarantee that all
391 * the btree blocks are more recent than this transaction, too.
393 if (dip->di_version >= 3) {
394 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
396 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
397 trace_xfs_log_recover_inode_skip(log, in_f);
398 error = 0;
399 goto out_owner_change;
404 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
405 * are transactional and if ordering is necessary we can determine that
406 * more accurately by the LSN field in the V3 inode core. Don't trust
407 * the inode versions we might be changing them here - use the
408 * superblock flag to determine whether we need to look at di_flushiter
409 * to skip replay when the on disk inode is newer than the log one
411 if (!xfs_has_v3inodes(mp)) {
412 if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
414 * Deal with the wrap case, DI_MAX_FLUSH is less
415 * than smaller numbers
417 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
418 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
419 /* do nothing */
420 } else {
421 trace_xfs_log_recover_inode_skip(log, in_f);
422 error = 0;
423 goto out_release;
427 /* Take the opportunity to reset the flush iteration count */
428 ldip->di_flushiter = 0;
432 if (unlikely(S_ISREG(ldip->di_mode))) {
433 if (ldip->di_format != XFS_DINODE_FMT_EXTENTS &&
434 ldip->di_format != XFS_DINODE_FMT_BTREE &&
435 ldip->di_format != XFS_DINODE_FMT_META_BTREE) {
436 XFS_CORRUPTION_ERROR(
437 "Bad log dinode data fork format for regular file",
438 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
439 xfs_alert(mp,
440 "Bad inode 0x%llx, data fork format 0x%x",
441 in_f->ilf_ino, ldip->di_format);
442 error = -EFSCORRUPTED;
443 goto out_release;
445 } else if (unlikely(S_ISDIR(ldip->di_mode))) {
446 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
447 (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
448 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
449 XFS_CORRUPTION_ERROR(
450 "Bad log dinode data fork format for directory",
451 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
452 xfs_alert(mp,
453 "Bad inode 0x%llx, data fork format 0x%x",
454 in_f->ilf_ino, ldip->di_format);
455 error = -EFSCORRUPTED;
456 goto out_release;
460 error = xlog_dinode_verify_extent_counts(mp, ldip);
461 if (error)
462 goto out_release;
464 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
465 XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
466 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
467 xfs_alert(mp,
468 "Bad inode 0x%llx, di_forkoff 0x%x",
469 in_f->ilf_ino, ldip->di_forkoff);
470 error = -EFSCORRUPTED;
471 goto out_release;
473 isize = xfs_log_dinode_size(mp);
474 if (unlikely(item->ri_buf[1].i_len > isize)) {
475 XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
476 mp, ldip, sizeof(*ldip));
477 xfs_alert(mp,
478 "Bad inode 0x%llx log dinode size 0x%x",
479 in_f->ilf_ino, item->ri_buf[1].i_len);
480 error = -EFSCORRUPTED;
481 goto out_release;
485 * Recover the log dinode inode into the on disk inode.
487 * The LSN in the log dinode is garbage - it can be zero or reflect
488 * stale in-memory runtime state that isn't coherent with the changes
489 * logged in this transaction or the changes written to the on-disk
490 * inode. Hence we write the current lSN into the inode because that
491 * matches what xfs_iflush() would write inode the inode when flushing
492 * the changes in this transaction.
494 xfs_log_dinode_to_disk(ldip, dip, current_lsn);
496 fields = in_f->ilf_fields;
497 if (fields & XFS_ILOG_DEV)
498 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
500 if (in_f->ilf_size == 2)
501 goto out_owner_change;
502 len = item->ri_buf[2].i_len;
503 src = item->ri_buf[2].i_addr;
504 ASSERT(in_f->ilf_size <= 4);
505 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
506 ASSERT(!(fields & XFS_ILOG_DFORK) ||
507 (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
509 switch (fields & XFS_ILOG_DFORK) {
510 case XFS_ILOG_DDATA:
511 case XFS_ILOG_DEXT:
512 memcpy(XFS_DFORK_DPTR(dip), src, len);
513 break;
515 case XFS_ILOG_DBROOT:
516 error = xlog_recover_inode_dbroot(mp, src, len, dip);
517 if (error)
518 goto out_release;
519 break;
521 default:
523 * There are no data fork flags set.
525 ASSERT((fields & XFS_ILOG_DFORK) == 0);
526 break;
530 * If we logged any attribute data, recover it. There may or
531 * may not have been any other non-core data logged in this
532 * transaction.
534 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
535 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
536 attr_index = 3;
537 } else {
538 attr_index = 2;
540 len = item->ri_buf[attr_index].i_len;
541 src = item->ri_buf[attr_index].i_addr;
542 ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
544 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
545 case XFS_ILOG_ADATA:
546 case XFS_ILOG_AEXT:
547 dest = XFS_DFORK_APTR(dip);
548 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
549 memcpy(dest, src, len);
550 break;
552 case XFS_ILOG_ABROOT:
553 dest = XFS_DFORK_APTR(dip);
554 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
555 len, (struct xfs_bmdr_block *)dest,
556 XFS_DFORK_ASIZE(dip, mp));
557 break;
559 default:
560 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
561 ASSERT(0);
562 error = -EFSCORRUPTED;
563 goto out_release;
567 out_owner_change:
568 /* Recover the swapext owner change unless inode has been deleted */
569 if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
570 (dip->di_mode != 0))
571 error = xfs_recover_inode_owner_change(mp, dip, in_f,
572 buffer_list);
573 /* re-generate the checksum and validate the recovered inode. */
574 xfs_dinode_calc_crc(log->l_mp, dip);
575 fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
576 if (fa) {
577 XFS_CORRUPTION_ERROR(
578 "Bad dinode after recovery",
579 XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
580 xfs_alert(mp,
581 "Metadata corruption detected at %pS, inode 0x%llx",
582 fa, in_f->ilf_ino);
583 error = -EFSCORRUPTED;
584 goto out_release;
587 ASSERT(bp->b_mount == mp);
588 bp->b_flags |= _XBF_LOGRECOVERY;
589 xfs_buf_delwri_queue(bp, buffer_list);
591 out_release:
592 xfs_buf_relse(bp);
593 error:
594 if (need_free)
595 kfree(in_f);
596 return error;
599 const struct xlog_recover_item_ops xlog_inode_item_ops = {
600 .item_type = XFS_LI_INODE,
601 .ra_pass2 = xlog_recover_inode_ra_pass2,
602 .commit_pass2 = xlog_recover_inode_commit_pass2,