2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_mount.h"
25 #include "xfs_defer.h"
26 #include "xfs_inode.h"
27 #include "xfs_errortag.h"
28 #include "xfs_error.h"
29 #include "xfs_cksum.h"
30 #include "xfs_icache.h"
31 #include "xfs_trans.h"
32 #include "xfs_ialloc.h"
35 #include <linux/iversion.h>
38 * Check that none of the inodes in the buffer have a next
39 * unlinked field of 0.
51 j
= mp
->m_inode_cluster_size
>> mp
->m_sb
.sb_inodelog
;
53 for (i
= 0; i
< j
; i
++) {
54 dip
= xfs_buf_offset(bp
, i
* mp
->m_sb
.sb_inodesize
);
55 if (!dip
->di_next_unlinked
) {
57 "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
58 i
, (long long)bp
->b_bn
);
65 xfs_dinode_good_version(
69 if (xfs_sb_version_hascrc(&mp
->m_sb
))
72 return version
== 1 || version
== 2;
76 * If we are doing readahead on an inode buffer, we might be in log recovery
77 * reading an inode allocation buffer that hasn't yet been replayed, and hence
78 * has not had the inode cores stamped into it. Hence for readahead, the buffer
79 * may be potentially invalid.
81 * If the readahead buffer is invalid, we need to mark it with an error and
82 * clear the DONE status of the buffer so that a followup read will re-read it
83 * from disk. We don't report the error otherwise to avoid warnings during log
84 * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
85 * because all we want to do is say readahead failed; there is no-one to report
86 * the error to, so this will distinguish it from a non-ra verifier failure.
87 * Changes to this readahead error behaviour also need to be reflected in
88 * xfs_dquot_buf_readahead_verify().
95 struct xfs_mount
*mp
= bp
->b_target
->bt_mount
;
100 * Validate the magic number and version of every inode in the buffer
102 ni
= XFS_BB_TO_FSB(mp
, bp
->b_length
) * mp
->m_sb
.sb_inopblock
;
103 for (i
= 0; i
< ni
; i
++) {
107 dip
= xfs_buf_offset(bp
, (i
<< mp
->m_sb
.sb_inodelog
));
108 di_ok
= dip
->di_magic
== cpu_to_be16(XFS_DINODE_MAGIC
) &&
109 xfs_dinode_good_version(mp
, dip
->di_version
);
110 if (unlikely(XFS_TEST_ERROR(!di_ok
, mp
,
111 XFS_ERRTAG_ITOBP_INOTOBP
))) {
113 bp
->b_flags
&= ~XBF_DONE
;
114 xfs_buf_ioerror(bp
, -EIO
);
118 xfs_verifier_error(bp
, -EFSCORRUPTED
, __this_address
);
121 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
122 (unsigned long long)bp
->b_bn
, i
,
123 be16_to_cpu(dip
->di_magic
));
127 xfs_inobp_check(mp
, bp
);
132 xfs_inode_buf_read_verify(
135 xfs_inode_buf_verify(bp
, false);
139 xfs_inode_buf_readahead_verify(
142 xfs_inode_buf_verify(bp
, true);
146 xfs_inode_buf_write_verify(
149 xfs_inode_buf_verify(bp
, false);
152 const struct xfs_buf_ops xfs_inode_buf_ops
= {
154 .verify_read
= xfs_inode_buf_read_verify
,
155 .verify_write
= xfs_inode_buf_write_verify
,
158 const struct xfs_buf_ops xfs_inode_buf_ra_ops
= {
159 .name
= "xxfs_inode_ra",
160 .verify_read
= xfs_inode_buf_readahead_verify
,
161 .verify_write
= xfs_inode_buf_write_verify
,
166 * This routine is called to map an inode to the buffer containing the on-disk
167 * version of the inode. It returns a pointer to the buffer containing the
168 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
169 * pointer to the on-disk inode within that buffer.
171 * If a non-zero error is returned, then the contents of bpp and dipp are
176 struct xfs_mount
*mp
,
177 struct xfs_trans
*tp
,
178 struct xfs_imap
*imap
,
179 struct xfs_dinode
**dipp
,
180 struct xfs_buf
**bpp
,
187 buf_flags
|= XBF_UNMAPPED
;
188 error
= xfs_trans_read_buf(mp
, tp
, mp
->m_ddev_targp
, imap
->im_blkno
,
189 (int)imap
->im_len
, buf_flags
, &bp
,
192 if (error
== -EAGAIN
) {
193 ASSERT(buf_flags
& XBF_TRYLOCK
);
197 if (error
== -EFSCORRUPTED
&&
198 (iget_flags
& XFS_IGET_UNTRUSTED
))
201 xfs_warn(mp
, "%s: xfs_trans_read_buf() returned error %d.",
207 *dipp
= xfs_buf_offset(bp
, imap
->im_boffset
);
213 struct xfs_inode
*ip
,
214 struct xfs_dinode
*from
)
216 struct xfs_icdinode
*to
= &ip
->i_d
;
217 struct inode
*inode
= VFS_I(ip
);
221 * Convert v1 inodes immediately to v2 inode format as this is the
222 * minimum inode version format we support in the rest of the code.
224 to
->di_version
= from
->di_version
;
225 if (to
->di_version
== 1) {
226 set_nlink(inode
, be16_to_cpu(from
->di_onlink
));
227 to
->di_projid_lo
= 0;
228 to
->di_projid_hi
= 0;
231 set_nlink(inode
, be32_to_cpu(from
->di_nlink
));
232 to
->di_projid_lo
= be16_to_cpu(from
->di_projid_lo
);
233 to
->di_projid_hi
= be16_to_cpu(from
->di_projid_hi
);
236 to
->di_format
= from
->di_format
;
237 to
->di_uid
= be32_to_cpu(from
->di_uid
);
238 to
->di_gid
= be32_to_cpu(from
->di_gid
);
239 to
->di_flushiter
= be16_to_cpu(from
->di_flushiter
);
242 * Time is signed, so need to convert to signed 32 bit before
243 * storing in inode timestamp which may be 64 bit. Otherwise
244 * a time before epoch is converted to a time long after epoch
247 inode
->i_atime
.tv_sec
= (int)be32_to_cpu(from
->di_atime
.t_sec
);
248 inode
->i_atime
.tv_nsec
= (int)be32_to_cpu(from
->di_atime
.t_nsec
);
249 inode
->i_mtime
.tv_sec
= (int)be32_to_cpu(from
->di_mtime
.t_sec
);
250 inode
->i_mtime
.tv_nsec
= (int)be32_to_cpu(from
->di_mtime
.t_nsec
);
251 inode
->i_ctime
.tv_sec
= (int)be32_to_cpu(from
->di_ctime
.t_sec
);
252 inode
->i_ctime
.tv_nsec
= (int)be32_to_cpu(from
->di_ctime
.t_nsec
);
253 inode
->i_generation
= be32_to_cpu(from
->di_gen
);
254 inode
->i_mode
= be16_to_cpu(from
->di_mode
);
256 to
->di_size
= be64_to_cpu(from
->di_size
);
257 to
->di_nblocks
= be64_to_cpu(from
->di_nblocks
);
258 to
->di_extsize
= be32_to_cpu(from
->di_extsize
);
259 to
->di_nextents
= be32_to_cpu(from
->di_nextents
);
260 to
->di_anextents
= be16_to_cpu(from
->di_anextents
);
261 to
->di_forkoff
= from
->di_forkoff
;
262 to
->di_aformat
= from
->di_aformat
;
263 to
->di_dmevmask
= be32_to_cpu(from
->di_dmevmask
);
264 to
->di_dmstate
= be16_to_cpu(from
->di_dmstate
);
265 to
->di_flags
= be16_to_cpu(from
->di_flags
);
267 if (to
->di_version
== 3) {
268 inode_set_iversion_queried(inode
,
269 be64_to_cpu(from
->di_changecount
));
270 to
->di_crtime
.t_sec
= be32_to_cpu(from
->di_crtime
.t_sec
);
271 to
->di_crtime
.t_nsec
= be32_to_cpu(from
->di_crtime
.t_nsec
);
272 to
->di_flags2
= be64_to_cpu(from
->di_flags2
);
273 to
->di_cowextsize
= be32_to_cpu(from
->di_cowextsize
);
279 struct xfs_inode
*ip
,
280 struct xfs_dinode
*to
,
283 struct xfs_icdinode
*from
= &ip
->i_d
;
284 struct inode
*inode
= VFS_I(ip
);
286 to
->di_magic
= cpu_to_be16(XFS_DINODE_MAGIC
);
289 to
->di_version
= from
->di_version
;
290 to
->di_format
= from
->di_format
;
291 to
->di_uid
= cpu_to_be32(from
->di_uid
);
292 to
->di_gid
= cpu_to_be32(from
->di_gid
);
293 to
->di_projid_lo
= cpu_to_be16(from
->di_projid_lo
);
294 to
->di_projid_hi
= cpu_to_be16(from
->di_projid_hi
);
296 memset(to
->di_pad
, 0, sizeof(to
->di_pad
));
297 to
->di_atime
.t_sec
= cpu_to_be32(inode
->i_atime
.tv_sec
);
298 to
->di_atime
.t_nsec
= cpu_to_be32(inode
->i_atime
.tv_nsec
);
299 to
->di_mtime
.t_sec
= cpu_to_be32(inode
->i_mtime
.tv_sec
);
300 to
->di_mtime
.t_nsec
= cpu_to_be32(inode
->i_mtime
.tv_nsec
);
301 to
->di_ctime
.t_sec
= cpu_to_be32(inode
->i_ctime
.tv_sec
);
302 to
->di_ctime
.t_nsec
= cpu_to_be32(inode
->i_ctime
.tv_nsec
);
303 to
->di_nlink
= cpu_to_be32(inode
->i_nlink
);
304 to
->di_gen
= cpu_to_be32(inode
->i_generation
);
305 to
->di_mode
= cpu_to_be16(inode
->i_mode
);
307 to
->di_size
= cpu_to_be64(from
->di_size
);
308 to
->di_nblocks
= cpu_to_be64(from
->di_nblocks
);
309 to
->di_extsize
= cpu_to_be32(from
->di_extsize
);
310 to
->di_nextents
= cpu_to_be32(from
->di_nextents
);
311 to
->di_anextents
= cpu_to_be16(from
->di_anextents
);
312 to
->di_forkoff
= from
->di_forkoff
;
313 to
->di_aformat
= from
->di_aformat
;
314 to
->di_dmevmask
= cpu_to_be32(from
->di_dmevmask
);
315 to
->di_dmstate
= cpu_to_be16(from
->di_dmstate
);
316 to
->di_flags
= cpu_to_be16(from
->di_flags
);
318 if (from
->di_version
== 3) {
319 to
->di_changecount
= cpu_to_be64(inode_peek_iversion(inode
));
320 to
->di_crtime
.t_sec
= cpu_to_be32(from
->di_crtime
.t_sec
);
321 to
->di_crtime
.t_nsec
= cpu_to_be32(from
->di_crtime
.t_nsec
);
322 to
->di_flags2
= cpu_to_be64(from
->di_flags2
);
323 to
->di_cowextsize
= cpu_to_be32(from
->di_cowextsize
);
324 to
->di_ino
= cpu_to_be64(ip
->i_ino
);
325 to
->di_lsn
= cpu_to_be64(lsn
);
326 memset(to
->di_pad2
, 0, sizeof(to
->di_pad2
));
327 uuid_copy(&to
->di_uuid
, &ip
->i_mount
->m_sb
.sb_meta_uuid
);
328 to
->di_flushiter
= 0;
330 to
->di_flushiter
= cpu_to_be16(from
->di_flushiter
);
335 xfs_log_dinode_to_disk(
336 struct xfs_log_dinode
*from
,
337 struct xfs_dinode
*to
)
339 to
->di_magic
= cpu_to_be16(from
->di_magic
);
340 to
->di_mode
= cpu_to_be16(from
->di_mode
);
341 to
->di_version
= from
->di_version
;
342 to
->di_format
= from
->di_format
;
344 to
->di_uid
= cpu_to_be32(from
->di_uid
);
345 to
->di_gid
= cpu_to_be32(from
->di_gid
);
346 to
->di_nlink
= cpu_to_be32(from
->di_nlink
);
347 to
->di_projid_lo
= cpu_to_be16(from
->di_projid_lo
);
348 to
->di_projid_hi
= cpu_to_be16(from
->di_projid_hi
);
349 memcpy(to
->di_pad
, from
->di_pad
, sizeof(to
->di_pad
));
351 to
->di_atime
.t_sec
= cpu_to_be32(from
->di_atime
.t_sec
);
352 to
->di_atime
.t_nsec
= cpu_to_be32(from
->di_atime
.t_nsec
);
353 to
->di_mtime
.t_sec
= cpu_to_be32(from
->di_mtime
.t_sec
);
354 to
->di_mtime
.t_nsec
= cpu_to_be32(from
->di_mtime
.t_nsec
);
355 to
->di_ctime
.t_sec
= cpu_to_be32(from
->di_ctime
.t_sec
);
356 to
->di_ctime
.t_nsec
= cpu_to_be32(from
->di_ctime
.t_nsec
);
358 to
->di_size
= cpu_to_be64(from
->di_size
);
359 to
->di_nblocks
= cpu_to_be64(from
->di_nblocks
);
360 to
->di_extsize
= cpu_to_be32(from
->di_extsize
);
361 to
->di_nextents
= cpu_to_be32(from
->di_nextents
);
362 to
->di_anextents
= cpu_to_be16(from
->di_anextents
);
363 to
->di_forkoff
= from
->di_forkoff
;
364 to
->di_aformat
= from
->di_aformat
;
365 to
->di_dmevmask
= cpu_to_be32(from
->di_dmevmask
);
366 to
->di_dmstate
= cpu_to_be16(from
->di_dmstate
);
367 to
->di_flags
= cpu_to_be16(from
->di_flags
);
368 to
->di_gen
= cpu_to_be32(from
->di_gen
);
370 if (from
->di_version
== 3) {
371 to
->di_changecount
= cpu_to_be64(from
->di_changecount
);
372 to
->di_crtime
.t_sec
= cpu_to_be32(from
->di_crtime
.t_sec
);
373 to
->di_crtime
.t_nsec
= cpu_to_be32(from
->di_crtime
.t_nsec
);
374 to
->di_flags2
= cpu_to_be64(from
->di_flags2
);
375 to
->di_cowextsize
= cpu_to_be32(from
->di_cowextsize
);
376 to
->di_ino
= cpu_to_be64(from
->di_ino
);
377 to
->di_lsn
= cpu_to_be64(from
->di_lsn
);
378 memcpy(to
->di_pad2
, from
->di_pad2
, sizeof(to
->di_pad2
));
379 uuid_copy(&to
->di_uuid
, &from
->di_uuid
);
380 to
->di_flushiter
= 0;
382 to
->di_flushiter
= cpu_to_be16(from
->di_flushiter
);
388 struct xfs_mount
*mp
,
390 struct xfs_dinode
*dip
)
397 if (dip
->di_magic
!= cpu_to_be16(XFS_DINODE_MAGIC
))
398 return __this_address
;
400 /* Verify v3 integrity information first */
401 if (dip
->di_version
>= 3) {
402 if (!xfs_sb_version_hascrc(&mp
->m_sb
))
403 return __this_address
;
404 if (!xfs_verify_cksum((char *)dip
, mp
->m_sb
.sb_inodesize
,
406 return __this_address
;
407 if (be64_to_cpu(dip
->di_ino
) != ino
)
408 return __this_address
;
409 if (!uuid_equal(&dip
->di_uuid
, &mp
->m_sb
.sb_meta_uuid
))
410 return __this_address
;
413 /* don't allow invalid i_size */
414 di_size
= be64_to_cpu(dip
->di_size
);
415 if (di_size
& (1ULL << 63))
416 return __this_address
;
418 mode
= be16_to_cpu(dip
->di_mode
);
419 if (mode
&& xfs_mode_to_ftype(mode
) == XFS_DIR3_FT_UNKNOWN
)
420 return __this_address
;
422 /* No zero-length symlinks/dirs. */
423 if ((S_ISLNK(mode
) || S_ISDIR(mode
)) && di_size
== 0)
424 return __this_address
;
426 /* Fork checks carried over from xfs_iformat_fork */
428 be32_to_cpu(dip
->di_nextents
) + be16_to_cpu(dip
->di_anextents
) >
429 be64_to_cpu(dip
->di_nblocks
))
430 return __this_address
;
432 if (mode
&& XFS_DFORK_BOFF(dip
) > mp
->m_sb
.sb_inodesize
)
433 return __this_address
;
435 flags
= be16_to_cpu(dip
->di_flags
);
437 if (mode
&& (flags
& XFS_DIFLAG_REALTIME
) && !mp
->m_rtdev_targp
)
438 return __this_address
;
440 /* Do we have appropriate data fork formats for the mode? */
441 switch (mode
& S_IFMT
) {
446 if (dip
->di_format
!= XFS_DINODE_FMT_DEV
)
447 return __this_address
;
452 switch (dip
->di_format
) {
453 case XFS_DINODE_FMT_LOCAL
:
455 * no local regular files yet
458 return __this_address
;
459 if (di_size
> XFS_DFORK_DSIZE(dip
, mp
))
460 return __this_address
;
462 case XFS_DINODE_FMT_EXTENTS
:
463 case XFS_DINODE_FMT_BTREE
:
466 return __this_address
;
470 /* Uninitialized inode ok. */
473 return __this_address
;
476 if (XFS_DFORK_Q(dip
)) {
477 switch (dip
->di_aformat
) {
478 case XFS_DINODE_FMT_LOCAL
:
479 case XFS_DINODE_FMT_EXTENTS
:
480 case XFS_DINODE_FMT_BTREE
:
483 return __this_address
;
487 /* only version 3 or greater inodes are extensively verified here */
488 if (dip
->di_version
< 3)
491 flags2
= be64_to_cpu(dip
->di_flags2
);
493 /* don't allow reflink/cowextsize if we don't have reflink */
494 if ((flags2
& (XFS_DIFLAG2_REFLINK
| XFS_DIFLAG2_COWEXTSIZE
)) &&
495 !xfs_sb_version_hasreflink(&mp
->m_sb
))
496 return __this_address
;
498 /* only regular files get reflink */
499 if ((flags2
& XFS_DIFLAG2_REFLINK
) && (mode
& S_IFMT
) != S_IFREG
)
500 return __this_address
;
502 /* don't let reflink and realtime mix */
503 if ((flags2
& XFS_DIFLAG2_REFLINK
) && (flags
& XFS_DIFLAG_REALTIME
))
504 return __this_address
;
506 /* don't let reflink and dax mix */
507 if ((flags2
& XFS_DIFLAG2_REFLINK
) && (flags2
& XFS_DIFLAG2_DAX
))
508 return __this_address
;
515 struct xfs_mount
*mp
,
516 struct xfs_dinode
*dip
)
520 if (dip
->di_version
< 3)
523 ASSERT(xfs_sb_version_hascrc(&mp
->m_sb
));
524 crc
= xfs_start_cksum_update((char *)dip
, mp
->m_sb
.sb_inodesize
,
526 dip
->di_crc
= xfs_end_cksum(crc
);
530 * Read the disk inode attributes into the in-core inode structure.
532 * For version 5 superblocks, if we are initialising a new inode and we are not
533 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
534 * inode core with a random generation number. If we are keeping inodes around,
535 * we need to read the inode cluster to get the existing generation number off
536 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
537 * format) then log recovery is dependent on the di_flushiter field being
538 * initialised from the current on-disk value and hence we must also read the
554 * Fill in the location information in the in-core inode.
556 error
= xfs_imap(mp
, tp
, ip
->i_ino
, &ip
->i_imap
, iget_flags
);
560 /* shortcut IO on inode allocation if possible */
561 if ((iget_flags
& XFS_IGET_CREATE
) &&
562 xfs_sb_version_hascrc(&mp
->m_sb
) &&
563 !(mp
->m_flags
& XFS_MOUNT_IKEEP
)) {
564 /* initialise the on-disk inode core */
565 memset(&ip
->i_d
, 0, sizeof(ip
->i_d
));
566 VFS_I(ip
)->i_generation
= prandom_u32();
567 if (xfs_sb_version_hascrc(&mp
->m_sb
))
568 ip
->i_d
.di_version
= 3;
570 ip
->i_d
.di_version
= 2;
575 * Get pointers to the on-disk inode and the buffer containing it.
577 error
= xfs_imap_to_bp(mp
, tp
, &ip
->i_imap
, &dip
, &bp
, 0, iget_flags
);
581 /* even unallocated inodes are verified */
582 fa
= xfs_dinode_verify(mp
, ip
->i_ino
, dip
);
584 xfs_inode_verifier_error(ip
, -EFSCORRUPTED
, "dinode", dip
,
586 error
= -EFSCORRUPTED
;
591 * If the on-disk inode is already linked to a directory
592 * entry, copy all of the inode into the in-core inode.
593 * xfs_iformat_fork() handles copying in the inode format
594 * specific information.
595 * Otherwise, just get the truly permanent information.
598 xfs_inode_from_disk(ip
, dip
);
599 error
= xfs_iformat_fork(ip
, dip
);
602 xfs_alert(mp
, "%s: xfs_iformat() returned error %d",
609 * Partial initialisation of the in-core inode. Just the bits
610 * that xfs_ialloc won't overwrite or relies on being correct.
612 ip
->i_d
.di_version
= dip
->di_version
;
613 VFS_I(ip
)->i_generation
= be32_to_cpu(dip
->di_gen
);
614 ip
->i_d
.di_flushiter
= be16_to_cpu(dip
->di_flushiter
);
617 * Make sure to pull in the mode here as well in
618 * case the inode is released without being used.
619 * This ensures that xfs_inactive() will see that
620 * the inode is already free and not try to mess
621 * with the uninitialized part of it.
623 VFS_I(ip
)->i_mode
= 0;
626 ASSERT(ip
->i_d
.di_version
>= 2);
627 ip
->i_delayed_blks
= 0;
630 * Mark the buffer containing the inode as something to keep
631 * around for a while. This helps to keep recently accessed
632 * meta-data in-core longer.
634 xfs_buf_set_ref(bp
, XFS_INO_REF
);
637 * Use xfs_trans_brelse() to release the buffer containing the on-disk
638 * inode, because it was acquired with xfs_trans_read_buf() in
639 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
640 * brelse(). If we're within a transaction, then xfs_trans_brelse()
641 * will only release the buffer if it is not dirty within the
642 * transaction. It will be OK to release the buffer in this case,
643 * because inodes on disk are never destroyed and we will be locking the
644 * new in-core inode before putting it in the cache where other
645 * processes can find it. Thus we don't have to worry about the inode
646 * being changed just because we released the buffer.
649 xfs_trans_brelse(tp
, bp
);