1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
6 #include <linux/iversion.h>
10 #include "xfs_shared.h"
11 #include "xfs_format.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
20 #include "xfs_trans_space.h"
21 #include "xfs_trans.h"
22 #include "xfs_buf_item.h"
23 #include "xfs_inode_item.h"
24 #include "xfs_iunlink_item.h"
25 #include "xfs_ialloc.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_errortag.h"
29 #include "xfs_error.h"
30 #include "xfs_quota.h"
31 #include "xfs_filestream.h"
32 #include "xfs_trace.h"
33 #include "xfs_icache.h"
34 #include "xfs_symlink.h"
35 #include "xfs_trans_priv.h"
37 #include "xfs_bmap_btree.h"
38 #include "xfs_reflink.h"
40 #include "xfs_log_priv.h"
41 #include "xfs_health.h"
43 #include "xfs_parent.h"
44 #include "xfs_xattr.h"
45 #include "xfs_inode_util.h"
46 #include "xfs_metafile.h"
48 struct kmem_cache
*xfs_inode_cache
;
51 * These two are wrapper routines around the xfs_ilock() routine used to
52 * centralize some grungy code. They are used in places that wish to lock the
53 * inode solely for reading the extents. The reason these places can't just
54 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
55 * bringing in of the extents from disk for a file in b-tree format. If the
56 * inode is in b-tree format, then we need to lock the inode exclusively until
57 * the extents are read in. Locking it exclusively all the time would limit
58 * our parallelism unnecessarily, though. What we do instead is check to see
59 * if the extents have been read in yet, and only lock the inode exclusively
62 * The functions return a value which should be given to the corresponding
66 xfs_ilock_data_map_shared(
69 uint lock_mode
= XFS_ILOCK_SHARED
;
71 if (xfs_need_iread_extents(&ip
->i_df
))
72 lock_mode
= XFS_ILOCK_EXCL
;
73 xfs_ilock(ip
, lock_mode
);
78 xfs_ilock_attr_map_shared(
81 uint lock_mode
= XFS_ILOCK_SHARED
;
83 if (xfs_inode_has_attr_fork(ip
) && xfs_need_iread_extents(&ip
->i_af
))
84 lock_mode
= XFS_ILOCK_EXCL
;
85 xfs_ilock(ip
, lock_mode
);
90 * You can't set both SHARED and EXCL for the same lock,
91 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
92 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
93 * to set in lock_flags.
96 xfs_lock_flags_assert(
99 ASSERT((lock_flags
& (XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
)) !=
100 (XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
));
101 ASSERT((lock_flags
& (XFS_MMAPLOCK_SHARED
| XFS_MMAPLOCK_EXCL
)) !=
102 (XFS_MMAPLOCK_SHARED
| XFS_MMAPLOCK_EXCL
));
103 ASSERT((lock_flags
& (XFS_ILOCK_SHARED
| XFS_ILOCK_EXCL
)) !=
104 (XFS_ILOCK_SHARED
| XFS_ILOCK_EXCL
));
105 ASSERT((lock_flags
& ~(XFS_LOCK_MASK
| XFS_LOCK_SUBCLASS_MASK
)) == 0);
106 ASSERT(lock_flags
!= 0);
110 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
111 * multi-reader locks: invalidate_lock and the i_lock. This routine allows
112 * various combinations of the locks to be obtained.
114 * The 3 locks should always be ordered so that the IO lock is obtained first,
115 * the mmap lock second and the ilock last in order to prevent deadlock.
117 * Basic locking order:
119 * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
121 * mmap_lock locking order:
123 * i_rwsem -> page lock -> mmap_lock
124 * mmap_lock -> invalidate_lock -> page_lock
126 * The difference in mmap_lock locking order mean that we cannot hold the
127 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
128 * can fault in pages during copy in/out (for buffered IO) or require the
129 * mmap_lock in get_user_pages() to map the user pages into the kernel address
130 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
131 * fault because page faults already hold the mmap_lock.
133 * Hence to serialise fully against both syscall and mmap based IO, we need to
134 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
135 * both taken in places where we need to invalidate the page cache in a race
136 * free manner (e.g. truncate, hole punch and other extent manipulation
144 trace_xfs_ilock(ip
, lock_flags
, _RET_IP_
);
146 xfs_lock_flags_assert(lock_flags
);
148 if (lock_flags
& XFS_IOLOCK_EXCL
) {
149 down_write_nested(&VFS_I(ip
)->i_rwsem
,
150 XFS_IOLOCK_DEP(lock_flags
));
151 } else if (lock_flags
& XFS_IOLOCK_SHARED
) {
152 down_read_nested(&VFS_I(ip
)->i_rwsem
,
153 XFS_IOLOCK_DEP(lock_flags
));
156 if (lock_flags
& XFS_MMAPLOCK_EXCL
) {
157 down_write_nested(&VFS_I(ip
)->i_mapping
->invalidate_lock
,
158 XFS_MMAPLOCK_DEP(lock_flags
));
159 } else if (lock_flags
& XFS_MMAPLOCK_SHARED
) {
160 down_read_nested(&VFS_I(ip
)->i_mapping
->invalidate_lock
,
161 XFS_MMAPLOCK_DEP(lock_flags
));
164 if (lock_flags
& XFS_ILOCK_EXCL
)
165 down_write_nested(&ip
->i_lock
, XFS_ILOCK_DEP(lock_flags
));
166 else if (lock_flags
& XFS_ILOCK_SHARED
)
167 down_read_nested(&ip
->i_lock
, XFS_ILOCK_DEP(lock_flags
));
171 * This is just like xfs_ilock(), except that the caller
172 * is guaranteed not to sleep. It returns 1 if it gets
173 * the requested locks and 0 otherwise. If the IO lock is
174 * obtained but the inode lock cannot be, then the IO lock
175 * is dropped before returning.
177 * ip -- the inode being locked
178 * lock_flags -- this parameter indicates the inode's locks to be
179 * to be locked. See the comment for xfs_ilock() for a list
187 trace_xfs_ilock_nowait(ip
, lock_flags
, _RET_IP_
);
189 xfs_lock_flags_assert(lock_flags
);
191 if (lock_flags
& XFS_IOLOCK_EXCL
) {
192 if (!down_write_trylock(&VFS_I(ip
)->i_rwsem
))
194 } else if (lock_flags
& XFS_IOLOCK_SHARED
) {
195 if (!down_read_trylock(&VFS_I(ip
)->i_rwsem
))
199 if (lock_flags
& XFS_MMAPLOCK_EXCL
) {
200 if (!down_write_trylock(&VFS_I(ip
)->i_mapping
->invalidate_lock
))
201 goto out_undo_iolock
;
202 } else if (lock_flags
& XFS_MMAPLOCK_SHARED
) {
203 if (!down_read_trylock(&VFS_I(ip
)->i_mapping
->invalidate_lock
))
204 goto out_undo_iolock
;
207 if (lock_flags
& XFS_ILOCK_EXCL
) {
208 if (!down_write_trylock(&ip
->i_lock
))
209 goto out_undo_mmaplock
;
210 } else if (lock_flags
& XFS_ILOCK_SHARED
) {
211 if (!down_read_trylock(&ip
->i_lock
))
212 goto out_undo_mmaplock
;
217 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
218 up_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
219 else if (lock_flags
& XFS_MMAPLOCK_SHARED
)
220 up_read(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
222 if (lock_flags
& XFS_IOLOCK_EXCL
)
223 up_write(&VFS_I(ip
)->i_rwsem
);
224 else if (lock_flags
& XFS_IOLOCK_SHARED
)
225 up_read(&VFS_I(ip
)->i_rwsem
);
231 * xfs_iunlock() is used to drop the inode locks acquired with
232 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
233 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
234 * that we know which locks to drop.
236 * ip -- the inode being unlocked
237 * lock_flags -- this parameter indicates the inode's locks to be
238 * to be unlocked. See the comment for xfs_ilock() for a list
239 * of valid values for this parameter.
247 xfs_lock_flags_assert(lock_flags
);
249 if (lock_flags
& XFS_IOLOCK_EXCL
)
250 up_write(&VFS_I(ip
)->i_rwsem
);
251 else if (lock_flags
& XFS_IOLOCK_SHARED
)
252 up_read(&VFS_I(ip
)->i_rwsem
);
254 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
255 up_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
256 else if (lock_flags
& XFS_MMAPLOCK_SHARED
)
257 up_read(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
259 if (lock_flags
& XFS_ILOCK_EXCL
)
260 up_write(&ip
->i_lock
);
261 else if (lock_flags
& XFS_ILOCK_SHARED
)
262 up_read(&ip
->i_lock
);
264 trace_xfs_iunlock(ip
, lock_flags
, _RET_IP_
);
268 * give up write locks. the i/o lock cannot be held nested
269 * if it is being demoted.
276 ASSERT(lock_flags
& (XFS_IOLOCK_EXCL
|XFS_MMAPLOCK_EXCL
|XFS_ILOCK_EXCL
));
278 ~(XFS_IOLOCK_EXCL
|XFS_MMAPLOCK_EXCL
|XFS_ILOCK_EXCL
)) == 0);
280 if (lock_flags
& XFS_ILOCK_EXCL
)
281 downgrade_write(&ip
->i_lock
);
282 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
283 downgrade_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
284 if (lock_flags
& XFS_IOLOCK_EXCL
)
285 downgrade_write(&VFS_I(ip
)->i_rwsem
);
287 trace_xfs_ilock_demote(ip
, lock_flags
, _RET_IP_
);
292 struct xfs_inode
*ip
,
296 * Sometimes we assert the ILOCK is held exclusively, but we're in
297 * a workqueue, so lockdep doesn't know we're the owner.
299 if (lock_flags
& XFS_ILOCK_SHARED
)
300 rwsem_assert_held(&ip
->i_lock
);
301 else if (lock_flags
& XFS_ILOCK_EXCL
)
302 rwsem_assert_held_write_nolockdep(&ip
->i_lock
);
304 if (lock_flags
& XFS_MMAPLOCK_SHARED
)
305 rwsem_assert_held(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
306 else if (lock_flags
& XFS_MMAPLOCK_EXCL
)
307 rwsem_assert_held_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
309 if (lock_flags
& XFS_IOLOCK_SHARED
)
310 rwsem_assert_held(&VFS_I(ip
)->i_rwsem
);
311 else if (lock_flags
& XFS_IOLOCK_EXCL
)
312 rwsem_assert_held_write(&VFS_I(ip
)->i_rwsem
);
/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
/* Without the bound available, every subclass is accepted. */
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif
333 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
334 * value. This can be called for any type of inode lock combination, including
335 * parent locking. Care must be taken to ensure we don't overrun the subclass
336 * storage fields in the class mask we build.
345 ASSERT(!(lock_mode
& XFS_ILOCK_PARENT
));
346 ASSERT(xfs_lockdep_subclass_ok(subclass
));
348 if (lock_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)) {
349 ASSERT(subclass
<= XFS_IOLOCK_MAX_SUBCLASS
);
350 class += subclass
<< XFS_IOLOCK_SHIFT
;
353 if (lock_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)) {
354 ASSERT(subclass
<= XFS_MMAPLOCK_MAX_SUBCLASS
);
355 class += subclass
<< XFS_MMAPLOCK_SHIFT
;
358 if (lock_mode
& (XFS_ILOCK_SHARED
|XFS_ILOCK_EXCL
)) {
359 ASSERT(subclass
<= XFS_ILOCK_MAX_SUBCLASS
);
360 class += subclass
<< XFS_ILOCK_SHIFT
;
363 return (lock_mode
& ~XFS_LOCK_SUBCLASS_MASK
) | class;
367 * The following routine will lock n inodes in exclusive mode. We assume the
368 * caller calls us with the inodes in i_ino order.
370 * We need to detect deadlock where an inode that we lock is in the AIL and we
371 * start waiting for another inode that is locked by a thread in a long running
372 * transaction (such as truncate). This can result in deadlock since the long
373 * running trans might need to wait for the inode we just locked in order to
374 * push the tail and free space in the log.
376 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
377 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
378 * lock more than one at a time, lockdep will report false positives saying we
379 * have violated locking orders.
383 struct xfs_inode
**ips
,
391 struct xfs_log_item
*lp
;
394 * Currently supports between 2 and 5 inodes with exclusive locking. We
395 * support an arbitrary depth of locking here, but absolute limits on
396 * inodes depend on the type of locking and the limits placed by
397 * lockdep annotations in xfs_lock_inumorder. These are all checked by
400 ASSERT(ips
&& inodes
>= 2 && inodes
<= 5);
401 ASSERT(lock_mode
& (XFS_IOLOCK_EXCL
| XFS_MMAPLOCK_EXCL
|
403 ASSERT(!(lock_mode
& (XFS_IOLOCK_SHARED
| XFS_MMAPLOCK_SHARED
|
405 ASSERT(!(lock_mode
& XFS_MMAPLOCK_EXCL
) ||
406 inodes
<= XFS_MMAPLOCK_MAX_SUBCLASS
+ 1);
407 ASSERT(!(lock_mode
& XFS_ILOCK_EXCL
) ||
408 inodes
<= XFS_ILOCK_MAX_SUBCLASS
+ 1);
410 if (lock_mode
& XFS_IOLOCK_EXCL
) {
411 ASSERT(!(lock_mode
& (XFS_MMAPLOCK_EXCL
| XFS_ILOCK_EXCL
)));
412 } else if (lock_mode
& XFS_MMAPLOCK_EXCL
)
413 ASSERT(!(lock_mode
& XFS_ILOCK_EXCL
));
418 for (; i
< inodes
; i
++) {
421 if (i
&& (ips
[i
] == ips
[i
- 1])) /* Already locked */
425 * If try_lock is not set yet, make sure all locked inodes are
426 * not in the AIL. If any are, set try_lock to be used later.
429 for (j
= (i
- 1); j
>= 0 && !try_lock
; j
--) {
430 lp
= &ips
[j
]->i_itemp
->ili_item
;
431 if (lp
&& test_bit(XFS_LI_IN_AIL
, &lp
->li_flags
))
437 * If any of the previous locks we have locked is in the AIL,
438 * we must TRY to get the second and subsequent locks. If
439 * we can't get any, we must release all we have
443 xfs_ilock(ips
[i
], xfs_lock_inumorder(lock_mode
, i
));
447 /* try_lock means we have an inode locked that is in the AIL. */
449 if (xfs_ilock_nowait(ips
[i
], xfs_lock_inumorder(lock_mode
, i
)))
453 * Unlock all previous guys and try again. xfs_iunlock will try
454 * to push the tail if the inode is in the AIL.
457 for (j
= i
- 1; j
>= 0; j
--) {
459 * Check to see if we've already unlocked this one. Not
460 * the first one going back, and the inode ptr is the
463 if (j
!= (i
- 1) && ips
[j
] == ips
[j
+ 1])
466 xfs_iunlock(ips
[j
], lock_mode
);
469 if ((attempts
% 5) == 0) {
470 delay(1); /* Don't just spin the CPU */
477 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
478 * mmaplock must be double-locked separately since we use i_rwsem and
479 * invalidate_lock for that. We now support taking one lock EXCL and the
484 struct xfs_inode
*ip0
,
486 struct xfs_inode
*ip1
,
490 struct xfs_log_item
*lp
;
492 ASSERT(hweight32(ip0_mode
) == 1);
493 ASSERT(hweight32(ip1_mode
) == 1);
494 ASSERT(!(ip0_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)));
495 ASSERT(!(ip1_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)));
496 ASSERT(!(ip0_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)));
497 ASSERT(!(ip1_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)));
498 ASSERT(ip0
->i_ino
!= ip1
->i_ino
);
500 if (ip0
->i_ino
> ip1
->i_ino
) {
502 swap(ip0_mode
, ip1_mode
);
506 xfs_ilock(ip0
, xfs_lock_inumorder(ip0_mode
, 0));
509 * If the first lock we have locked is in the AIL, we must TRY to get
510 * the second lock. If we can't get it, we must release the first one
513 lp
= &ip0
->i_itemp
->ili_item
;
514 if (lp
&& test_bit(XFS_LI_IN_AIL
, &lp
->li_flags
)) {
515 if (!xfs_ilock_nowait(ip1
, xfs_lock_inumorder(ip1_mode
, 1))) {
516 xfs_iunlock(ip0
, ip0_mode
);
517 if ((++attempts
% 5) == 0)
518 delay(1); /* Don't just spin the CPU */
522 xfs_ilock(ip1
, xfs_lock_inumorder(ip1_mode
, 1));
527 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
528 * is allowed, otherwise it has to be an exact match. If a CI match is found,
529 * ci_name->name will point to the actual name (caller must free) or
530 * will be set to NULL if an exact match is found.
534 struct xfs_inode
*dp
,
535 const struct xfs_name
*name
,
536 struct xfs_inode
**ipp
,
537 struct xfs_name
*ci_name
)
542 trace_xfs_lookup(dp
, name
);
544 if (xfs_is_shutdown(dp
->i_mount
))
546 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
549 error
= xfs_dir_lookup(NULL
, dp
, name
, &inum
, ci_name
);
553 error
= xfs_iget(dp
->i_mount
, NULL
, inum
, 0, 0, ipp
);
558 * Fail if a directory entry in the regular directory tree points to
561 if (XFS_IS_CORRUPT(dp
->i_mount
, xfs_is_metadir_inode(*ipp
))) {
562 xfs_fs_mark_sick(dp
->i_mount
, XFS_SICK_FS_METADIR
);
563 error
= -EFSCORRUPTED
;
573 kfree(ci_name
->name
);
580 * Initialise a newly allocated inode and return the in-core inode to the
581 * caller locked exclusively.
583 * Caller is responsible for unlocking the inode manually upon return
587 struct xfs_trans
*tp
,
589 const struct xfs_icreate_args
*args
,
590 struct xfs_inode
**ipp
)
592 struct xfs_mount
*mp
= tp
->t_mountp
;
593 struct xfs_inode
*ip
= NULL
;
597 * Get the in-core inode with the lock held exclusively to prevent
598 * others from looking at it until we're done.
600 error
= xfs_iget(mp
, tp
, ino
, XFS_IGET_CREATE
, XFS_ILOCK_EXCL
, &ip
);
605 xfs_trans_ijoin(tp
, ip
, 0);
606 xfs_inode_init(tp
, args
, ip
);
608 /* now that we have an i_mode we can setup the inode structure */
615 /* Return dquots for the ids that will be assigned to a new file. */
618 const struct xfs_icreate_args
*args
,
619 struct xfs_dquot
**udqpp
,
620 struct xfs_dquot
**gdqpp
,
621 struct xfs_dquot
**pdqpp
)
623 struct inode
*dir
= VFS_I(args
->pip
);
624 kuid_t uid
= GLOBAL_ROOT_UID
;
625 kgid_t gid
= GLOBAL_ROOT_GID
;
627 unsigned int flags
= XFS_QMOPT_QUOTALL
;
631 * The uid/gid computation code must match what the VFS uses to
632 * assign i_[ug]id. INHERIT adjusts the gid computation for
633 * setgid/grpid systems.
635 uid
= mapped_fsuid(args
->idmap
, i_user_ns(dir
));
636 gid
= mapped_fsgid(args
->idmap
, i_user_ns(dir
));
637 prid
= xfs_get_initial_prid(args
->pip
);
638 flags
|= XFS_QMOPT_INHERIT
;
641 *udqpp
= *gdqpp
= *pdqpp
= NULL
;
643 return xfs_qm_vop_dqalloc(args
->pip
, uid
, gid
, prid
, flags
, udqpp
,
649 const struct xfs_icreate_args
*args
,
650 struct xfs_name
*name
,
651 struct xfs_inode
**ipp
)
653 struct xfs_inode
*dp
= args
->pip
;
654 struct xfs_dir_update du
= {
658 struct xfs_mount
*mp
= dp
->i_mount
;
659 struct xfs_trans
*tp
= NULL
;
660 struct xfs_dquot
*udqp
;
661 struct xfs_dquot
*gdqp
;
662 struct xfs_dquot
*pdqp
;
663 struct xfs_trans_res
*tres
;
665 bool unlock_dp_on_error
= false;
666 bool is_dir
= S_ISDIR(args
->mode
);
670 trace_xfs_create(dp
, name
);
672 if (xfs_is_shutdown(mp
))
674 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
677 /* Make sure that we have allocated dquot(s) on disk. */
678 error
= xfs_icreate_dqalloc(args
, &udqp
, &gdqp
, &pdqp
);
683 resblks
= xfs_mkdir_space_res(mp
, name
->len
);
684 tres
= &M_RES(mp
)->tr_mkdir
;
686 resblks
= xfs_create_space_res(mp
, name
->len
);
687 tres
= &M_RES(mp
)->tr_create
;
690 error
= xfs_parent_start(mp
, &du
.ppargs
);
692 goto out_release_dquots
;
695 * Initially assume that the file does not exist and
696 * reserve the resources for that case. If that is not
697 * the case we'll drop the one we have and get a more
698 * appropriate transaction later.
700 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
, resblks
,
702 if (error
== -ENOSPC
) {
703 /* flush outstanding delalloc blocks and retry */
704 xfs_flush_inodes(mp
);
705 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
,
711 xfs_ilock(dp
, XFS_ILOCK_EXCL
| XFS_ILOCK_PARENT
);
712 unlock_dp_on_error
= true;
715 * A newly created regular or special file just has one directory
716 * entry pointing to it, but a directory also has the "." entry
717 * pointing to itself.
719 error
= xfs_dialloc(&tp
, args
, &ino
);
721 error
= xfs_icreate(tp
, ino
, args
, &du
.ip
);
723 goto out_trans_cancel
;
726 * Now we join the directory inode to the transaction. We do not do it
727 * earlier because xfs_dialloc might commit the previous transaction
728 * (and release all the locks). An error from here on will result in
729 * the transaction cancel unlocking dp so don't do it explicitly in the
732 xfs_trans_ijoin(tp
, dp
, 0);
734 error
= xfs_dir_create_child(tp
, resblks
, &du
);
736 goto out_trans_cancel
;
739 * If this is a synchronous mount, make sure that the
740 * create transaction goes to disk before returning to
743 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
744 xfs_trans_set_sync(tp
);
747 * Attach the dquot(s) to the inodes and modify them incore.
748 * These ids of the inode couldn't have changed since the new
749 * inode has been locked ever since it was created.
751 xfs_qm_vop_create_dqattach(tp
, du
.ip
, udqp
, gdqp
, pdqp
);
753 error
= xfs_trans_commit(tp
);
755 goto out_release_inode
;
762 xfs_iunlock(du
.ip
, XFS_ILOCK_EXCL
);
763 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
764 xfs_parent_finish(mp
, du
.ppargs
);
768 xfs_trans_cancel(tp
);
771 * Wait until after the current transaction is aborted to finish the
772 * setup of the inode and release the inode. This prevents recursive
773 * transactions and deadlocks from xfs_inactive.
776 xfs_iunlock(du
.ip
, XFS_ILOCK_EXCL
);
777 xfs_finish_inode_setup(du
.ip
);
781 xfs_parent_finish(mp
, du
.ppargs
);
787 if (unlock_dp_on_error
)
788 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
794 const struct xfs_icreate_args
*args
,
795 struct xfs_inode
**ipp
)
797 struct xfs_inode
*dp
= args
->pip
;
798 struct xfs_mount
*mp
= dp
->i_mount
;
799 struct xfs_inode
*ip
= NULL
;
800 struct xfs_trans
*tp
= NULL
;
801 struct xfs_dquot
*udqp
;
802 struct xfs_dquot
*gdqp
;
803 struct xfs_dquot
*pdqp
;
804 struct xfs_trans_res
*tres
;
809 ASSERT(args
->flags
& XFS_ICREATE_TMPFILE
);
811 if (xfs_is_shutdown(mp
))
814 /* Make sure that we have allocated dquot(s) on disk. */
815 error
= xfs_icreate_dqalloc(args
, &udqp
, &gdqp
, &pdqp
);
819 resblks
= XFS_IALLOC_SPACE_RES(mp
);
820 tres
= &M_RES(mp
)->tr_create_tmpfile
;
822 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
, resblks
,
825 goto out_release_dquots
;
827 error
= xfs_dialloc(&tp
, args
, &ino
);
829 error
= xfs_icreate(tp
, ino
, args
, &ip
);
831 goto out_trans_cancel
;
833 if (xfs_has_wsync(mp
))
834 xfs_trans_set_sync(tp
);
837 * Attach the dquot(s) to the inodes and modify them incore.
838 * These ids of the inode couldn't have changed since the new
839 * inode has been locked ever since it was created.
841 xfs_qm_vop_create_dqattach(tp
, ip
, udqp
, gdqp
, pdqp
);
843 error
= xfs_iunlink(tp
, ip
);
845 goto out_trans_cancel
;
847 error
= xfs_trans_commit(tp
);
849 goto out_release_inode
;
856 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
860 xfs_trans_cancel(tp
);
863 * Wait until after the current transaction is aborted to finish the
864 * setup of the inode and release the inode. This prevents recursive
865 * transactions and deadlocks from xfs_inactive.
868 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
869 xfs_finish_inode_setup(ip
);
882 struct xfs_inode
*tdp
,
883 struct xfs_inode
*sip
,
884 struct xfs_name
*target_name
)
886 struct xfs_dir_update du
= {
891 struct xfs_mount
*mp
= tdp
->i_mount
;
892 struct xfs_trans
*tp
;
893 int error
, nospace_error
= 0;
896 trace_xfs_link(tdp
, target_name
);
898 ASSERT(!S_ISDIR(VFS_I(sip
)->i_mode
));
900 if (xfs_is_shutdown(mp
))
902 if (xfs_ifork_zapped(tdp
, XFS_DATA_FORK
))
905 error
= xfs_qm_dqattach(sip
);
909 error
= xfs_qm_dqattach(tdp
);
913 error
= xfs_parent_start(mp
, &du
.ppargs
);
917 resblks
= xfs_link_space_res(mp
, target_name
->len
);
918 error
= xfs_trans_alloc_dir(tdp
, &M_RES(mp
)->tr_link
, sip
, &resblks
,
919 &tp
, &nospace_error
);
924 * We don't allow reservationless or quotaless hardlinking when parent
925 * pointers are enabled because we can't back out if the xattrs must
928 if (du
.ppargs
&& nospace_error
) {
929 error
= nospace_error
;
934 * If we are using project inheritance, we only allow hard link
935 * creation in our tree when the project IDs are the same; else
936 * the tree quota mechanism could be circumvented.
938 if (unlikely((tdp
->i_diflags
& XFS_DIFLAG_PROJINHERIT
) &&
939 tdp
->i_projid
!= sip
->i_projid
)) {
941 * Project quota setup skips special files which can
942 * leave inodes in a PROJINHERIT directory without a
943 * project ID set. We need to allow links to be made
944 * to these "project-less" inodes because userspace
945 * expects them to succeed after project ID setup,
946 * but everything else should be rejected.
948 if (!special_file(VFS_I(sip
)->i_mode
) ||
949 sip
->i_projid
!= 0) {
955 error
= xfs_dir_add_child(tp
, resblks
, &du
);
960 * If this is a synchronous mount, make sure that the
961 * link transaction goes to disk before returning to
964 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
965 xfs_trans_set_sync(tp
);
967 error
= xfs_trans_commit(tp
);
968 xfs_iunlock(tdp
, XFS_ILOCK_EXCL
);
969 xfs_iunlock(sip
, XFS_ILOCK_EXCL
);
970 xfs_parent_finish(mp
, du
.ppargs
);
974 xfs_trans_cancel(tp
);
975 xfs_iunlock(tdp
, XFS_ILOCK_EXCL
);
976 xfs_iunlock(sip
, XFS_ILOCK_EXCL
);
978 xfs_parent_finish(mp
, du
.ppargs
);
980 if (error
== -ENOSPC
&& nospace_error
)
981 error
= nospace_error
;
985 /* Clear the reflink flag and the cowblocks tag if possible. */
987 xfs_itruncate_clear_reflink_flags(
988 struct xfs_inode
*ip
)
990 struct xfs_ifork
*dfork
;
991 struct xfs_ifork
*cfork
;
993 if (!xfs_is_reflink_inode(ip
))
995 dfork
= xfs_ifork_ptr(ip
, XFS_DATA_FORK
);
996 cfork
= xfs_ifork_ptr(ip
, XFS_COW_FORK
);
997 if (dfork
->if_bytes
== 0 && cfork
->if_bytes
== 0)
998 ip
->i_diflags2
&= ~XFS_DIFLAG2_REFLINK
;
999 if (cfork
->if_bytes
== 0)
1000 xfs_inode_clear_cowblocks_tag(ip
);
1004 * Free up the underlying blocks past new_size. The new size must be smaller
1005 * than the current size. This routine can be used both for the attribute and
1006 * data fork, and does not modify the inode size, which is left to the caller.
1008 * The transaction passed to this routine must have made a permanent log
1009 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1010 * given transaction and start new ones, so make sure everything involved in
1011 * the transaction is tidy before calling here. Some transaction will be
1012 * returned to the caller to be committed. The incoming transaction must
1013 * already include the inode, and both inode locks must be held exclusively.
1014 * The inode must also be "held" within the transaction. On return the inode
1015 * will be "held" within the returned transaction. This routine does NOT
1016 * require any disk space to be reserved for it within the transaction.
1018 * If we get an error, we must return with the inode locked and linked into the
1019 * current transaction. This keeps things simple for the higher level code,
1020 * because it always knows that the inode is locked and held in the transaction
1021 * that returns to it whether errors occur or not. We don't mark the inode
1022 * dirty on error so that transactions can be easily aborted if possible.
1025 xfs_itruncate_extents_flags(
1026 struct xfs_trans
**tpp
,
1027 struct xfs_inode
*ip
,
1029 xfs_fsize_t new_size
,
1032 struct xfs_mount
*mp
= ip
->i_mount
;
1033 struct xfs_trans
*tp
= *tpp
;
1034 xfs_fileoff_t first_unmap_block
;
1037 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1038 if (atomic_read(&VFS_I(ip
)->i_count
))
1039 xfs_assert_ilocked(ip
, XFS_IOLOCK_EXCL
);
1040 ASSERT(new_size
<= XFS_ISIZE(ip
));
1041 ASSERT(tp
->t_flags
& XFS_TRANS_PERM_LOG_RES
);
1042 ASSERT(ip
->i_itemp
!= NULL
);
1043 ASSERT(ip
->i_itemp
->ili_lock_flags
== 0);
1044 ASSERT(!XFS_NOT_DQATTACHED(mp
, ip
));
1046 trace_xfs_itruncate_extents_start(ip
, new_size
);
1048 flags
|= xfs_bmapi_aflag(whichfork
);
1051 * Since it is possible for space to become allocated beyond
1052 * the end of the file (in a crash where the space is allocated
1053 * but the inode size is not yet updated), simply remove any
1054 * blocks which show up between the new EOF and the maximum
1055 * possible file size.
1057 * We have to free all the blocks to the bmbt maximum offset, even if
1058 * the page cache can't scale that far.
1060 first_unmap_block
= XFS_B_TO_FSB(mp
, (xfs_ufsize_t
)new_size
);
1061 if (!xfs_verify_fileoff(mp
, first_unmap_block
)) {
1062 WARN_ON_ONCE(first_unmap_block
> XFS_MAX_FILEOFF
);
1066 error
= xfs_bunmapi_range(&tp
, ip
, flags
, first_unmap_block
,
1071 if (whichfork
== XFS_DATA_FORK
) {
1072 /* Remove all pending CoW reservations. */
1073 error
= xfs_reflink_cancel_cow_blocks(ip
, &tp
,
1074 first_unmap_block
, XFS_MAX_FILEOFF
, true);
1078 xfs_itruncate_clear_reflink_flags(ip
);
1082 * Always re-log the inode so that our permanent transaction can keep
1083 * on rolling it forward in the log.
1085 xfs_trans_log_inode(tp
, ip
, XFS_ILOG_CORE
);
1087 trace_xfs_itruncate_extents_end(ip
, new_size
);
1095 * Mark all the buffers attached to this directory stale. In theory we should
1096 * never be freeing a directory with any blocks at all, but this covers the
1097 * case where we've recovered a directory swap with a "temporary" directory
1098 * created by online repair and now need to dump it.
1102 struct xfs_inode
*dp
)
1104 struct xfs_iext_cursor icur
;
1105 struct xfs_bmbt_irec got
;
1106 struct xfs_mount
*mp
= dp
->i_mount
;
1107 struct xfs_da_geometry
*geo
= mp
->m_dir_geo
;
1108 struct xfs_ifork
*ifp
= xfs_ifork_ptr(dp
, XFS_DATA_FORK
);
1112 * Invalidate each directory block. All directory blocks are of
1113 * fsbcount length and alignment, so we only need to walk those same
1114 * offsets. We hold the only reference to this inode, so we must wait
1115 * for the buffer locks.
1117 for_each_xfs_iext(ifp
, &icur
, &got
) {
1118 for (off
= round_up(got
.br_startoff
, geo
->fsbcount
);
1119 off
< got
.br_startoff
+ got
.br_blockcount
;
1120 off
+= geo
->fsbcount
) {
1121 struct xfs_buf
*bp
= NULL
;
1122 xfs_fsblock_t fsbno
;
1125 fsbno
= (off
- got
.br_startoff
) + got
.br_startblock
;
1126 error
= xfs_buf_incore(mp
->m_ddev_targp
,
1127 XFS_FSB_TO_DADDR(mp
, fsbno
),
1128 XFS_FSB_TO_BB(mp
, geo
->fsbcount
),
1140 * xfs_inactive_truncate
1142 * Called to perform a truncate when an inode becomes unlinked.
1145 xfs_inactive_truncate(
1146 struct xfs_inode
*ip
)
1148 struct xfs_mount
*mp
= ip
->i_mount
;
1149 struct xfs_trans
*tp
;
1152 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_itruncate
, 0, 0, 0, &tp
);
1154 ASSERT(xfs_is_shutdown(mp
));
1157 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1158 xfs_trans_ijoin(tp
, ip
, 0);
1161 * Log the inode size first to prevent stale data exposure in the event
1162 * of a system crash before the truncate completes. See the related
1163 * comment in xfs_vn_setattr_size() for details.
1165 ip
->i_disk_size
= 0;
1166 xfs_trans_log_inode(tp
, ip
, XFS_ILOG_CORE
);
1168 error
= xfs_itruncate_extents(&tp
, ip
, XFS_DATA_FORK
, 0);
1170 goto error_trans_cancel
;
1172 ASSERT(ip
->i_df
.if_nextents
== 0);
1174 error
= xfs_trans_commit(tp
);
1178 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1182 xfs_trans_cancel(tp
);
1184 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1189 * xfs_inactive_ifree()
1191 * Perform the inode free when an inode is unlinked.
1195 struct xfs_inode
*ip
)
1197 struct xfs_mount
*mp
= ip
->i_mount
;
1198 struct xfs_trans
*tp
;
1202 * We try to use a per-AG reservation for any block needed by the finobt
1203 * tree, but as the finobt feature predates the per-AG reservation
1204 * support a degraded file system might not have enough space for the
1205 * reservation at mount time. In that case try to dip into the reserved
1208 * Send a warning if the reservation does happen to fail, as the inode
1209 * now remains allocated and sits on the unlinked list until the fs is
1212 if (unlikely(mp
->m_finobt_nores
)) {
1213 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ifree
,
1214 XFS_IFREE_SPACE_RES(mp
), 0, XFS_TRANS_RESERVE
,
1217 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ifree
, 0, 0, 0, &tp
);
1220 if (error
== -ENOSPC
) {
1221 xfs_warn_ratelimited(mp
,
1222 "Failed to remove inode(s) from unlinked list. "
1223 "Please free space, unmount and run xfs_repair.");
1225 ASSERT(xfs_is_shutdown(mp
));
1231 * We do not hold the inode locked across the entire rolling transaction
1232 * here. We only need to hold it for the first transaction that
1233 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1234 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1235 * here breaks the relationship between cluster buffer invalidation and
1236 * stale inode invalidation on cluster buffer item journal commit
1237 * completion, and can result in leaving dirty stale inodes hanging
1240 * We have no need for serialising this inode operation against other
1241 * operations - we freed the inode and hence reallocation is required
1242 * and that will serialise on reallocating the space the deferops need
1243 * to free. Hence we can unlock the inode on the first commit of
1244 * the transaction rather than roll it right through the deferops. This
1245 * avoids relogging the XFS_ISTALE inode.
1247 * We check that xfs_ifree() hasn't grown an internal transaction roll
1248 * by asserting that the inode is still locked when it returns.
1250 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1251 xfs_trans_ijoin(tp
, ip
, XFS_ILOCK_EXCL
);
1253 error
= xfs_ifree(tp
, ip
);
1254 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1257 * If we fail to free the inode, shut down. The cancel
1258 * might do that, we need to make sure. Otherwise the
1259 * inode might be lost for a long time or forever.
1261 if (!xfs_is_shutdown(mp
)) {
1262 xfs_notice(mp
, "%s: xfs_ifree returned error %d",
1264 xfs_force_shutdown(mp
, SHUTDOWN_META_IO_ERROR
);
1266 xfs_trans_cancel(tp
);
1271 * Credit the quota account(s). The inode is gone.
1273 xfs_trans_mod_dquot_byino(tp
, ip
, XFS_TRANS_DQ_ICOUNT
, -1);
1275 return xfs_trans_commit(tp
);
1279 * Returns true if we need to update the on-disk metadata before we can free
1280 * the memory used by this inode. Updates include freeing post-eof
1281 * preallocations; freeing COW staging extents; and marking the inode free in
1282 * the inobt if it is on the unlinked list.
1285 xfs_inode_needs_inactive(
1286 struct xfs_inode
*ip
)
1288 struct xfs_mount
*mp
= ip
->i_mount
;
1289 struct xfs_ifork
*cow_ifp
= xfs_ifork_ptr(ip
, XFS_COW_FORK
);
1292 * If the inode is already free, then there can be nothing
1295 if (VFS_I(ip
)->i_mode
== 0)
1299 * If this is a read-only mount, don't do this (would generate I/O)
1300 * unless we're in log recovery and cleaning the iunlinked list.
1302 if (xfs_is_readonly(mp
) && !xlog_recovery_needed(mp
->m_log
))
1305 /* If the log isn't running, push inodes straight to reclaim. */
1306 if (xfs_is_shutdown(mp
) || xfs_has_norecovery(mp
))
1309 /* Metadata inodes require explicit resource cleanup. */
1310 if (xfs_is_internal_inode(ip
))
1313 /* Want to clean out the cow blocks if there are any. */
1314 if (cow_ifp
&& cow_ifp
->if_bytes
> 0)
1317 /* Unlinked files must be freed. */
1318 if (VFS_I(ip
)->i_nlink
== 0)
1322 * This file isn't being freed, so check if there are post-eof blocks
1325 * Note: don't bother with iolock here since lockdep complains about
1326 * acquiring it in reclaim context. We have the only reference to the
1327 * inode at this point anyways.
1329 return xfs_can_free_eofblocks(ip
);
1333 * Save health status somewhere, if we're dumping an inode with uncorrected
1334 * errors and online repair isn't running.
1337 xfs_inactive_health(
1338 struct xfs_inode
*ip
)
1340 struct xfs_mount
*mp
= ip
->i_mount
;
1341 struct xfs_perag
*pag
;
1343 unsigned int checked
;
1345 xfs_inode_measure_sickness(ip
, &sick
, &checked
);
1349 trace_xfs_inode_unfixed_corruption(ip
, sick
);
1351 if (sick
& XFS_SICK_INO_FORGET
)
1354 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1356 /* There had better still be a perag structure! */
1361 xfs_ag_mark_sick(pag
, XFS_SICK_AG_INODES
);
1368 * This is called when the vnode reference count for the vnode
1369 * goes to zero. If the file has been unlinked, then it must
1370 * now be truncated. Also, we clear all of the read-ahead state
1371 * kept for the inode here since the file is now closed.
1377 struct xfs_mount
*mp
;
1382 * If the inode is already free, then there can be nothing
1385 if (VFS_I(ip
)->i_mode
== 0) {
1386 ASSERT(ip
->i_df
.if_broot_bytes
== 0);
1391 ASSERT(!xfs_iflags_test(ip
, XFS_IRECOVERY
));
1393 xfs_inactive_health(ip
);
1396 * If this is a read-only mount, don't do this (would generate I/O)
1397 * unless we're in log recovery and cleaning the iunlinked list.
1399 if (xfs_is_readonly(mp
) && !xlog_recovery_needed(mp
->m_log
))
1402 /* Metadata inodes require explicit resource cleanup. */
1403 if (xfs_is_internal_inode(ip
))
1406 /* Try to clean out the cow blocks if there are any. */
1407 if (xfs_inode_has_cow_data(ip
))
1408 xfs_reflink_cancel_cow_range(ip
, 0, NULLFILEOFF
, true);
1410 if (VFS_I(ip
)->i_nlink
!= 0) {
1412 * Note: don't bother with iolock here since lockdep complains
1413 * about acquiring it in reclaim context. We have the only
1414 * reference to the inode at this point anyways.
1416 if (xfs_can_free_eofblocks(ip
))
1417 error
= xfs_free_eofblocks(ip
);
1422 if (S_ISREG(VFS_I(ip
)->i_mode
) &&
1423 (ip
->i_disk_size
!= 0 || XFS_ISIZE(ip
) != 0 ||
1424 xfs_inode_has_filedata(ip
)))
1427 if (xfs_iflags_test(ip
, XFS_IQUOTAUNCHECKED
)) {
1429 * If this inode is being inactivated during a quotacheck and
1430 * has not yet been scanned by quotacheck, we /must/ remove
1431 * the dquots from the inode before inactivation changes the
1432 * block and inode counts. Most probably this is a result of
1433 * reloading the incore iunlinked list to purge unrecovered
1436 xfs_qm_dqdetach(ip
);
1438 error
= xfs_qm_dqattach(ip
);
1443 if (S_ISDIR(VFS_I(ip
)->i_mode
) && ip
->i_df
.if_nextents
> 0) {
1444 xfs_inactive_dir(ip
);
1448 if (S_ISLNK(VFS_I(ip
)->i_mode
))
1449 error
= xfs_inactive_symlink(ip
);
1451 error
= xfs_inactive_truncate(ip
);
1456 * If there are attributes associated with the file then blow them away
1457 * now. The code calls a routine that recursively deconstructs the
1458 * attribute fork. It also blows away the in-core attribute fork.
1460 if (xfs_inode_has_attr_fork(ip
)) {
1461 error
= xfs_attr_inactive(ip
);
1466 ASSERT(ip
->i_forkoff
== 0);
1471 error
= xfs_inactive_ifree(ip
);
1475 * We're done making metadata updates for this inode, so we can release
1476 * the attached dquots.
1478 xfs_qm_dqdetach(ip
);
1483 * Find an inode on the unlinked list. This does not take references to the
1484 * inode as we have existence guarantees by holding the AGI buffer lock and that
1485 * only unlinked, referenced inodes can be on the unlinked inode list. If we
1486 * don't find the inode in cache, then let the caller handle the situation.
1490 struct xfs_perag
*pag
,
1493 struct xfs_inode
*ip
;
1496 ip
= radix_tree_lookup(&pag
->pag_ici_root
, agino
);
1498 /* Caller can handle inode not being in memory. */
1504 * Inode in RCU freeing limbo should not happen. Warn about this and
1505 * let the caller handle the failure.
1507 if (WARN_ON_ONCE(!ip
->i_ino
)) {
1511 ASSERT(!xfs_iflags_test(ip
, XFS_IRECLAIMABLE
| XFS_IRECLAIM
));
1517 * Load the inode @next_agino into the cache and set its prev_unlinked pointer
1518 * to @prev_agino. Caller must hold the AGI to synchronize with other changes
1519 * to the unlinked list.
1522 xfs_iunlink_reload_next(
1523 struct xfs_trans
*tp
,
1524 struct xfs_buf
*agibp
,
1525 xfs_agino_t prev_agino
,
1526 xfs_agino_t next_agino
)
1528 struct xfs_perag
*pag
= agibp
->b_pag
;
1529 struct xfs_mount
*mp
= pag_mount(pag
);
1530 struct xfs_inode
*next_ip
= NULL
;
1533 ASSERT(next_agino
!= NULLAGINO
);
1537 next_ip
= radix_tree_lookup(&pag
->pag_ici_root
, next_agino
);
1538 ASSERT(next_ip
== NULL
);
1542 xfs_info_ratelimited(mp
,
1543 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
1544 next_agino
, pag_agno(pag
));
1547 * Use an untrusted lookup just to be cautious in case the AGI has been
1548 * corrupted and now points at a free inode. That shouldn't happen,
1549 * but we'd rather shut down now since we're already running in a weird
1552 error
= xfs_iget(mp
, tp
, xfs_agino_to_ino(pag
, next_agino
),
1553 XFS_IGET_UNTRUSTED
, 0, &next_ip
);
1555 xfs_ag_mark_sick(pag
, XFS_SICK_AG_AGI
);
1559 /* If this is not an unlinked inode, something is very wrong. */
1560 if (VFS_I(next_ip
)->i_nlink
!= 0) {
1561 xfs_ag_mark_sick(pag
, XFS_SICK_AG_AGI
);
1562 error
= -EFSCORRUPTED
;
1566 next_ip
->i_prev_unlinked
= prev_agino
;
1567 trace_xfs_iunlink_reload_next(next_ip
);
1569 ASSERT(!(VFS_I(next_ip
)->i_state
& I_DONTCACHE
));
1570 if (xfs_is_quotacheck_running(mp
) && next_ip
)
1571 xfs_iflags_set(next_ip
, XFS_IQUOTAUNCHECKED
);
1577 * Look up the inode number specified and if it is not already marked XFS_ISTALE
1578 * mark it stale. We should only find clean inodes in this lookup that aren't
1582 xfs_ifree_mark_inode_stale(
1583 struct xfs_perag
*pag
,
1584 struct xfs_inode
*free_ip
,
1587 struct xfs_mount
*mp
= pag_mount(pag
);
1588 struct xfs_inode_log_item
*iip
;
1589 struct xfs_inode
*ip
;
1593 ip
= radix_tree_lookup(&pag
->pag_ici_root
, XFS_INO_TO_AGINO(mp
, inum
));
1595 /* Inode not in memory, nothing to do */
1602 * Because this is an RCU protected lookup, we could find a recently
1603 * freed or even reallocated inode during the lookup. We need to check
1604 * under the i_flags_lock for a valid inode here. Skip it if it is not
1605 * valid, the wrong inode or stale.
1607 spin_lock(&ip
->i_flags_lock
);
1608 if (ip
->i_ino
!= inum
|| __xfs_iflags_test(ip
, XFS_ISTALE
))
1609 goto out_iflags_unlock
;
1612 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
1613 * other inodes that we did not find in the list attached to the buffer
1614 * and are not already marked stale. If we can't lock it, back off and
1617 if (ip
!= free_ip
) {
1618 if (!xfs_ilock_nowait(ip
, XFS_ILOCK_EXCL
)) {
1619 spin_unlock(&ip
->i_flags_lock
);
1625 ip
->i_flags
|= XFS_ISTALE
;
1628 * If the inode is flushing, it is already attached to the buffer. All
1629 * we needed to do here is mark the inode stale so buffer IO completion
1630 * will remove it from the AIL.
1633 if (__xfs_iflags_test(ip
, XFS_IFLUSHING
)) {
1634 ASSERT(!list_empty(&iip
->ili_item
.li_bio_list
));
1635 ASSERT(iip
->ili_last_fields
);
1640 * Inodes not attached to the buffer can be released immediately.
1641 * Everything else has to go through xfs_iflush_abort() on journal
1642 * commit as the flock synchronises removal of the inode from the
1643 * cluster buffer against inode reclaim.
1645 if (!iip
|| list_empty(&iip
->ili_item
.li_bio_list
))
1648 __xfs_iflags_set(ip
, XFS_IFLUSHING
);
1649 spin_unlock(&ip
->i_flags_lock
);
1652 /* we have a dirty inode in memory that has not yet been flushed. */
1653 spin_lock(&iip
->ili_lock
);
1654 iip
->ili_last_fields
= iip
->ili_fields
;
1655 iip
->ili_fields
= 0;
1656 iip
->ili_fsync_fields
= 0;
1657 spin_unlock(&iip
->ili_lock
);
1658 ASSERT(iip
->ili_last_fields
);
1661 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1666 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1668 spin_unlock(&ip
->i_flags_lock
);
1673 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1674 * inodes that are in memory - they all must be marked stale and attached to
1675 * the cluster buffer.
1679 struct xfs_trans
*tp
,
1680 struct xfs_perag
*pag
,
1681 struct xfs_inode
*free_ip
,
1682 struct xfs_icluster
*xic
)
1684 struct xfs_mount
*mp
= free_ip
->i_mount
;
1685 struct xfs_ino_geometry
*igeo
= M_IGEO(mp
);
1688 xfs_ino_t inum
= xic
->first_ino
;
1694 nbufs
= igeo
->ialloc_blks
/ igeo
->blocks_per_cluster
;
1696 for (j
= 0; j
< nbufs
; j
++, inum
+= igeo
->inodes_per_cluster
) {
1698 * The allocation bitmap tells us which inodes of the chunk were
1699 * physically allocated. Skip the cluster if an inode falls into
1702 ioffset
= inum
- xic
->first_ino
;
1703 if ((xic
->alloc
& XFS_INOBT_MASK(ioffset
)) == 0) {
1704 ASSERT(ioffset
% igeo
->inodes_per_cluster
== 0);
1708 blkno
= XFS_AGB_TO_DADDR(mp
, XFS_INO_TO_AGNO(mp
, inum
),
1709 XFS_INO_TO_AGBNO(mp
, inum
));
1712 * We obtain and lock the backing buffer first in the process
1713 * here to ensure dirty inodes attached to the buffer remain in
1714 * the flushing state while we mark them stale.
1716 * If we scan the in-memory inodes first, then buffer IO can
1717 * complete before we get a lock on it, and hence we may fail
1718 * to mark all the active inodes on the buffer stale.
1720 error
= xfs_trans_get_buf(tp
, mp
->m_ddev_targp
, blkno
,
1721 mp
->m_bsize
* igeo
->blocks_per_cluster
,
1727 * This buffer may not have been correctly initialised as we
1728 * didn't read it from disk. That's not important because we are
1729 * only using it to mark the buffer as stale in the log, and to
1730 * attach stale cached inodes on it.
1732 * For the inode that triggered the cluster freeing, this
1733 * attachment may occur in xfs_inode_item_precommit() after we
1734 * have marked this buffer stale. If this buffer was not in
1735 * memory before xfs_ifree_cluster() started, it will not be
1736 * marked XBF_DONE and this will cause problems later in
1737 * xfs_inode_item_precommit() when we trip over a (stale, !done)
1738 * buffer to be attached to the transaction.
1740 * Hence we have to mark the buffer as XBF_DONE here. This is
1741 * safe because we are also marking the buffer as XBF_STALE and
1742 * XFS_BLI_STALE. That means it will never be dispatched for
1743 * IO and it won't be unlocked until the cluster freeing has
1744 * been committed to the journal and the buffer unpinned. If it
1745 * is written, we want to know about it, and we want it to
1746 * fail. We can achieve this by adding a write verifier to the
1749 bp
->b_flags
|= XBF_DONE
;
1750 bp
->b_ops
= &xfs_inode_buf_ops
;
1753 * Now we need to set all the cached clean inodes as XFS_ISTALE,
1754 * too. This requires lookups, and will skip inodes that we've
1755 * already marked XFS_ISTALE.
1757 for (i
= 0; i
< igeo
->inodes_per_cluster
; i
++)
1758 xfs_ifree_mark_inode_stale(pag
, free_ip
, inum
+ i
);
1760 xfs_trans_stale_inode_buf(tp
, bp
);
1761 xfs_trans_binval(tp
, bp
);
1767 * This is called to return an inode to the inode free list. The inode should
1768 * already be truncated to 0 length and have no pages associated with it. This
1769 * routine also assumes that the inode is already a part of the transaction.
1771 * The on-disk copy of the inode will have been added to the list of unlinked
1772 * inodes in the AGI. We need to remove the inode from that list atomically with
1773 * respect to freeing it here.
1777 struct xfs_trans
*tp
,
1778 struct xfs_inode
*ip
)
1780 struct xfs_mount
*mp
= ip
->i_mount
;
1781 struct xfs_perag
*pag
;
1782 struct xfs_icluster xic
= { 0 };
1783 struct xfs_inode_log_item
*iip
= ip
->i_itemp
;
1786 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1787 ASSERT(VFS_I(ip
)->i_nlink
== 0);
1788 ASSERT(ip
->i_df
.if_nextents
== 0);
1789 ASSERT(ip
->i_disk_size
== 0 || !S_ISREG(VFS_I(ip
)->i_mode
));
1790 ASSERT(ip
->i_nblocks
== 0);
1792 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1794 error
= xfs_inode_uninit(tp
, pag
, ip
, &xic
);
1798 if (xfs_iflags_test(ip
, XFS_IPRESERVE_DM_FIELDS
))
1799 xfs_iflags_clear(ip
, XFS_IPRESERVE_DM_FIELDS
);
1801 /* Don't attempt to replay owner changes for a deleted inode */
1802 spin_lock(&iip
->ili_lock
);
1803 iip
->ili_fields
&= ~(XFS_ILOG_AOWNER
| XFS_ILOG_DOWNER
);
1804 spin_unlock(&iip
->ili_lock
);
1807 error
= xfs_ifree_cluster(tp
, pag
, ip
, &xic
);
1814 * This is called to unpin an inode. The caller must have the inode locked
1815 * in at least shared mode so that the buffer cannot be subsequently pinned
1816 * once someone is waiting for it to be unpinned.
1820 struct xfs_inode
*ip
)
1822 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
| XFS_ILOCK_SHARED
);
1824 trace_xfs_inode_unpin_nowait(ip
, _RET_IP_
);
1826 /* Give the log a push to start the unpinning I/O */
1827 xfs_log_force_seq(ip
->i_mount
, ip
->i_itemp
->ili_commit_seq
, 0, NULL
);
1833 struct xfs_inode
*ip
)
1835 wait_queue_head_t
*wq
= bit_waitqueue(&ip
->i_flags
, __XFS_IPINNED_BIT
);
1836 DEFINE_WAIT_BIT(wait
, &ip
->i_flags
, __XFS_IPINNED_BIT
);
1841 prepare_to_wait(wq
, &wait
.wq_entry
, TASK_UNINTERRUPTIBLE
);
1842 if (xfs_ipincount(ip
))
1844 } while (xfs_ipincount(ip
));
1845 finish_wait(wq
, &wait
.wq_entry
);
1850 struct xfs_inode
*ip
)
1852 if (xfs_ipincount(ip
))
1853 __xfs_iunpin_wait(ip
);
1857 * Removing an inode from the namespace involves removing the directory entry
1858 * and dropping the link count on the inode. Removing the directory entry can
1859 * result in locking an AGF (directory blocks were freed) and removing a link
1860 * count can result in placing the inode on an unlinked list which results in
1863 * The big problem here is that we have an ordering constraint on AGF and AGI
1864 * locking - inode allocation locks the AGI, then can allocate a new extent for
1865 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
1866 * removes the inode from the unlinked list, requiring that we lock the AGI
1867 * first, and then freeing the inode can result in an inode chunk being freed
1868 * and hence freeing disk space requiring that we lock an AGF.
1870 * Hence the ordering that is imposed by other parts of the code is AGI before
1871 * AGF. This means we cannot remove the directory entry before we drop the inode
1872 * reference count and put it on the unlinked list as this results in a lock
1873 * order of AGF then AGI, and this can deadlock against inode allocation and
1874 * freeing. Therefore we must drop the link counts before we remove the
1877 * This is still safe from a transactional point of view - it is not until we
1878 * get to xfs_defer_finish() that we have the possibility of multiple
1879 * transactions in this operation. Hence as long as we remove the directory
1880 * entry and drop the link count in the first transaction of the remove
1881 * operation, there are no transactional constraints on the ordering here.
1885 struct xfs_inode
*dp
,
1886 struct xfs_name
*name
,
1887 struct xfs_inode
*ip
)
1889 struct xfs_dir_update du
= {
1894 struct xfs_mount
*mp
= dp
->i_mount
;
1895 struct xfs_trans
*tp
= NULL
;
1896 int is_dir
= S_ISDIR(VFS_I(ip
)->i_mode
);
1901 trace_xfs_remove(dp
, name
);
1903 if (xfs_is_shutdown(mp
))
1905 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
1908 error
= xfs_qm_dqattach(dp
);
1912 error
= xfs_qm_dqattach(ip
);
1916 error
= xfs_parent_start(mp
, &du
.ppargs
);
1921 * We try to get the real space reservation first, allowing for
1922 * directory btree deletion(s) implying possible bmap insert(s). If we
1923 * can't get the space reservation then we use 0 instead, and avoid the
1924 * bmap btree insert(s) in the directory code by, if the bmap insert
1925 * tries to happen, instead trimming the LAST block from the directory.
1927 * Ignore EDQUOT and ENOSPC being returned via nospace_error because
1928 * the directory code can handle a reservationless update and we don't
1929 * want to prevent a user from trying to free space by deleting things.
1931 resblks
= xfs_remove_space_res(mp
, name
->len
);
1932 error
= xfs_trans_alloc_dir(dp
, &M_RES(mp
)->tr_remove
, ip
, &resblks
,
1935 ASSERT(error
!= -ENOSPC
);
1939 error
= xfs_dir_remove_child(tp
, resblks
, &du
);
1941 goto out_trans_cancel
;
1944 * If this is a synchronous mount, make sure that the
1945 * remove transaction goes to disk before returning to
1948 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
1949 xfs_trans_set_sync(tp
);
1951 error
= xfs_trans_commit(tp
);
1955 if (is_dir
&& xfs_inode_is_filestream(ip
))
1956 xfs_filestream_deassociate(ip
);
1958 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1959 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
1960 xfs_parent_finish(mp
, du
.ppargs
);
1964 xfs_trans_cancel(tp
);
1966 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1967 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
1969 xfs_parent_finish(mp
, du
.ppargs
);
1976 struct xfs_inode
**i_tab
,
1981 for (i
= num_inodes
- 1; i
>= 0; i
--) {
1982 /* Skip duplicate inodes if src and target dps are the same */
1983 if (!i_tab
[i
] || (i
> 0 && i_tab
[i
] == i_tab
[i
- 1]))
1985 xfs_iunlock(i_tab
[i
], XFS_ILOCK_EXCL
);
1990 * Enter all inodes for a rename transaction into a sorted array.
1992 #define __XFS_SORT_INODES 5
1994 xfs_sort_for_rename(
1995 struct xfs_inode
*dp1
, /* in: old (source) directory inode */
1996 struct xfs_inode
*dp2
, /* in: new (target) directory inode */
1997 struct xfs_inode
*ip1
, /* in: inode of old entry */
1998 struct xfs_inode
*ip2
, /* in: inode of new entry */
1999 struct xfs_inode
*wip
, /* in: whiteout inode */
2000 struct xfs_inode
**i_tab
,/* out: sorted array of inodes */
2001 int *num_inodes
) /* in/out: inodes in array */
2005 ASSERT(*num_inodes
== __XFS_SORT_INODES
);
2006 memset(i_tab
, 0, *num_inodes
* sizeof(struct xfs_inode
*));
2009 * i_tab contains a list of pointers to inodes. We initialize
2010 * the table here & we'll sort it. We will then use it to
2011 * order the acquisition of the inode locks.
2013 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2025 xfs_sort_inodes(i_tab
, *num_inodes
);
2030 struct xfs_inode
**i_tab
,
2031 unsigned int num_inodes
)
2035 ASSERT(num_inodes
<= __XFS_SORT_INODES
);
2038 * Sort the elements via bubble sort. (Remember, there are at
2039 * most 5 elements to sort, so this is adequate.)
2041 for (i
= 0; i
< num_inodes
; i
++) {
2042 for (j
= 1; j
< num_inodes
; j
++) {
2043 if (i_tab
[j
]->i_ino
< i_tab
[j
-1]->i_ino
)
2044 swap(i_tab
[j
], i_tab
[j
- 1]);
2050 * xfs_rename_alloc_whiteout()
2052 * Return a referenced, unlinked, unlocked inode that can be used as a
2053 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2054 * crash between allocating the inode and linking it into the rename transaction
2055 * recovery will free the inode and we won't leak it.
2058 xfs_rename_alloc_whiteout(
2059 struct mnt_idmap
*idmap
,
2060 struct xfs_name
*src_name
,
2061 struct xfs_inode
*dp
,
2062 struct xfs_inode
**wip
)
2064 struct xfs_icreate_args args
= {
2067 .mode
= S_IFCHR
| WHITEOUT_MODE
,
2068 .flags
= XFS_ICREATE_TMPFILE
,
2070 struct xfs_inode
*tmpfile
;
2074 error
= xfs_create_tmpfile(&args
, &tmpfile
);
2078 name
.name
= src_name
->name
;
2079 name
.len
= src_name
->len
;
2080 error
= xfs_inode_init_security(VFS_I(tmpfile
), VFS_I(dp
), &name
);
2082 xfs_finish_inode_setup(tmpfile
);
2088 * Prepare the tmpfile inode as if it were created through the VFS.
2089 * Complete the inode setup and flag it as linkable. nlink is already
2090 * zero, so we can skip the drop_nlink.
2092 xfs_setup_iops(tmpfile
);
2093 xfs_finish_inode_setup(tmpfile
);
2094 VFS_I(tmpfile
)->i_state
|= I_LINKABLE
;
2105 struct mnt_idmap
*idmap
,
2106 struct xfs_inode
*src_dp
,
2107 struct xfs_name
*src_name
,
2108 struct xfs_inode
*src_ip
,
2109 struct xfs_inode
*target_dp
,
2110 struct xfs_name
*target_name
,
2111 struct xfs_inode
*target_ip
,
2114 struct xfs_dir_update du_src
= {
2119 struct xfs_dir_update du_tgt
= {
2121 .name
= target_name
,
2124 struct xfs_dir_update du_wip
= { };
2125 struct xfs_mount
*mp
= src_dp
->i_mount
;
2126 struct xfs_trans
*tp
;
2127 struct xfs_inode
*inodes
[__XFS_SORT_INODES
];
2129 int num_inodes
= __XFS_SORT_INODES
;
2130 bool new_parent
= (src_dp
!= target_dp
);
2131 bool src_is_directory
= S_ISDIR(VFS_I(src_ip
)->i_mode
);
2133 bool retried
= false;
2134 int error
, nospace_error
= 0;
2136 trace_xfs_rename(src_dp
, target_dp
, src_name
, target_name
);
2138 if ((flags
& RENAME_EXCHANGE
) && !target_ip
)
2142 * If we are doing a whiteout operation, allocate the whiteout inode
2143 * we will be placing at the target and ensure the type is set
2146 if (flags
& RENAME_WHITEOUT
) {
2147 error
= xfs_rename_alloc_whiteout(idmap
, src_name
, target_dp
,
2152 /* setup target dirent info as whiteout */
2153 src_name
->type
= XFS_DIR3_FT_CHRDEV
;
2156 xfs_sort_for_rename(src_dp
, target_dp
, src_ip
, target_ip
, du_wip
.ip
,
2157 inodes
, &num_inodes
);
2159 error
= xfs_parent_start(mp
, &du_src
.ppargs
);
2161 goto out_release_wip
;
2164 error
= xfs_parent_start(mp
, &du_wip
.ppargs
);
2166 goto out_src_ppargs
;
2170 error
= xfs_parent_start(mp
, &du_tgt
.ppargs
);
2172 goto out_wip_ppargs
;
2177 spaceres
= xfs_rename_space_res(mp
, src_name
->len
, target_ip
!= NULL
,
2178 target_name
->len
, du_wip
.ip
!= NULL
);
2179 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_rename
, spaceres
, 0, 0, &tp
);
2180 if (error
== -ENOSPC
) {
2181 nospace_error
= error
;
2183 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_rename
, 0, 0, 0,
2187 goto out_tgt_ppargs
;
2190 * We don't allow reservationless renaming when parent pointers are
2191 * enabled because we can't back out if the xattrs must grow.
2193 if (du_src
.ppargs
&& nospace_error
) {
2194 error
= nospace_error
;
2195 xfs_trans_cancel(tp
);
2196 goto out_tgt_ppargs
;
2200 * Attach the dquots to the inodes
2202 error
= xfs_qm_vop_rename_dqattach(inodes
);
2204 xfs_trans_cancel(tp
);
2205 goto out_tgt_ppargs
;
2209 * Lock all the participating inodes. Depending upon whether
2210 * the target_name exists in the target directory, and
2211 * whether the target directory is the same as the source
2212 * directory, we can lock from 2 to 5 inodes.
2214 xfs_lock_inodes(inodes
, num_inodes
, XFS_ILOCK_EXCL
);
2217 * Join all the inodes to the transaction.
2219 xfs_trans_ijoin(tp
, src_dp
, 0);
2221 xfs_trans_ijoin(tp
, target_dp
, 0);
2222 xfs_trans_ijoin(tp
, src_ip
, 0);
2224 xfs_trans_ijoin(tp
, target_ip
, 0);
2226 xfs_trans_ijoin(tp
, du_wip
.ip
, 0);
2229 * If we are using project inheritance, we only allow renames
2230 * into our tree when the project IDs are the same; else the
2231 * tree quota mechanism would be circumvented.
2233 if (unlikely((target_dp
->i_diflags
& XFS_DIFLAG_PROJINHERIT
) &&
2234 target_dp
->i_projid
!= src_ip
->i_projid
)) {
2236 goto out_trans_cancel
;
2239 /* RENAME_EXCHANGE is unique from here on. */
2240 if (flags
& RENAME_EXCHANGE
) {
2241 error
= xfs_dir_exchange_children(tp
, &du_src
, &du_tgt
,
2244 goto out_trans_cancel
;
2249 * Try to reserve quota to handle an expansion of the target directory.
2250 * We'll allow the rename to continue in reservationless mode if we hit
2251 * a space usage constraint. If we trigger reservationless mode, save
2252 * the errno if there isn't any free space in the target directory.
2254 if (spaceres
!= 0) {
2255 error
= xfs_trans_reserve_quota_nblks(tp
, target_dp
, spaceres
,
2257 if (error
== -EDQUOT
|| error
== -ENOSPC
) {
2259 xfs_trans_cancel(tp
);
2260 xfs_iunlock_rename(inodes
, num_inodes
);
2261 xfs_blockgc_free_quota(target_dp
, 0);
2266 nospace_error
= error
;
2271 goto out_trans_cancel
;
2275 * We don't allow quotaless renaming when parent pointers are enabled
2276 * because we can't back out if the xattrs must grow.
2278 if (du_src
.ppargs
&& nospace_error
) {
2279 error
= nospace_error
;
2280 goto out_trans_cancel
;
2284 * Lock the AGI buffers we need to handle bumping the nlink of the
2285 * whiteout inode off the unlinked list and to handle dropping the
2286 * nlink of the target inode. Per locking order rules, do this in
2287 * increasing AG order and before directory block allocation tries to
2288 * grab AGFs because we grab AGIs before AGFs.
2290 * The (vfs) caller must ensure that if src is a directory then
2291 * target_ip is either null or an empty directory.
2293 for (i
= 0; i
< num_inodes
&& inodes
[i
] != NULL
; i
++) {
2294 if (inodes
[i
] == du_wip
.ip
||
2295 (inodes
[i
] == target_ip
&&
2296 (VFS_I(target_ip
)->i_nlink
== 1 || src_is_directory
))) {
2297 struct xfs_perag
*pag
;
2300 pag
= xfs_perag_get(mp
,
2301 XFS_INO_TO_AGNO(mp
, inodes
[i
]->i_ino
));
2302 error
= xfs_read_agi(pag
, tp
, 0, &bp
);
2305 goto out_trans_cancel
;
2309 error
= xfs_dir_rename_children(tp
, &du_src
, &du_tgt
, spaceres
,
2312 goto out_trans_cancel
;
2316 * Now we have a real link, clear the "I'm a tmpfile" state
2317 * flag from the inode so it doesn't accidentally get misused in
2320 VFS_I(du_wip
.ip
)->i_state
&= ~I_LINKABLE
;
2325 * If this is a synchronous mount, make sure that the rename
2326 * transaction goes to disk before returning to the user.
2328 if (xfs_has_wsync(tp
->t_mountp
) || xfs_has_dirsync(tp
->t_mountp
))
2329 xfs_trans_set_sync(tp
);
2331 error
= xfs_trans_commit(tp
);
2336 xfs_trans_cancel(tp
);
2338 xfs_iunlock_rename(inodes
, num_inodes
);
2340 xfs_parent_finish(mp
, du_tgt
.ppargs
);
2342 xfs_parent_finish(mp
, du_wip
.ppargs
);
2344 xfs_parent_finish(mp
, du_src
.ppargs
);
2347 xfs_irele(du_wip
.ip
);
2348 if (error
== -ENOSPC
&& nospace_error
)
2349 error
= nospace_error
;
2355 struct xfs_inode
*ip
,
2358 struct xfs_inode_log_item
*iip
= ip
->i_itemp
;
2359 struct xfs_dinode
*dip
;
2360 struct xfs_mount
*mp
= ip
->i_mount
;
2363 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
| XFS_ILOCK_SHARED
);
2364 ASSERT(xfs_iflags_test(ip
, XFS_IFLUSHING
));
2365 ASSERT(ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
||
2366 ip
->i_df
.if_nextents
> XFS_IFORK_MAXEXT(ip
, XFS_DATA_FORK
));
2367 ASSERT(iip
->ili_item
.li_buf
== bp
);
2369 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
2372 * We don't flush the inode if any of the following checks fail, but we
2373 * do still update the log item and attach to the backing buffer as if
2374 * the flush happened. This is a formality to facilitate predictable
2375 * error handling as the caller will shutdown and fail the buffer.
2377 error
= -EFSCORRUPTED
;
2378 if (XFS_TEST_ERROR(dip
->di_magic
!= cpu_to_be16(XFS_DINODE_MAGIC
),
2379 mp
, XFS_ERRTAG_IFLUSH_1
)) {
2380 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2381 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT
,
2382 __func__
, ip
->i_ino
, be16_to_cpu(dip
->di_magic
), dip
);
2385 if (S_ISREG(VFS_I(ip
)->i_mode
)) {
2387 ip
->i_df
.if_format
!= XFS_DINODE_FMT_EXTENTS
&&
2388 ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
,
2389 mp
, XFS_ERRTAG_IFLUSH_3
)) {
2390 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2391 "%s: Bad regular inode %llu, ptr "PTR_FMT
,
2392 __func__
, ip
->i_ino
, ip
);
2395 } else if (S_ISDIR(VFS_I(ip
)->i_mode
)) {
2397 ip
->i_df
.if_format
!= XFS_DINODE_FMT_EXTENTS
&&
2398 ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
&&
2399 ip
->i_df
.if_format
!= XFS_DINODE_FMT_LOCAL
,
2400 mp
, XFS_ERRTAG_IFLUSH_4
)) {
2401 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2402 "%s: Bad directory inode %llu, ptr "PTR_FMT
,
2403 __func__
, ip
->i_ino
, ip
);
2407 if (XFS_TEST_ERROR(ip
->i_df
.if_nextents
+ xfs_ifork_nextents(&ip
->i_af
) >
2408 ip
->i_nblocks
, mp
, XFS_ERRTAG_IFLUSH_5
)) {
2409 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2410 "%s: detected corrupt incore inode %llu, "
2411 "total extents = %llu nblocks = %lld, ptr "PTR_FMT
,
2412 __func__
, ip
->i_ino
,
2413 ip
->i_df
.if_nextents
+ xfs_ifork_nextents(&ip
->i_af
),
2417 if (XFS_TEST_ERROR(ip
->i_forkoff
> mp
->m_sb
.sb_inodesize
,
2418 mp
, XFS_ERRTAG_IFLUSH_6
)) {
2419 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2420 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT
,
2421 __func__
, ip
->i_ino
, ip
->i_forkoff
, ip
);
2426 * Inode item log recovery for v2 inodes is dependent on the flushiter
2427 * count for correct sequencing. We bump the flush iteration count so
2428 * we can detect flushes which postdate a log record during recovery.
2429 * This is redundant as we now log every change and hence this can't
2430 * happen but we need to still do it to ensure backwards compatibility
2431 * with old kernels that predate logging all inode changes.
2433 if (!xfs_has_v3inodes(mp
))
2437 * If there are inline format data / attr forks attached to this inode,
2438 * make sure they are not corrupt.
2440 if (ip
->i_df
.if_format
== XFS_DINODE_FMT_LOCAL
&&
2441 xfs_ifork_verify_local_data(ip
))
2443 if (xfs_inode_has_attr_fork(ip
) &&
2444 ip
->i_af
.if_format
== XFS_DINODE_FMT_LOCAL
&&
2445 xfs_ifork_verify_local_attr(ip
))
2449 * Copy the dirty parts of the inode into the on-disk inode. We always
2450 * copy out the core of the inode, because if the inode is dirty at all
2453 xfs_inode_to_disk(ip
, dip
, iip
->ili_item
.li_lsn
);
2455 /* Wrap, we never let the log put out DI_MAX_FLUSH */
2456 if (!xfs_has_v3inodes(mp
)) {
2457 if (ip
->i_flushiter
== DI_MAX_FLUSH
)
2458 ip
->i_flushiter
= 0;
2461 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
2462 if (xfs_inode_has_attr_fork(ip
))
2463 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
2466 * We've recorded everything logged in the inode, so we'd like to clear
2467 * the ili_fields bits so we don't log and flush things unnecessarily.
2468 * However, we can't stop logging all this information until the data
2469 * we've copied into the disk buffer is written to disk. If we did we
2470 * might overwrite the copy of the inode in the log with all the data
2471 * after re-logging only part of it, and in the face of a crash we
2472 * wouldn't have all the data we need to recover.
2474 * What we do is move the bits to the ili_last_fields field. When
2475 * logging the inode, these bits are moved back to the ili_fields field.
2476 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
2477 * we know that the information those bits represent is permanently on
2478 * disk. As long as the flush completes before the inode is logged
2479 * again, then both ili_fields and ili_last_fields will be cleared.
2483 spin_lock(&iip
->ili_lock
);
2484 iip
->ili_last_fields
= iip
->ili_fields
;
2485 iip
->ili_fields
= 0;
2486 iip
->ili_fsync_fields
= 0;
2487 set_bit(XFS_LI_FLUSHING
, &iip
->ili_item
.li_flags
);
2488 spin_unlock(&iip
->ili_lock
);
2491 * Store the current LSN of the inode so that we can tell whether the
2492 * item has moved in the AIL from xfs_buf_inode_iodone().
2494 xfs_trans_ail_copy_lsn(mp
->m_ail
, &iip
->ili_flush_lsn
,
2495 &iip
->ili_item
.li_lsn
);
2497 /* generate the checksum. */
2498 xfs_dinode_calc_crc(mp
, dip
);
2500 xfs_inode_mark_sick(ip
, XFS_SICK_INO_CORE
);
2505 * Non-blocking flush of dirty inode metadata into the backing buffer.
2507 * The caller must have a reference to the inode and hold the cluster buffer
2508 * locked. The function will walk across all the inodes on the cluster buffer it
2509 * can find and lock without blocking, and flush them to the cluster buffer.
2511 * On successful flushing of at least one inode, the caller must write out the
2512 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
2513 * the caller needs to release the buffer. On failure, the filesystem will be
2514 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
2521 struct xfs_mount
*mp
= bp
->b_mount
;
2522 struct xfs_log_item
*lip
, *n
;
2523 struct xfs_inode
*ip
;
2524 struct xfs_inode_log_item
*iip
;
2529 * We must use the safe variant here as on shutdown xfs_iflush_abort()
2530 * will remove itself from the list.
2532 list_for_each_entry_safe(lip
, n
, &bp
->b_li_list
, li_bio_list
) {
2533 iip
= (struct xfs_inode_log_item
*)lip
;
2534 ip
= iip
->ili_inode
;
2537 * Quick and dirty check to avoid locks if possible.
2539 if (__xfs_iflags_test(ip
, XFS_IRECLAIM
| XFS_IFLUSHING
))
2541 if (xfs_ipincount(ip
))
2545 * The inode is still attached to the buffer, which means it is
2546 * dirty but reclaim might try to grab it. Check carefully for
2547 * that, and grab the ilock while still holding the i_flags_lock
2548 * to guarantee reclaim will not be able to reclaim this inode
2549 * once we drop the i_flags_lock.
2551 spin_lock(&ip
->i_flags_lock
);
2552 ASSERT(!__xfs_iflags_test(ip
, XFS_ISTALE
));
2553 if (__xfs_iflags_test(ip
, XFS_IRECLAIM
| XFS_IFLUSHING
)) {
2554 spin_unlock(&ip
->i_flags_lock
);
2559 * ILOCK will pin the inode against reclaim and prevent
2560 * concurrent transactions modifying the inode while we are
2561 * flushing the inode. If we get the lock, set the flushing
2562 * state before we drop the i_flags_lock.
2564 if (!xfs_ilock_nowait(ip
, XFS_ILOCK_SHARED
)) {
2565 spin_unlock(&ip
->i_flags_lock
);
2568 __xfs_iflags_set(ip
, XFS_IFLUSHING
);
2569 spin_unlock(&ip
->i_flags_lock
);
2572 * Abort flushing this inode if we are shut down because the
2573 * inode may not currently be in the AIL. This can occur when
2574 * log I/O failure unpins the inode without inserting into the
2575 * AIL, leaving a dirty/unpinned inode attached to the buffer
2576 * that otherwise looks like it should be flushed.
2578 if (xlog_is_shutdown(mp
->m_log
)) {
2579 xfs_iunpin_wait(ip
);
2580 xfs_iflush_abort(ip
);
2581 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2586 /* don't block waiting on a log force to unpin dirty inodes */
2587 if (xfs_ipincount(ip
)) {
2588 xfs_iflags_clear(ip
, XFS_IFLUSHING
);
2589 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2593 if (!xfs_inode_clean(ip
))
2594 error
= xfs_iflush(ip
, bp
);
2596 xfs_iflags_clear(ip
, XFS_IFLUSHING
);
2597 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2605 * Shutdown first so we kill the log before we release this
2606 * buffer. If it is an INODE_ALLOC buffer and pins the tail
2607 * of the log, failing it before the _log_ is shut down can
2608 * result in the log tail being moved forward in the journal
2609 * on disk because log writes can still be taking place. Hence
2610 * unpinning the tail will allow the ICREATE intent to be
2611 * removed from the log and recovery will fail with uninitialised
2612 * inode cluster buffers.
2614 xfs_force_shutdown(mp
, SHUTDOWN_CORRUPT_INCORE
);
2615 bp
->b_flags
|= XBF_ASYNC
;
2616 xfs_buf_ioend_fail(bp
);
2623 XFS_STATS_INC(mp
, xs_icluster_flushcnt
);
2624 XFS_STATS_ADD(mp
, xs_icluster_flushinode
, clcount
);
2629 /* Release an inode. */
2632 struct xfs_inode
*ip
)
2634 trace_xfs_irele(ip
, _RET_IP_
);
2639 * Ensure all committed transactions touching the inode are written to the log.
2642 xfs_log_force_inode(
2643 struct xfs_inode
*ip
)
2647 xfs_ilock(ip
, XFS_ILOCK_SHARED
);
2648 if (xfs_ipincount(ip
))
2649 seq
= ip
->i_itemp
->ili_commit_seq
;
2650 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2654 return xfs_log_force_seq(ip
->i_mount
, seq
, XFS_LOG_SYNC
, NULL
);
2658 * Grab the exclusive iolock for a data copy from src to dest, making sure to
2659 * abide by vfs locking order (lowest pointer value goes first) and breaking the
2660 * layout leases before proceeding. The loop is needed because we cannot call
2661 * the blocking break_layout() with the iolocks held, and therefore have to
2662 * back out both locks.
2665 xfs_iolock_two_inodes_and_break_layout(
2675 /* Wait to break both inodes' layouts before we start locking. */
2676 error
= break_layout(src
, true);
2680 error
= break_layout(dest
, true);
2685 /* Lock one inode and make sure nobody got in and leased it. */
2687 error
= break_layout(src
, false);
2690 if (error
== -EWOULDBLOCK
)
2698 /* Lock the other inode and make sure nobody got in and leased it. */
2699 inode_lock_nested(dest
, I_MUTEX_NONDIR2
);
2700 error
= break_layout(dest
, false);
2704 if (error
== -EWOULDBLOCK
)
2713 xfs_mmaplock_two_inodes_and_break_dax_layout(
2714 struct xfs_inode
*ip1
,
2715 struct xfs_inode
*ip2
)
2721 if (ip1
->i_ino
> ip2
->i_ino
)
2726 /* Lock the first inode */
2727 xfs_ilock(ip1
, XFS_MMAPLOCK_EXCL
);
2728 error
= xfs_break_dax_layouts(VFS_I(ip1
), &retry
);
2729 if (error
|| retry
) {
2730 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2731 if (error
== 0 && retry
)
2739 /* Nested lock the second inode */
2740 xfs_ilock(ip2
, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL
, 1));
2742 * We cannot use xfs_break_dax_layouts() directly here because it may
2743 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
2744 * for this nested lock case.
2746 page
= dax_layout_busy_page(VFS_I(ip2
)->i_mapping
);
2747 if (page
&& page_ref_count(page
) != 1) {
2748 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2749 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2757 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
2762 struct xfs_inode
*ip1
,
2763 struct xfs_inode
*ip2
)
2767 ret
= xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1
), VFS_I(ip2
));
2771 if (IS_DAX(VFS_I(ip1
)) && IS_DAX(VFS_I(ip2
))) {
2772 ret
= xfs_mmaplock_two_inodes_and_break_dax_layout(ip1
, ip2
);
2774 inode_unlock(VFS_I(ip2
));
2776 inode_unlock(VFS_I(ip1
));
2780 filemap_invalidate_lock_two(VFS_I(ip1
)->i_mapping
,
2781 VFS_I(ip2
)->i_mapping
);
2786 /* Unlock both inodes to allow IO and mmap activity. */
2788 xfs_iunlock2_io_mmap(
2789 struct xfs_inode
*ip1
,
2790 struct xfs_inode
*ip2
)
2792 if (IS_DAX(VFS_I(ip1
)) && IS_DAX(VFS_I(ip2
))) {
2793 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2795 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2797 filemap_invalidate_unlock_two(VFS_I(ip1
)->i_mapping
,
2798 VFS_I(ip2
)->i_mapping
);
2800 inode_unlock(VFS_I(ip2
));
2802 inode_unlock(VFS_I(ip1
));
2805 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
2807 xfs_iunlock2_remapping(
2808 struct xfs_inode
*ip1
,
2809 struct xfs_inode
*ip2
)
2811 xfs_iflags_clear(ip1
, XFS_IREMAPPING
);
2814 xfs_iunlock(ip1
, XFS_MMAPLOCK_SHARED
);
2815 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2818 inode_unlock_shared(VFS_I(ip1
));
2819 inode_unlock(VFS_I(ip2
));
2823 * Reload the incore inode list for this inode. Caller should ensure that
2824 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
2825 * preventing other threads from executing.
2828 xfs_inode_reload_unlinked_bucket(
2829 struct xfs_trans
*tp
,
2830 struct xfs_inode
*ip
)
2832 struct xfs_mount
*mp
= tp
->t_mountp
;
2833 struct xfs_buf
*agibp
;
2834 struct xfs_agi
*agi
;
2835 struct xfs_perag
*pag
;
2836 xfs_agnumber_t agno
= XFS_INO_TO_AGNO(mp
, ip
->i_ino
);
2837 xfs_agino_t agino
= XFS_INO_TO_AGINO(mp
, ip
->i_ino
);
2838 xfs_agino_t prev_agino
, next_agino
;
2839 unsigned int bucket
;
2840 bool foundit
= false;
2843 /* Grab the first inode in the list */
2844 pag
= xfs_perag_get(mp
, agno
);
2845 error
= xfs_ialloc_read_agi(pag
, tp
, 0, &agibp
);
2851 * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
2852 * incore unlinked list pointers for this inode. Check once more to
2853 * see if we raced with anyone else to reload the unlinked list.
2855 if (!xfs_inode_unlinked_incomplete(ip
)) {
2860 bucket
= agino
% XFS_AGI_UNLINKED_BUCKETS
;
2861 agi
= agibp
->b_addr
;
2863 trace_xfs_inode_reload_unlinked_bucket(ip
);
2865 xfs_info_ratelimited(mp
,
2866 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
2869 prev_agino
= NULLAGINO
;
2870 next_agino
= be32_to_cpu(agi
->agi_unlinked
[bucket
]);
2871 while (next_agino
!= NULLAGINO
) {
2872 struct xfs_inode
*next_ip
= NULL
;
2874 /* Found this caller's inode, set its backlink. */
2875 if (next_agino
== agino
) {
2877 next_ip
->i_prev_unlinked
= prev_agino
;
2882 /* Try in-memory lookup first. */
2883 next_ip
= xfs_iunlink_lookup(pag
, next_agino
);
2887 /* Inode not in memory, try reloading it. */
2888 error
= xfs_iunlink_reload_next(tp
, agibp
, prev_agino
,
2893 /* Grab the reloaded inode. */
2894 next_ip
= xfs_iunlink_lookup(pag
, next_agino
);
2896 /* No incore inode at all? We reloaded it... */
2897 ASSERT(next_ip
!= NULL
);
2898 error
= -EFSCORRUPTED
;
2903 prev_agino
= next_agino
;
2904 next_agino
= next_ip
->i_next_unlinked
;
2908 xfs_trans_brelse(tp
, agibp
);
2909 /* Should have found this inode somewhere in the iunlinked bucket. */
2910 if (!error
&& !foundit
)
2911 error
= -EFSCORRUPTED
;
2915 /* Decide if this inode is missing its unlinked list and reload it. */
2917 xfs_inode_reload_unlinked(
2918 struct xfs_inode
*ip
)
2920 struct xfs_trans
*tp
;
2923 error
= xfs_trans_alloc_empty(ip
->i_mount
, &tp
);
2927 xfs_ilock(ip
, XFS_ILOCK_SHARED
);
2928 if (xfs_inode_unlinked_incomplete(ip
))
2929 error
= xfs_inode_reload_unlinked_bucket(tp
, ip
);
2930 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2931 xfs_trans_cancel(tp
);
2936 /* Has this inode fork been zapped by repair? */
2939 const struct xfs_inode
*ip
,
2942 unsigned int datamask
= 0;
2944 switch (whichfork
) {
2946 switch (ip
->i_vnode
.i_mode
& S_IFMT
) {
2948 datamask
= XFS_SICK_INO_DIR_ZAPPED
;
2951 datamask
= XFS_SICK_INO_SYMLINK_ZAPPED
;
2954 return ip
->i_sick
& (XFS_SICK_INO_BMBTD_ZAPPED
| datamask
);
2956 return ip
->i_sick
& XFS_SICK_INO_BMBTA_ZAPPED
;
2962 /* Compute the number of data and realtime blocks used by a file. */
2964 xfs_inode_count_blocks(
2965 struct xfs_trans
*tp
,
2966 struct xfs_inode
*ip
,
2967 xfs_filblks_t
*dblocks
,
2968 xfs_filblks_t
*rblocks
)
2970 struct xfs_ifork
*ifp
= xfs_ifork_ptr(ip
, XFS_DATA_FORK
);
2973 if (XFS_IS_REALTIME_INODE(ip
))
2974 xfs_bmap_count_leaves(ifp
, rblocks
);
2975 *dblocks
= ip
->i_nblocks
- *rblocks
;
2980 struct inode
*inode
)
2982 struct xfs_inode
*ip
= XFS_I(inode
);
2984 xfs_iunlock(ip
, XFS_MMAPLOCK_EXCL
);
2986 xfs_ilock(ip
, XFS_MMAPLOCK_EXCL
);
2990 xfs_break_dax_layouts(
2991 struct inode
*inode
,
2996 xfs_assert_ilocked(XFS_I(inode
), XFS_MMAPLOCK_EXCL
);
2998 page
= dax_layout_busy_page(inode
->i_mapping
);
3003 return ___wait_var_event(&page
->_refcount
,
3004 atomic_read(&page
->_refcount
) == 1, TASK_INTERRUPTIBLE
,
3005 0, 0, xfs_wait_dax_page(inode
));
3010 struct inode
*inode
,
3012 enum layout_break_reason reason
)
3017 xfs_assert_ilocked(XFS_I(inode
), XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
);
3023 error
= xfs_break_dax_layouts(inode
, &retry
);
3028 error
= xfs_break_leased_layouts(inode
, iolock
, &retry
);
3034 } while (error
== 0 && retry
);
3039 /* Returns the size of fundamental allocation unit for a file, in bytes. */
3041 xfs_inode_alloc_unitsize(
3042 struct xfs_inode
*ip
)
3044 unsigned int blocks
= 1;
3046 if (XFS_IS_REALTIME_INODE(ip
))
3047 blocks
= ip
->i_mount
->m_sb
.sb_rextsize
;
3049 return XFS_FSB_TO_B(ip
->i_mount
, blocks
);
3052 /* Should we always be using copy on write for file writes? */
3054 xfs_is_always_cow_inode(
3055 const struct xfs_inode
*ip
)
3057 return ip
->i_mount
->m_always_cow
&& xfs_has_reflink(ip
->i_mount
);