1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
6 #include <linux/iversion.h>
10 #include "xfs_shared.h"
11 #include "xfs_format.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
20 #include "xfs_trans_space.h"
21 #include "xfs_trans.h"
22 #include "xfs_buf_item.h"
23 #include "xfs_inode_item.h"
24 #include "xfs_iunlink_item.h"
25 #include "xfs_ialloc.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_errortag.h"
29 #include "xfs_error.h"
30 #include "xfs_quota.h"
31 #include "xfs_filestream.h"
32 #include "xfs_trace.h"
33 #include "xfs_icache.h"
34 #include "xfs_symlink.h"
35 #include "xfs_trans_priv.h"
37 #include "xfs_bmap_btree.h"
38 #include "xfs_reflink.h"
40 #include "xfs_log_priv.h"
41 #include "xfs_health.h"
43 #include "xfs_parent.h"
44 #include "xfs_xattr.h"
45 #include "xfs_inode_util.h"
47 struct kmem_cache
*xfs_inode_cache
;
50 * These two are wrapper routines around the xfs_ilock() routine used to
51 * centralize some grungy code. They are used in places that wish to lock the
52 * inode solely for reading the extents. The reason these places can't just
53 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
54 * bringing in of the extents from disk for a file in b-tree format. If the
55 * inode is in b-tree format, then we need to lock the inode exclusively until
56 * the extents are read in. Locking it exclusively all the time would limit
57 * our parallelism unnecessarily, though. What we do instead is check to see
58 * if the extents have been read in yet, and only lock the inode exclusively
61 * The functions return a value which should be given to the corresponding
65 xfs_ilock_data_map_shared(
68 uint lock_mode
= XFS_ILOCK_SHARED
;
70 if (xfs_need_iread_extents(&ip
->i_df
))
71 lock_mode
= XFS_ILOCK_EXCL
;
72 xfs_ilock(ip
, lock_mode
);
77 xfs_ilock_attr_map_shared(
80 uint lock_mode
= XFS_ILOCK_SHARED
;
82 if (xfs_inode_has_attr_fork(ip
) && xfs_need_iread_extents(&ip
->i_af
))
83 lock_mode
= XFS_ILOCK_EXCL
;
84 xfs_ilock(ip
, lock_mode
);
89 * You can't set both SHARED and EXCL for the same lock,
90 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
91 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
92 * to set in lock_flags.
95 xfs_lock_flags_assert(
98 ASSERT((lock_flags
& (XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
)) !=
99 (XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
));
100 ASSERT((lock_flags
& (XFS_MMAPLOCK_SHARED
| XFS_MMAPLOCK_EXCL
)) !=
101 (XFS_MMAPLOCK_SHARED
| XFS_MMAPLOCK_EXCL
));
102 ASSERT((lock_flags
& (XFS_ILOCK_SHARED
| XFS_ILOCK_EXCL
)) !=
103 (XFS_ILOCK_SHARED
| XFS_ILOCK_EXCL
));
104 ASSERT((lock_flags
& ~(XFS_LOCK_MASK
| XFS_LOCK_SUBCLASS_MASK
)) == 0);
105 ASSERT(lock_flags
!= 0);
109 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
110 * multi-reader locks: invalidate_lock and the i_lock. This routine allows
111 * various combinations of the locks to be obtained.
113 * The 3 locks should always be ordered so that the IO lock is obtained first,
114 * the mmap lock second and the ilock last in order to prevent deadlock.
116 * Basic locking order:
118 * i_rwsem -> invalidate_lock -> page_lock -> i_lock
120 * mmap_lock locking order:
122 * i_rwsem -> page lock -> mmap_lock
123 * mmap_lock -> invalidate_lock -> page_lock
125 * The difference in mmap_lock locking order means that we cannot hold the
126 * invalidate_lock over syscall based read(2)/write(2) IO. These IO paths
127 * can fault in pages during copy in/out (for buffered IO) or require the
128 * mmap_lock in get_user_pages() to map the user pages into the kernel address
129 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
130 * fault because page faults already hold the mmap_lock.
132 * Hence to serialise fully against both syscall and mmap based IO, we need to
133 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
134 * both taken in places where we need to invalidate the page cache in a race
135 * free manner (e.g. truncate, hole punch and other extent manipulation
143 trace_xfs_ilock(ip
, lock_flags
, _RET_IP_
);
145 xfs_lock_flags_assert(lock_flags
);
147 if (lock_flags
& XFS_IOLOCK_EXCL
) {
148 down_write_nested(&VFS_I(ip
)->i_rwsem
,
149 XFS_IOLOCK_DEP(lock_flags
));
150 } else if (lock_flags
& XFS_IOLOCK_SHARED
) {
151 down_read_nested(&VFS_I(ip
)->i_rwsem
,
152 XFS_IOLOCK_DEP(lock_flags
));
155 if (lock_flags
& XFS_MMAPLOCK_EXCL
) {
156 down_write_nested(&VFS_I(ip
)->i_mapping
->invalidate_lock
,
157 XFS_MMAPLOCK_DEP(lock_flags
));
158 } else if (lock_flags
& XFS_MMAPLOCK_SHARED
) {
159 down_read_nested(&VFS_I(ip
)->i_mapping
->invalidate_lock
,
160 XFS_MMAPLOCK_DEP(lock_flags
));
163 if (lock_flags
& XFS_ILOCK_EXCL
)
164 down_write_nested(&ip
->i_lock
, XFS_ILOCK_DEP(lock_flags
));
165 else if (lock_flags
& XFS_ILOCK_SHARED
)
166 down_read_nested(&ip
->i_lock
, XFS_ILOCK_DEP(lock_flags
));
170 * This is just like xfs_ilock(), except that the caller
171 * is guaranteed not to sleep. It returns 1 if it gets
172 * the requested locks and 0 otherwise. If the IO lock is
173 * obtained but the inode lock cannot be, then the IO lock
174 * is dropped before returning.
176 * ip -- the inode being locked
177 * lock_flags -- this parameter indicates the inode's locks to be
178 * locked. See the comment for xfs_ilock() for a list
186 trace_xfs_ilock_nowait(ip
, lock_flags
, _RET_IP_
);
188 xfs_lock_flags_assert(lock_flags
);
190 if (lock_flags
& XFS_IOLOCK_EXCL
) {
191 if (!down_write_trylock(&VFS_I(ip
)->i_rwsem
))
193 } else if (lock_flags
& XFS_IOLOCK_SHARED
) {
194 if (!down_read_trylock(&VFS_I(ip
)->i_rwsem
))
198 if (lock_flags
& XFS_MMAPLOCK_EXCL
) {
199 if (!down_write_trylock(&VFS_I(ip
)->i_mapping
->invalidate_lock
))
200 goto out_undo_iolock
;
201 } else if (lock_flags
& XFS_MMAPLOCK_SHARED
) {
202 if (!down_read_trylock(&VFS_I(ip
)->i_mapping
->invalidate_lock
))
203 goto out_undo_iolock
;
206 if (lock_flags
& XFS_ILOCK_EXCL
) {
207 if (!down_write_trylock(&ip
->i_lock
))
208 goto out_undo_mmaplock
;
209 } else if (lock_flags
& XFS_ILOCK_SHARED
) {
210 if (!down_read_trylock(&ip
->i_lock
))
211 goto out_undo_mmaplock
;
216 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
217 up_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
218 else if (lock_flags
& XFS_MMAPLOCK_SHARED
)
219 up_read(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
221 if (lock_flags
& XFS_IOLOCK_EXCL
)
222 up_write(&VFS_I(ip
)->i_rwsem
);
223 else if (lock_flags
& XFS_IOLOCK_SHARED
)
224 up_read(&VFS_I(ip
)->i_rwsem
);
230 * xfs_iunlock() is used to drop the inode locks acquired with
231 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
232 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
233 * that we know which locks to drop.
235 * ip -- the inode being unlocked
236 * lock_flags -- this parameter indicates the inode's locks to be
237 * unlocked. See the comment for xfs_ilock() for a list
238 * of valid values for this parameter.
246 xfs_lock_flags_assert(lock_flags
);
248 if (lock_flags
& XFS_IOLOCK_EXCL
)
249 up_write(&VFS_I(ip
)->i_rwsem
);
250 else if (lock_flags
& XFS_IOLOCK_SHARED
)
251 up_read(&VFS_I(ip
)->i_rwsem
);
253 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
254 up_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
255 else if (lock_flags
& XFS_MMAPLOCK_SHARED
)
256 up_read(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
258 if (lock_flags
& XFS_ILOCK_EXCL
)
259 up_write(&ip
->i_lock
);
260 else if (lock_flags
& XFS_ILOCK_SHARED
)
261 up_read(&ip
->i_lock
);
263 trace_xfs_iunlock(ip
, lock_flags
, _RET_IP_
);
267 * give up write locks. the i/o lock cannot be held nested
268 * if it is being demoted.
275 ASSERT(lock_flags
& (XFS_IOLOCK_EXCL
|XFS_MMAPLOCK_EXCL
|XFS_ILOCK_EXCL
));
277 ~(XFS_IOLOCK_EXCL
|XFS_MMAPLOCK_EXCL
|XFS_ILOCK_EXCL
)) == 0);
279 if (lock_flags
& XFS_ILOCK_EXCL
)
280 downgrade_write(&ip
->i_lock
);
281 if (lock_flags
& XFS_MMAPLOCK_EXCL
)
282 downgrade_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
283 if (lock_flags
& XFS_IOLOCK_EXCL
)
284 downgrade_write(&VFS_I(ip
)->i_rwsem
);
286 trace_xfs_ilock_demote(ip
, lock_flags
, _RET_IP_
);
291 struct xfs_inode
*ip
,
295 * Sometimes we assert the ILOCK is held exclusively, but we're in
296 * a workqueue, so lockdep doesn't know we're the owner.
298 if (lock_flags
& XFS_ILOCK_SHARED
)
299 rwsem_assert_held(&ip
->i_lock
);
300 else if (lock_flags
& XFS_ILOCK_EXCL
)
301 rwsem_assert_held_write_nolockdep(&ip
->i_lock
);
303 if (lock_flags
& XFS_MMAPLOCK_SHARED
)
304 rwsem_assert_held(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
305 else if (lock_flags
& XFS_MMAPLOCK_EXCL
)
306 rwsem_assert_held_write(&VFS_I(ip
)->i_mapping
->invalidate_lock
);
308 if (lock_flags
& XFS_IOLOCK_SHARED
)
309 rwsem_assert_held(&VFS_I(ip
)->i_rwsem
);
310 else if (lock_flags
& XFS_IOLOCK_EXCL
)
311 rwsem_assert_held_write(&VFS_I(ip
)->i_rwsem
);
315 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
316 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
317 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
318 * errors and warnings.
320 #if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
322 xfs_lockdep_subclass_ok(
325 return subclass
< MAX_LOCKDEP_SUBCLASSES
;
328 #define xfs_lockdep_subclass_ok(subclass) (true)
332 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
333 * value. This can be called for any type of inode lock combination, including
334 * parent locking. Care must be taken to ensure we don't overrun the subclass
335 * storage fields in the class mask we build.
344 ASSERT(!(lock_mode
& (XFS_ILOCK_PARENT
| XFS_ILOCK_RTBITMAP
|
346 ASSERT(xfs_lockdep_subclass_ok(subclass
));
348 if (lock_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)) {
349 ASSERT(subclass
<= XFS_IOLOCK_MAX_SUBCLASS
);
350 class += subclass
<< XFS_IOLOCK_SHIFT
;
353 if (lock_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)) {
354 ASSERT(subclass
<= XFS_MMAPLOCK_MAX_SUBCLASS
);
355 class += subclass
<< XFS_MMAPLOCK_SHIFT
;
358 if (lock_mode
& (XFS_ILOCK_SHARED
|XFS_ILOCK_EXCL
)) {
359 ASSERT(subclass
<= XFS_ILOCK_MAX_SUBCLASS
);
360 class += subclass
<< XFS_ILOCK_SHIFT
;
363 return (lock_mode
& ~XFS_LOCK_SUBCLASS_MASK
) | class;
367 * The following routine will lock n inodes in exclusive mode. We assume the
368 * caller calls us with the inodes in i_ino order.
370 * We need to detect deadlock where an inode that we lock is in the AIL and we
371 * start waiting for another inode that is locked by a thread in a long running
372 * transaction (such as truncate). This can result in deadlock since the long
373 * running trans might need to wait for the inode we just locked in order to
374 * push the tail and free space in the log.
376 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
377 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
378 * lock more than one at a time, lockdep will report false positives saying we
379 * have violated locking orders.
383 struct xfs_inode
**ips
,
391 struct xfs_log_item
*lp
;
394 * Currently supports between 2 and 5 inodes with exclusive locking. We
395 * support an arbitrary depth of locking here, but absolute limits on
396 * inodes depend on the type of locking and the limits placed by
397 * lockdep annotations in xfs_lock_inumorder. These are all checked by
400 ASSERT(ips
&& inodes
>= 2 && inodes
<= 5);
401 ASSERT(lock_mode
& (XFS_IOLOCK_EXCL
| XFS_MMAPLOCK_EXCL
|
403 ASSERT(!(lock_mode
& (XFS_IOLOCK_SHARED
| XFS_MMAPLOCK_SHARED
|
405 ASSERT(!(lock_mode
& XFS_MMAPLOCK_EXCL
) ||
406 inodes
<= XFS_MMAPLOCK_MAX_SUBCLASS
+ 1);
407 ASSERT(!(lock_mode
& XFS_ILOCK_EXCL
) ||
408 inodes
<= XFS_ILOCK_MAX_SUBCLASS
+ 1);
410 if (lock_mode
& XFS_IOLOCK_EXCL
) {
411 ASSERT(!(lock_mode
& (XFS_MMAPLOCK_EXCL
| XFS_ILOCK_EXCL
)));
412 } else if (lock_mode
& XFS_MMAPLOCK_EXCL
)
413 ASSERT(!(lock_mode
& XFS_ILOCK_EXCL
));
418 for (; i
< inodes
; i
++) {
421 if (i
&& (ips
[i
] == ips
[i
- 1])) /* Already locked */
425 * If try_lock is not set yet, make sure all locked inodes are
426 * not in the AIL. If any are, set try_lock to be used later.
429 for (j
= (i
- 1); j
>= 0 && !try_lock
; j
--) {
430 lp
= &ips
[j
]->i_itemp
->ili_item
;
431 if (lp
&& test_bit(XFS_LI_IN_AIL
, &lp
->li_flags
))
437 * If any of the previous locks we have locked is in the AIL,
438 * we must TRY to get the second and subsequent locks. If
439 * we can't get any, we must release all we have
443 xfs_ilock(ips
[i
], xfs_lock_inumorder(lock_mode
, i
));
447 /* try_lock means we have an inode locked that is in the AIL. */
449 if (xfs_ilock_nowait(ips
[i
], xfs_lock_inumorder(lock_mode
, i
)))
453 * Unlock all previous guys and try again. xfs_iunlock will try
454 * to push the tail if the inode is in the AIL.
457 for (j
= i
- 1; j
>= 0; j
--) {
459 * Check to see if we've already unlocked this one. Not
460 * the first one going back, and the inode ptr is the
463 if (j
!= (i
- 1) && ips
[j
] == ips
[j
+ 1])
466 xfs_iunlock(ips
[j
], lock_mode
);
469 if ((attempts
% 5) == 0) {
470 delay(1); /* Don't just spin the CPU */
477 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
478 * mmaplock must be double-locked separately since we use i_rwsem and
479 * invalidate_lock for that. We now support taking one lock EXCL and the
484 struct xfs_inode
*ip0
,
486 struct xfs_inode
*ip1
,
490 struct xfs_log_item
*lp
;
492 ASSERT(hweight32(ip0_mode
) == 1);
493 ASSERT(hweight32(ip1_mode
) == 1);
494 ASSERT(!(ip0_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)));
495 ASSERT(!(ip1_mode
& (XFS_IOLOCK_SHARED
|XFS_IOLOCK_EXCL
)));
496 ASSERT(!(ip0_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)));
497 ASSERT(!(ip1_mode
& (XFS_MMAPLOCK_SHARED
|XFS_MMAPLOCK_EXCL
)));
498 ASSERT(ip0
->i_ino
!= ip1
->i_ino
);
500 if (ip0
->i_ino
> ip1
->i_ino
) {
502 swap(ip0_mode
, ip1_mode
);
506 xfs_ilock(ip0
, xfs_lock_inumorder(ip0_mode
, 0));
509 * If the first lock we have locked is in the AIL, we must TRY to get
510 * the second lock. If we can't get it, we must release the first one
513 lp
= &ip0
->i_itemp
->ili_item
;
514 if (lp
&& test_bit(XFS_LI_IN_AIL
, &lp
->li_flags
)) {
515 if (!xfs_ilock_nowait(ip1
, xfs_lock_inumorder(ip1_mode
, 1))) {
516 xfs_iunlock(ip0
, ip0_mode
);
517 if ((++attempts
% 5) == 0)
518 delay(1); /* Don't just spin the CPU */
522 xfs_ilock(ip1
, xfs_lock_inumorder(ip1_mode
, 1));
527 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
528 * is allowed, otherwise it has to be an exact match. If a CI match is found,
529 * ci_name->name will point to the actual name (caller must free) or
530 * will be set to NULL if an exact match is found.
534 struct xfs_inode
*dp
,
535 const struct xfs_name
*name
,
536 struct xfs_inode
**ipp
,
537 struct xfs_name
*ci_name
)
542 trace_xfs_lookup(dp
, name
);
544 if (xfs_is_shutdown(dp
->i_mount
))
546 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
549 error
= xfs_dir_lookup(NULL
, dp
, name
, &inum
, ci_name
);
553 error
= xfs_iget(dp
->i_mount
, NULL
, inum
, 0, 0, ipp
);
561 kfree(ci_name
->name
);
568 * Initialise a newly allocated inode and return the in-core inode to the
569 * caller locked exclusively.
571 * Caller is responsible for unlocking the inode manually upon return
575 struct xfs_trans
*tp
,
577 const struct xfs_icreate_args
*args
,
578 struct xfs_inode
**ipp
)
580 struct xfs_mount
*mp
= tp
->t_mountp
;
581 struct xfs_inode
*ip
= NULL
;
585 * Get the in-core inode with the lock held exclusively to prevent
586 * others from looking at until we're done.
588 error
= xfs_iget(mp
, tp
, ino
, XFS_IGET_CREATE
, XFS_ILOCK_EXCL
, &ip
);
593 xfs_trans_ijoin(tp
, ip
, 0);
594 xfs_inode_init(tp
, args
, ip
);
596 /* now that we have an i_mode we can setup the inode structure */
603 /* Return dquots for the ids that will be assigned to a new file. */
606 const struct xfs_icreate_args
*args
,
607 struct xfs_dquot
**udqpp
,
608 struct xfs_dquot
**gdqpp
,
609 struct xfs_dquot
**pdqpp
)
611 struct inode
*dir
= VFS_I(args
->pip
);
612 kuid_t uid
= GLOBAL_ROOT_UID
;
613 kgid_t gid
= GLOBAL_ROOT_GID
;
615 unsigned int flags
= XFS_QMOPT_QUOTALL
;
619 * The uid/gid computation code must match what the VFS uses to
620 * assign i_[ug]id. INHERIT adjusts the gid computation for
621 * setgid/grpid systems.
623 uid
= mapped_fsuid(args
->idmap
, i_user_ns(dir
));
624 gid
= mapped_fsgid(args
->idmap
, i_user_ns(dir
));
625 prid
= xfs_get_initial_prid(args
->pip
);
626 flags
|= XFS_QMOPT_INHERIT
;
629 *udqpp
= *gdqpp
= *pdqpp
= NULL
;
631 return xfs_qm_vop_dqalloc(args
->pip
, uid
, gid
, prid
, flags
, udqpp
,
637 const struct xfs_icreate_args
*args
,
638 struct xfs_name
*name
,
639 struct xfs_inode
**ipp
)
641 struct xfs_inode
*dp
= args
->pip
;
642 struct xfs_dir_update du
= {
646 struct xfs_mount
*mp
= dp
->i_mount
;
647 struct xfs_trans
*tp
= NULL
;
648 struct xfs_dquot
*udqp
;
649 struct xfs_dquot
*gdqp
;
650 struct xfs_dquot
*pdqp
;
651 struct xfs_trans_res
*tres
;
653 bool unlock_dp_on_error
= false;
654 bool is_dir
= S_ISDIR(args
->mode
);
658 trace_xfs_create(dp
, name
);
660 if (xfs_is_shutdown(mp
))
662 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
665 /* Make sure that we have allocated dquot(s) on disk. */
666 error
= xfs_icreate_dqalloc(args
, &udqp
, &gdqp
, &pdqp
);
671 resblks
= xfs_mkdir_space_res(mp
, name
->len
);
672 tres
= &M_RES(mp
)->tr_mkdir
;
674 resblks
= xfs_create_space_res(mp
, name
->len
);
675 tres
= &M_RES(mp
)->tr_create
;
678 error
= xfs_parent_start(mp
, &du
.ppargs
);
680 goto out_release_dquots
;
683 * Initially assume that the file does not exist and
684 * reserve the resources for that case. If that is not
685 * the case we'll drop the one we have and get a more
686 * appropriate transaction later.
688 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
, resblks
,
690 if (error
== -ENOSPC
) {
691 /* flush outstanding delalloc blocks and retry */
692 xfs_flush_inodes(mp
);
693 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
,
699 xfs_ilock(dp
, XFS_ILOCK_EXCL
| XFS_ILOCK_PARENT
);
700 unlock_dp_on_error
= true;
703 * A newly created regular or special file just has one directory
704 * entry pointing to them, but a directory also the "." entry
705 * pointing to itself.
707 error
= xfs_dialloc(&tp
, args
, &ino
);
709 error
= xfs_icreate(tp
, ino
, args
, &du
.ip
);
711 goto out_trans_cancel
;
714 * Now we join the directory inode to the transaction. We do not do it
715 * earlier because xfs_dialloc might commit the previous transaction
716 * (and release all the locks). An error from here on will result in
717 * the transaction cancel unlocking dp so don't do it explicitly in the
720 xfs_trans_ijoin(tp
, dp
, 0);
722 error
= xfs_dir_create_child(tp
, resblks
, &du
);
724 goto out_trans_cancel
;
727 * If this is a synchronous mount, make sure that the
728 * create transaction goes to disk before returning to
731 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
732 xfs_trans_set_sync(tp
);
735 * Attach the dquot(s) to the inodes and modify them incore.
736 * These ids of the inode couldn't have changed since the new
737 * inode has been locked ever since it was created.
739 xfs_qm_vop_create_dqattach(tp
, du
.ip
, udqp
, gdqp
, pdqp
);
741 error
= xfs_trans_commit(tp
);
743 goto out_release_inode
;
750 xfs_iunlock(du
.ip
, XFS_ILOCK_EXCL
);
751 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
752 xfs_parent_finish(mp
, du
.ppargs
);
756 xfs_trans_cancel(tp
);
759 * Wait until after the current transaction is aborted to finish the
760 * setup of the inode and release the inode. This prevents recursive
761 * transactions and deadlocks from xfs_inactive.
764 xfs_iunlock(du
.ip
, XFS_ILOCK_EXCL
);
765 xfs_finish_inode_setup(du
.ip
);
769 xfs_parent_finish(mp
, du
.ppargs
);
775 if (unlock_dp_on_error
)
776 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
782 const struct xfs_icreate_args
*args
,
783 struct xfs_inode
**ipp
)
785 struct xfs_inode
*dp
= args
->pip
;
786 struct xfs_mount
*mp
= dp
->i_mount
;
787 struct xfs_inode
*ip
= NULL
;
788 struct xfs_trans
*tp
= NULL
;
789 struct xfs_dquot
*udqp
;
790 struct xfs_dquot
*gdqp
;
791 struct xfs_dquot
*pdqp
;
792 struct xfs_trans_res
*tres
;
797 ASSERT(args
->flags
& XFS_ICREATE_TMPFILE
);
799 if (xfs_is_shutdown(mp
))
802 /* Make sure that we have allocated dquot(s) on disk. */
803 error
= xfs_icreate_dqalloc(args
, &udqp
, &gdqp
, &pdqp
);
807 resblks
= XFS_IALLOC_SPACE_RES(mp
);
808 tres
= &M_RES(mp
)->tr_create_tmpfile
;
810 error
= xfs_trans_alloc_icreate(mp
, tres
, udqp
, gdqp
, pdqp
, resblks
,
813 goto out_release_dquots
;
815 error
= xfs_dialloc(&tp
, args
, &ino
);
817 error
= xfs_icreate(tp
, ino
, args
, &ip
);
819 goto out_trans_cancel
;
821 if (xfs_has_wsync(mp
))
822 xfs_trans_set_sync(tp
);
825 * Attach the dquot(s) to the inodes and modify them incore.
826 * These ids of the inode couldn't have changed since the new
827 * inode has been locked ever since it was created.
829 xfs_qm_vop_create_dqattach(tp
, ip
, udqp
, gdqp
, pdqp
);
831 error
= xfs_iunlink(tp
, ip
);
833 goto out_trans_cancel
;
835 error
= xfs_trans_commit(tp
);
837 goto out_release_inode
;
844 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
848 xfs_trans_cancel(tp
);
851 * Wait until after the current transaction is aborted to finish the
852 * setup of the inode and release the inode. This prevents recursive
853 * transactions and deadlocks from xfs_inactive.
856 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
857 xfs_finish_inode_setup(ip
);
870 struct xfs_inode
*tdp
,
871 struct xfs_inode
*sip
,
872 struct xfs_name
*target_name
)
874 struct xfs_dir_update du
= {
879 struct xfs_mount
*mp
= tdp
->i_mount
;
880 struct xfs_trans
*tp
;
881 int error
, nospace_error
= 0;
884 trace_xfs_link(tdp
, target_name
);
886 ASSERT(!S_ISDIR(VFS_I(sip
)->i_mode
));
888 if (xfs_is_shutdown(mp
))
890 if (xfs_ifork_zapped(tdp
, XFS_DATA_FORK
))
893 error
= xfs_qm_dqattach(sip
);
897 error
= xfs_qm_dqattach(tdp
);
901 error
= xfs_parent_start(mp
, &du
.ppargs
);
905 resblks
= xfs_link_space_res(mp
, target_name
->len
);
906 error
= xfs_trans_alloc_dir(tdp
, &M_RES(mp
)->tr_link
, sip
, &resblks
,
907 &tp
, &nospace_error
);
912 * We don't allow reservationless or quotaless hardlinking when parent
913 * pointers are enabled because we can't back out if the xattrs must
916 if (du
.ppargs
&& nospace_error
) {
917 error
= nospace_error
;
922 * If we are using project inheritance, we only allow hard link
923 * creation in our tree when the project IDs are the same; else
924 * the tree quota mechanism could be circumvented.
926 if (unlikely((tdp
->i_diflags
& XFS_DIFLAG_PROJINHERIT
) &&
927 tdp
->i_projid
!= sip
->i_projid
)) {
929 * Project quota setup skips special files which can
930 * leave inodes in a PROJINHERIT directory without a
931 * project ID set. We need to allow links to be made
932 * to these "project-less" inodes because userspace
933 * expects them to succeed after project ID setup,
934 * but everything else should be rejected.
936 if (!special_file(VFS_I(sip
)->i_mode
) ||
937 sip
->i_projid
!= 0) {
943 error
= xfs_dir_add_child(tp
, resblks
, &du
);
948 * If this is a synchronous mount, make sure that the
949 * link transaction goes to disk before returning to
952 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
953 xfs_trans_set_sync(tp
);
955 error
= xfs_trans_commit(tp
);
956 xfs_iunlock(tdp
, XFS_ILOCK_EXCL
);
957 xfs_iunlock(sip
, XFS_ILOCK_EXCL
);
958 xfs_parent_finish(mp
, du
.ppargs
);
962 xfs_trans_cancel(tp
);
963 xfs_iunlock(tdp
, XFS_ILOCK_EXCL
);
964 xfs_iunlock(sip
, XFS_ILOCK_EXCL
);
966 xfs_parent_finish(mp
, du
.ppargs
);
968 if (error
== -ENOSPC
&& nospace_error
)
969 error
= nospace_error
;
973 /* Clear the reflink flag and the cowblocks tag if possible. */
975 xfs_itruncate_clear_reflink_flags(
976 struct xfs_inode
*ip
)
978 struct xfs_ifork
*dfork
;
979 struct xfs_ifork
*cfork
;
981 if (!xfs_is_reflink_inode(ip
))
983 dfork
= xfs_ifork_ptr(ip
, XFS_DATA_FORK
);
984 cfork
= xfs_ifork_ptr(ip
, XFS_COW_FORK
);
985 if (dfork
->if_bytes
== 0 && cfork
->if_bytes
== 0)
986 ip
->i_diflags2
&= ~XFS_DIFLAG2_REFLINK
;
987 if (cfork
->if_bytes
== 0)
988 xfs_inode_clear_cowblocks_tag(ip
);
992 * Free up the underlying blocks past new_size. The new size must be smaller
993 * than the current size. This routine can be used both for the attribute and
994 * data fork, and does not modify the inode size, which is left to the caller.
996 * The transaction passed to this routine must have made a permanent log
997 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
998 * given transaction and start new ones, so make sure everything involved in
999 * the transaction is tidy before calling here. Some transaction will be
1000 * returned to the caller to be committed. The incoming transaction must
1001 * already include the inode, and both inode locks must be held exclusively.
1002 * The inode must also be "held" within the transaction. On return the inode
1003 * will be "held" within the returned transaction. This routine does NOT
1004 * require any disk space to be reserved for it within the transaction.
1006 * If we get an error, we must return with the inode locked and linked into the
1007 * current transaction. This keeps things simple for the higher level code,
1008 * because it always knows that the inode is locked and held in the transaction
1009 * that returns to it whether errors occur or not. We don't mark the inode
1010 * dirty on error so that transactions can be easily aborted if possible.
1013 xfs_itruncate_extents_flags(
1014 struct xfs_trans
**tpp
,
1015 struct xfs_inode
*ip
,
1017 xfs_fsize_t new_size
,
1020 struct xfs_mount
*mp
= ip
->i_mount
;
1021 struct xfs_trans
*tp
= *tpp
;
1022 xfs_fileoff_t first_unmap_block
;
1025 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1026 if (atomic_read(&VFS_I(ip
)->i_count
))
1027 xfs_assert_ilocked(ip
, XFS_IOLOCK_EXCL
);
1028 ASSERT(new_size
<= XFS_ISIZE(ip
));
1029 ASSERT(tp
->t_flags
& XFS_TRANS_PERM_LOG_RES
);
1030 ASSERT(ip
->i_itemp
!= NULL
);
1031 ASSERT(ip
->i_itemp
->ili_lock_flags
== 0);
1032 ASSERT(!XFS_NOT_DQATTACHED(mp
, ip
));
1034 trace_xfs_itruncate_extents_start(ip
, new_size
);
1036 flags
|= xfs_bmapi_aflag(whichfork
);
1039 * Since it is possible for space to become allocated beyond
1040 * the end of the file (in a crash where the space is allocated
1041 * but the inode size is not yet updated), simply remove any
1042 * blocks which show up between the new EOF and the maximum
1043 * possible file size.
1045 * We have to free all the blocks to the bmbt maximum offset, even if
1046 * the page cache can't scale that far.
1048 first_unmap_block
= XFS_B_TO_FSB(mp
, (xfs_ufsize_t
)new_size
);
1049 if (!xfs_verify_fileoff(mp
, first_unmap_block
)) {
1050 WARN_ON_ONCE(first_unmap_block
> XFS_MAX_FILEOFF
);
1054 error
= xfs_bunmapi_range(&tp
, ip
, flags
, first_unmap_block
,
1059 if (whichfork
== XFS_DATA_FORK
) {
1060 /* Remove all pending CoW reservations. */
1061 error
= xfs_reflink_cancel_cow_blocks(ip
, &tp
,
1062 first_unmap_block
, XFS_MAX_FILEOFF
, true);
1066 xfs_itruncate_clear_reflink_flags(ip
);
1070 * Always re-log the inode so that our permanent transaction can keep
1071 * on rolling it forward in the log.
1073 xfs_trans_log_inode(tp
, ip
, XFS_ILOG_CORE
);
1075 trace_xfs_itruncate_extents_end(ip
, new_size
);
1083 * Mark all the buffers attached to this directory stale. In theory we should
1084 * never be freeing a directory with any blocks at all, but this covers the
1085 * case where we've recovered a directory swap with a "temporary" directory
1086 * created by online repair and now need to dump it.
1090 struct xfs_inode
*dp
)
1092 struct xfs_iext_cursor icur
;
1093 struct xfs_bmbt_irec got
;
1094 struct xfs_mount
*mp
= dp
->i_mount
;
1095 struct xfs_da_geometry
*geo
= mp
->m_dir_geo
;
1096 struct xfs_ifork
*ifp
= xfs_ifork_ptr(dp
, XFS_DATA_FORK
);
1100 * Invalidate each directory block. All directory blocks are of
1101 * fsbcount length and alignment, so we only need to walk those same
1102 * offsets. We hold the only reference to this inode, so we must wait
1103 * for the buffer locks.
1105 for_each_xfs_iext(ifp
, &icur
, &got
) {
1106 for (off
= round_up(got
.br_startoff
, geo
->fsbcount
);
1107 off
< got
.br_startoff
+ got
.br_blockcount
;
1108 off
+= geo
->fsbcount
) {
1109 struct xfs_buf
*bp
= NULL
;
1110 xfs_fsblock_t fsbno
;
1113 fsbno
= (off
- got
.br_startoff
) + got
.br_startblock
;
1114 error
= xfs_buf_incore(mp
->m_ddev_targp
,
1115 XFS_FSB_TO_DADDR(mp
, fsbno
),
1116 XFS_FSB_TO_BB(mp
, geo
->fsbcount
),
1128 * xfs_inactive_truncate
1130 * Called to perform a truncate when an inode becomes unlinked.
1133 xfs_inactive_truncate(
1134 struct xfs_inode
*ip
)
1136 struct xfs_mount
*mp
= ip
->i_mount
;
1137 struct xfs_trans
*tp
;
1140 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_itruncate
, 0, 0, 0, &tp
);
1142 ASSERT(xfs_is_shutdown(mp
));
1145 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1146 xfs_trans_ijoin(tp
, ip
, 0);
1149 * Log the inode size first to prevent stale data exposure in the event
1150 * of a system crash before the truncate completes. See the related
1151 * comment in xfs_vn_setattr_size() for details.
1153 ip
->i_disk_size
= 0;
1154 xfs_trans_log_inode(tp
, ip
, XFS_ILOG_CORE
);
1156 error
= xfs_itruncate_extents(&tp
, ip
, XFS_DATA_FORK
, 0);
1158 goto error_trans_cancel
;
1160 ASSERT(ip
->i_df
.if_nextents
== 0);
1162 error
= xfs_trans_commit(tp
);
1166 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1170 xfs_trans_cancel(tp
);
1172 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1177 * xfs_inactive_ifree()
1179 * Perform the inode free when an inode is unlinked.
1183 struct xfs_inode
*ip
)
1185 struct xfs_mount
*mp
= ip
->i_mount
;
1186 struct xfs_trans
*tp
;
1190 * We try to use a per-AG reservation for any block needed by the finobt
1191 * tree, but as the finobt feature predates the per-AG reservation
1192 * support a degraded file system might not have enough space for the
1193 * reservation at mount time. In that case try to dip into the reserved
1196 * Send a warning if the reservation does happen to fail, as the inode
1197 * now remains allocated and sits on the unlinked list until the fs is
1200 if (unlikely(mp
->m_finobt_nores
)) {
1201 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ifree
,
1202 XFS_IFREE_SPACE_RES(mp
), 0, XFS_TRANS_RESERVE
,
1205 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_ifree
, 0, 0, 0, &tp
);
1208 if (error
== -ENOSPC
) {
1209 xfs_warn_ratelimited(mp
,
1210 "Failed to remove inode(s) from unlinked list. "
1211 "Please free space, unmount and run xfs_repair.");
1213 ASSERT(xfs_is_shutdown(mp
));
1219 * We do not hold the inode locked across the entire rolling transaction
1220 * here. We only need to hold it for the first transaction that
1221 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1222 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1223 * here breaks the relationship between cluster buffer invalidation and
1224 * stale inode invalidation on cluster buffer item journal commit
1225 * completion, and can result in leaving dirty stale inodes hanging
1228 * We have no need for serialising this inode operation against other
1229 * operations - we freed the inode and hence reallocation is required
1230 * and that will serialise on reallocating the space the deferops need
1231 * to free. Hence we can unlock the inode on the first commit of
1232 * the transaction rather than roll it right through the deferops. This
1233 * avoids relogging the XFS_ISTALE inode.
1235 * We check that xfs_ifree() hasn't grown an internal transaction roll
1236 * by asserting that the inode is still locked when it returns.
1238 xfs_ilock(ip
, XFS_ILOCK_EXCL
);
1239 xfs_trans_ijoin(tp
, ip
, XFS_ILOCK_EXCL
);
1241 error
= xfs_ifree(tp
, ip
);
1242 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1245 * If we fail to free the inode, shut down. The cancel
1246 * might do that, we need to make sure. Otherwise the
1247 * inode might be lost for a long time or forever.
1249 if (!xfs_is_shutdown(mp
)) {
1250 xfs_notice(mp
, "%s: xfs_ifree returned error %d",
1252 xfs_force_shutdown(mp
, SHUTDOWN_META_IO_ERROR
);
1254 xfs_trans_cancel(tp
);
1259 * Credit the quota account(s). The inode is gone.
1261 xfs_trans_mod_dquot_byino(tp
, ip
, XFS_TRANS_DQ_ICOUNT
, -1);
1263 return xfs_trans_commit(tp
);
1267 * Returns true if we need to update the on-disk metadata before we can free
1268 * the memory used by this inode. Updates include freeing post-eof
1269 * preallocations; freeing COW staging extents; and marking the inode free in
1270 * the inobt if it is on the unlinked list.
1273 xfs_inode_needs_inactive(
1274 struct xfs_inode
*ip
)
1276 struct xfs_mount
*mp
= ip
->i_mount
;
1277 struct xfs_ifork
*cow_ifp
= xfs_ifork_ptr(ip
, XFS_COW_FORK
);
1280 * If the inode is already free, then there can be nothing
1283 if (VFS_I(ip
)->i_mode
== 0)
1287 * If this is a read-only mount, don't do this (would generate I/O)
1288 * unless we're in log recovery and cleaning the iunlinked list.
1290 if (xfs_is_readonly(mp
) && !xlog_recovery_needed(mp
->m_log
))
1293 /* If the log isn't running, push inodes straight to reclaim. */
1294 if (xfs_is_shutdown(mp
) || xfs_has_norecovery(mp
))
1297 /* Metadata inodes require explicit resource cleanup. */
1298 if (xfs_is_metadata_inode(ip
))
1301 /* Want to clean out the cow blocks if there are any. */
1302 if (cow_ifp
&& cow_ifp
->if_bytes
> 0)
1305 /* Unlinked files must be freed. */
1306 if (VFS_I(ip
)->i_nlink
== 0)
1310 * This file isn't being freed, so check if there are post-eof blocks
1313 * Note: don't bother with iolock here since lockdep complains about
1314 * acquiring it in reclaim context. We have the only reference to the
1315 * inode at this point anyways.
1317 return xfs_can_free_eofblocks(ip
);
1321 * Save health status somewhere, if we're dumping an inode with uncorrected
1322 * errors and online repair isn't running.
1325 xfs_inactive_health(
1326 struct xfs_inode
*ip
)
1328 struct xfs_mount
*mp
= ip
->i_mount
;
1329 struct xfs_perag
*pag
;
1331 unsigned int checked
;
1333 xfs_inode_measure_sickness(ip
, &sick
, &checked
);
1337 trace_xfs_inode_unfixed_corruption(ip
, sick
);
1339 if (sick
& XFS_SICK_INO_FORGET
)
1342 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1344 /* There had better still be a perag structure! */
1349 xfs_ag_mark_sick(pag
, XFS_SICK_AG_INODES
);
1356 * This is called when the vnode reference count for the vnode
1357 * goes to zero. If the file has been unlinked, then it must
1358 * now be truncated. Also, we clear all of the read-ahead state
1359 * kept for the inode here since the file is now closed.
1365 struct xfs_mount
*mp
;
1370 * If the inode is already free, then there can be nothing
1373 if (VFS_I(ip
)->i_mode
== 0) {
1374 ASSERT(ip
->i_df
.if_broot_bytes
== 0);
1379 ASSERT(!xfs_iflags_test(ip
, XFS_IRECOVERY
));
1381 xfs_inactive_health(ip
);
1384 * If this is a read-only mount, don't do this (would generate I/O)
1385 * unless we're in log recovery and cleaning the iunlinked list.
1387 if (xfs_is_readonly(mp
) && !xlog_recovery_needed(mp
->m_log
))
1390 /* Metadata inodes require explicit resource cleanup. */
1391 if (xfs_is_metadata_inode(ip
))
1394 /* Try to clean out the cow blocks if there are any. */
1395 if (xfs_inode_has_cow_data(ip
))
1396 xfs_reflink_cancel_cow_range(ip
, 0, NULLFILEOFF
, true);
1398 if (VFS_I(ip
)->i_nlink
!= 0) {
1400 * Note: don't bother with iolock here since lockdep complains
1401 * about acquiring it in reclaim context. We have the only
1402 * reference to the inode at this point anyways.
1404 if (xfs_can_free_eofblocks(ip
))
1405 error
= xfs_free_eofblocks(ip
);
1410 if (S_ISREG(VFS_I(ip
)->i_mode
) &&
1411 (ip
->i_disk_size
!= 0 || XFS_ISIZE(ip
) != 0 ||
1412 ip
->i_df
.if_nextents
> 0 || ip
->i_delayed_blks
> 0))
1415 if (xfs_iflags_test(ip
, XFS_IQUOTAUNCHECKED
)) {
1417 * If this inode is being inactivated during a quotacheck and
1418 * has not yet been scanned by quotacheck, we /must/ remove
1419 * the dquots from the inode before inactivation changes the
1420 * block and inode counts. Most probably this is a result of
1421 * reloading the incore iunlinked list to purge unrecovered
1424 xfs_qm_dqdetach(ip
);
1426 error
= xfs_qm_dqattach(ip
);
1431 if (S_ISDIR(VFS_I(ip
)->i_mode
) && ip
->i_df
.if_nextents
> 0) {
1432 xfs_inactive_dir(ip
);
1436 if (S_ISLNK(VFS_I(ip
)->i_mode
))
1437 error
= xfs_inactive_symlink(ip
);
1439 error
= xfs_inactive_truncate(ip
);
1444 * If there are attributes associated with the file then blow them away
1445 * now. The code calls a routine that recursively deconstructs the
1446 * attribute fork. It also blows away the in-core attribute fork.
1448 if (xfs_inode_has_attr_fork(ip
)) {
1449 error
= xfs_attr_inactive(ip
);
1454 ASSERT(ip
->i_forkoff
== 0);
1459 error
= xfs_inactive_ifree(ip
);
1463 * We're done making metadata updates for this inode, so we can release
1464 * the attached dquots.
1466 xfs_qm_dqdetach(ip
);
1471 * Find an inode on the unlinked list. This does not take references to the
1472 * inode as we have existence guarantees by holding the AGI buffer lock and that
1473 * only unlinked, referenced inodes can be on the unlinked inode list. If we
1474 * don't find the inode in cache, then let the caller handle the situation.
1478 struct xfs_perag
*pag
,
1481 struct xfs_inode
*ip
;
1484 ip
= radix_tree_lookup(&pag
->pag_ici_root
, agino
);
1486 /* Caller can handle inode not being in memory. */
1492 * Inode in RCU freeing limbo should not happen. Warn about this and
1493 * let the caller handle the failure.
1495 if (WARN_ON_ONCE(!ip
->i_ino
)) {
1499 ASSERT(!xfs_iflags_test(ip
, XFS_IRECLAIMABLE
| XFS_IRECLAIM
));
1505 * Load the inode @next_agino into the cache and set its prev_unlinked pointer
1506 * to @prev_agino. Caller must hold the AGI to synchronize with other changes
1507 * to the unlinked list.
1510 xfs_iunlink_reload_next(
1511 struct xfs_trans
*tp
,
1512 struct xfs_buf
*agibp
,
1513 xfs_agino_t prev_agino
,
1514 xfs_agino_t next_agino
)
1516 struct xfs_perag
*pag
= agibp
->b_pag
;
1517 struct xfs_mount
*mp
= pag
->pag_mount
;
1518 struct xfs_inode
*next_ip
= NULL
;
1522 ASSERT(next_agino
!= NULLAGINO
);
1526 next_ip
= radix_tree_lookup(&pag
->pag_ici_root
, next_agino
);
1527 ASSERT(next_ip
== NULL
);
1531 xfs_info_ratelimited(mp
,
1532 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
1533 next_agino
, pag
->pag_agno
);
1536 * Use an untrusted lookup just to be cautious in case the AGI has been
1537 * corrupted and now points at a free inode. That shouldn't happen,
1538 * but we'd rather shut down now since we're already running in a weird
1541 ino
= XFS_AGINO_TO_INO(mp
, pag
->pag_agno
, next_agino
);
1542 error
= xfs_iget(mp
, tp
, ino
, XFS_IGET_UNTRUSTED
, 0, &next_ip
);
1544 xfs_ag_mark_sick(pag
, XFS_SICK_AG_AGI
);
1548 /* If this is not an unlinked inode, something is very wrong. */
1549 if (VFS_I(next_ip
)->i_nlink
!= 0) {
1550 xfs_ag_mark_sick(pag
, XFS_SICK_AG_AGI
);
1551 error
= -EFSCORRUPTED
;
1555 next_ip
->i_prev_unlinked
= prev_agino
;
1556 trace_xfs_iunlink_reload_next(next_ip
);
1558 ASSERT(!(VFS_I(next_ip
)->i_state
& I_DONTCACHE
));
1559 if (xfs_is_quotacheck_running(mp
) && next_ip
)
1560 xfs_iflags_set(next_ip
, XFS_IQUOTAUNCHECKED
);
1566 * Look up the inode number specified and if it is not already marked XFS_ISTALE
1567 * mark it stale. We should only find clean inodes in this lookup that aren't
1571 xfs_ifree_mark_inode_stale(
1572 struct xfs_perag
*pag
,
1573 struct xfs_inode
*free_ip
,
1576 struct xfs_mount
*mp
= pag
->pag_mount
;
1577 struct xfs_inode_log_item
*iip
;
1578 struct xfs_inode
*ip
;
1582 ip
= radix_tree_lookup(&pag
->pag_ici_root
, XFS_INO_TO_AGINO(mp
, inum
));
1584 /* Inode not in memory, nothing to do */
1591 * Because this is an RCU protected lookup, we could find a recently
1592 * freed or even reallocated inode during the lookup. We need to check
1593 * under the i_flags_lock for a valid inode here. Skip it if it is not
1594 * valid, the wrong inode or stale.
1596 spin_lock(&ip
->i_flags_lock
);
1597 if (ip
->i_ino
!= inum
|| __xfs_iflags_test(ip
, XFS_ISTALE
))
1598 goto out_iflags_unlock
;
1601 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
1602 * other inodes that we did not find in the list attached to the buffer
1603 * and are not already marked stale. If we can't lock it, back off and
1606 if (ip
!= free_ip
) {
1607 if (!xfs_ilock_nowait(ip
, XFS_ILOCK_EXCL
)) {
1608 spin_unlock(&ip
->i_flags_lock
);
1614 ip
->i_flags
|= XFS_ISTALE
;
1617 * If the inode is flushing, it is already attached to the buffer. All
1618 * we needed to do here is mark the inode stale so buffer IO completion
1619 * will remove it from the AIL.
1622 if (__xfs_iflags_test(ip
, XFS_IFLUSHING
)) {
1623 ASSERT(!list_empty(&iip
->ili_item
.li_bio_list
));
1624 ASSERT(iip
->ili_last_fields
);
1629 * Inodes not attached to the buffer can be released immediately.
1630 * Everything else has to go through xfs_iflush_abort() on journal
1631 * commit as the flock synchronises removal of the inode from the
1632 * cluster buffer against inode reclaim.
1634 if (!iip
|| list_empty(&iip
->ili_item
.li_bio_list
))
1637 __xfs_iflags_set(ip
, XFS_IFLUSHING
);
1638 spin_unlock(&ip
->i_flags_lock
);
1641 /* we have a dirty inode in memory that has not yet been flushed. */
1642 spin_lock(&iip
->ili_lock
);
1643 iip
->ili_last_fields
= iip
->ili_fields
;
1644 iip
->ili_fields
= 0;
1645 iip
->ili_fsync_fields
= 0;
1646 spin_unlock(&iip
->ili_lock
);
1647 ASSERT(iip
->ili_last_fields
);
1650 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1655 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1657 spin_unlock(&ip
->i_flags_lock
);
1662 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1663 * inodes that are in memory - they all must be marked stale and attached to
1664 * the cluster buffer.
1668 struct xfs_trans
*tp
,
1669 struct xfs_perag
*pag
,
1670 struct xfs_inode
*free_ip
,
1671 struct xfs_icluster
*xic
)
1673 struct xfs_mount
*mp
= free_ip
->i_mount
;
1674 struct xfs_ino_geometry
*igeo
= M_IGEO(mp
);
1677 xfs_ino_t inum
= xic
->first_ino
;
1683 nbufs
= igeo
->ialloc_blks
/ igeo
->blocks_per_cluster
;
1685 for (j
= 0; j
< nbufs
; j
++, inum
+= igeo
->inodes_per_cluster
) {
1687 * The allocation bitmap tells us which inodes of the chunk were
1688 * physically allocated. Skip the cluster if an inode falls into
1691 ioffset
= inum
- xic
->first_ino
;
1692 if ((xic
->alloc
& XFS_INOBT_MASK(ioffset
)) == 0) {
1693 ASSERT(ioffset
% igeo
->inodes_per_cluster
== 0);
1697 blkno
= XFS_AGB_TO_DADDR(mp
, XFS_INO_TO_AGNO(mp
, inum
),
1698 XFS_INO_TO_AGBNO(mp
, inum
));
1701 * We obtain and lock the backing buffer first in the process
1702 * here to ensure dirty inodes attached to the buffer remain in
1703 * the flushing state while we mark them stale.
1705 * If we scan the in-memory inodes first, then buffer IO can
1706 * complete before we get a lock on it, and hence we may fail
1707 * to mark all the active inodes on the buffer stale.
1709 error
= xfs_trans_get_buf(tp
, mp
->m_ddev_targp
, blkno
,
1710 mp
->m_bsize
* igeo
->blocks_per_cluster
,
1716 * This buffer may not have been correctly initialised as we
1717 * didn't read it from disk. That's not important because we are
1718 * only using to mark the buffer as stale in the log, and to
1719 * attach stale cached inodes on it.
1721 * For the inode that triggered the cluster freeing, this
1722 * attachment may occur in xfs_inode_item_precommit() after we
1723 * have marked this buffer stale. If this buffer was not in
1724 * memory before xfs_ifree_cluster() started, it will not be
1725 * marked XBF_DONE and this will cause problems later in
1726 * xfs_inode_item_precommit() when we trip over a (stale, !done)
1727 * buffer to attach to the transaction.
1729 * Hence we have to mark the buffer as XBF_DONE here. This is
1730 * safe because we are also marking the buffer as XBF_STALE and
1731 * XFS_BLI_STALE. That means it will never be dispatched for
1732 * IO and it won't be unlocked until the cluster freeing has
1733 * been committed to the journal and the buffer unpinned. If it
1734 * is written, we want to know about it, and we want it to
1735 * fail. We can achieve this by adding a write verifier to the
1738 bp
->b_flags
|= XBF_DONE
;
1739 bp
->b_ops
= &xfs_inode_buf_ops
;
1742 * Now we need to set all the cached clean inodes as XFS_ISTALE,
1743 * too. This requires lookups, and will skip inodes that we've
1744 * already marked XFS_ISTALE.
1746 for (i
= 0; i
< igeo
->inodes_per_cluster
; i
++)
1747 xfs_ifree_mark_inode_stale(pag
, free_ip
, inum
+ i
);
1749 xfs_trans_stale_inode_buf(tp
, bp
);
1750 xfs_trans_binval(tp
, bp
);
1756 * This is called to return an inode to the inode free list. The inode should
1757 * already be truncated to 0 length and have no pages associated with it. This
1758 * routine also assumes that the inode is already a part of the transaction.
1760 * The on-disk copy of the inode will have been added to the list of unlinked
1761 * inodes in the AGI. We need to remove the inode from that list atomically with
1762 * respect to freeing it here.
1766 struct xfs_trans
*tp
,
1767 struct xfs_inode
*ip
)
1769 struct xfs_mount
*mp
= ip
->i_mount
;
1770 struct xfs_perag
*pag
;
1771 struct xfs_icluster xic
= { 0 };
1772 struct xfs_inode_log_item
*iip
= ip
->i_itemp
;
1775 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
);
1776 ASSERT(VFS_I(ip
)->i_nlink
== 0);
1777 ASSERT(ip
->i_df
.if_nextents
== 0);
1778 ASSERT(ip
->i_disk_size
== 0 || !S_ISREG(VFS_I(ip
)->i_mode
));
1779 ASSERT(ip
->i_nblocks
== 0);
1781 pag
= xfs_perag_get(mp
, XFS_INO_TO_AGNO(mp
, ip
->i_ino
));
1783 error
= xfs_inode_uninit(tp
, pag
, ip
, &xic
);
1787 if (xfs_iflags_test(ip
, XFS_IPRESERVE_DM_FIELDS
))
1788 xfs_iflags_clear(ip
, XFS_IPRESERVE_DM_FIELDS
);
1790 /* Don't attempt to replay owner changes for a deleted inode */
1791 spin_lock(&iip
->ili_lock
);
1792 iip
->ili_fields
&= ~(XFS_ILOG_AOWNER
| XFS_ILOG_DOWNER
);
1793 spin_unlock(&iip
->ili_lock
);
1796 error
= xfs_ifree_cluster(tp
, pag
, ip
, &xic
);
1803 * This is called to unpin an inode. The caller must have the inode locked
1804 * in at least shared mode so that the buffer cannot be subsequently pinned
1805 * once someone is waiting for it to be unpinned.
1809 struct xfs_inode
*ip
)
1811 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
| XFS_ILOCK_SHARED
);
1813 trace_xfs_inode_unpin_nowait(ip
, _RET_IP_
);
1815 /* Give the log a push to start the unpinning I/O */
1816 xfs_log_force_seq(ip
->i_mount
, ip
->i_itemp
->ili_commit_seq
, 0, NULL
);
1822 struct xfs_inode
*ip
)
1824 wait_queue_head_t
*wq
= bit_waitqueue(&ip
->i_flags
, __XFS_IPINNED_BIT
);
1825 DEFINE_WAIT_BIT(wait
, &ip
->i_flags
, __XFS_IPINNED_BIT
);
1830 prepare_to_wait(wq
, &wait
.wq_entry
, TASK_UNINTERRUPTIBLE
);
1831 if (xfs_ipincount(ip
))
1833 } while (xfs_ipincount(ip
));
1834 finish_wait(wq
, &wait
.wq_entry
);
1839 struct xfs_inode
*ip
)
1841 if (xfs_ipincount(ip
))
1842 __xfs_iunpin_wait(ip
);
1846 * Removing an inode from the namespace involves removing the directory entry
1847 * and dropping the link count on the inode. Removing the directory entry can
1848 * result in locking an AGF (directory blocks were freed) and removing a link
1849 * count can result in placing the inode on an unlinked list which results in
1852 * The big problem here is that we have an ordering constraint on AGF and AGI
1853 * locking - inode allocation locks the AGI, then can allocate a new extent for
1854 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
1855 * removes the inode from the unlinked list, requiring that we lock the AGI
1856 * first, and then freeing the inode can result in an inode chunk being freed
1857 * and hence freeing disk space requiring that we lock an AGF.
1859 * Hence the ordering that is imposed by other parts of the code is AGI before
1860 * AGF. This means we cannot remove the directory entry before we drop the inode
1861 * reference count and put it on the unlinked list as this results in a lock
1862 * order of AGF then AGI, and this can deadlock against inode allocation and
1863 * freeing. Therefore we must drop the link counts before we remove the
1866 * This is still safe from a transactional point of view - it is not until we
1867 * get to xfs_defer_finish() that we have the possibility of multiple
1868 * transactions in this operation. Hence as long as we remove the directory
1869 * entry and drop the link count in the first transaction of the remove
1870 * operation, there are no transactional constraints on the ordering here.
1874 struct xfs_inode
*dp
,
1875 struct xfs_name
*name
,
1876 struct xfs_inode
*ip
)
1878 struct xfs_dir_update du
= {
1883 struct xfs_mount
*mp
= dp
->i_mount
;
1884 struct xfs_trans
*tp
= NULL
;
1885 int is_dir
= S_ISDIR(VFS_I(ip
)->i_mode
);
1890 trace_xfs_remove(dp
, name
);
1892 if (xfs_is_shutdown(mp
))
1894 if (xfs_ifork_zapped(dp
, XFS_DATA_FORK
))
1897 error
= xfs_qm_dqattach(dp
);
1901 error
= xfs_qm_dqattach(ip
);
1905 error
= xfs_parent_start(mp
, &du
.ppargs
);
1910 * We try to get the real space reservation first, allowing for
1911 * directory btree deletion(s) implying possible bmap insert(s). If we
1912 * can't get the space reservation then we use 0 instead, and avoid the
1913 * bmap btree insert(s) in the directory code by, if the bmap insert
1914 * tries to happen, instead trimming the LAST block from the directory.
1916 * Ignore EDQUOT and ENOSPC being returned via nospace_error because
1917 * the directory code can handle a reservationless update and we don't
1918 * want to prevent a user from trying to free space by deleting things.
1920 resblks
= xfs_remove_space_res(mp
, name
->len
);
1921 error
= xfs_trans_alloc_dir(dp
, &M_RES(mp
)->tr_remove
, ip
, &resblks
,
1924 ASSERT(error
!= -ENOSPC
);
1928 error
= xfs_dir_remove_child(tp
, resblks
, &du
);
1930 goto out_trans_cancel
;
1933 * If this is a synchronous mount, make sure that the
1934 * remove transaction goes to disk before returning to
1937 if (xfs_has_wsync(mp
) || xfs_has_dirsync(mp
))
1938 xfs_trans_set_sync(tp
);
1940 error
= xfs_trans_commit(tp
);
1944 if (is_dir
&& xfs_inode_is_filestream(ip
))
1945 xfs_filestream_deassociate(ip
);
1947 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1948 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
1949 xfs_parent_finish(mp
, du
.ppargs
);
1953 xfs_trans_cancel(tp
);
1955 xfs_iunlock(ip
, XFS_ILOCK_EXCL
);
1956 xfs_iunlock(dp
, XFS_ILOCK_EXCL
);
1958 xfs_parent_finish(mp
, du
.ppargs
);
1965 struct xfs_inode
**i_tab
,
1970 for (i
= num_inodes
- 1; i
>= 0; i
--) {
1971 /* Skip duplicate inodes if src and target dps are the same */
1972 if (!i_tab
[i
] || (i
> 0 && i_tab
[i
] == i_tab
[i
- 1]))
1974 xfs_iunlock(i_tab
[i
], XFS_ILOCK_EXCL
);
1979 * Enter all inodes for a rename transaction into a sorted array.
1981 #define __XFS_SORT_INODES 5
1983 xfs_sort_for_rename(
1984 struct xfs_inode
*dp1
, /* in: old (source) directory inode */
1985 struct xfs_inode
*dp2
, /* in: new (target) directory inode */
1986 struct xfs_inode
*ip1
, /* in: inode of old entry */
1987 struct xfs_inode
*ip2
, /* in: inode of new entry */
1988 struct xfs_inode
*wip
, /* in: whiteout inode */
1989 struct xfs_inode
**i_tab
,/* out: sorted array of inodes */
1990 int *num_inodes
) /* in/out: inodes in array */
1994 ASSERT(*num_inodes
== __XFS_SORT_INODES
);
1995 memset(i_tab
, 0, *num_inodes
* sizeof(struct xfs_inode
*));
1998 * i_tab contains a list of pointers to inodes. We initialize
1999 * the table here & we'll sort it. We will then use it to
2000 * order the acquisition of the inode locks.
2002 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2014 xfs_sort_inodes(i_tab
, *num_inodes
);
2019 struct xfs_inode
**i_tab
,
2020 unsigned int num_inodes
)
2024 ASSERT(num_inodes
<= __XFS_SORT_INODES
);
2027 * Sort the elements via bubble sort. (Remember, there are at
2028 * most 5 elements to sort, so this is adequate.)
2030 for (i
= 0; i
< num_inodes
; i
++) {
2031 for (j
= 1; j
< num_inodes
; j
++) {
2032 if (i_tab
[j
]->i_ino
< i_tab
[j
-1]->i_ino
)
2033 swap(i_tab
[j
], i_tab
[j
- 1]);
2039 * xfs_rename_alloc_whiteout()
2041 * Return a referenced, unlinked, unlocked inode that can be used as a
2042 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
2043 * crash between allocating the inode and linking it into the rename transaction
2044 * recovery will free the inode and we won't leak it.
2047 xfs_rename_alloc_whiteout(
2048 struct mnt_idmap
*idmap
,
2049 struct xfs_name
*src_name
,
2050 struct xfs_inode
*dp
,
2051 struct xfs_inode
**wip
)
2053 struct xfs_icreate_args args
= {
2056 .mode
= S_IFCHR
| WHITEOUT_MODE
,
2057 .flags
= XFS_ICREATE_TMPFILE
,
2059 struct xfs_inode
*tmpfile
;
2063 error
= xfs_create_tmpfile(&args
, &tmpfile
);
2067 name
.name
= src_name
->name
;
2068 name
.len
= src_name
->len
;
2069 error
= xfs_inode_init_security(VFS_I(tmpfile
), VFS_I(dp
), &name
);
2071 xfs_finish_inode_setup(tmpfile
);
2077 * Prepare the tmpfile inode as if it were created through the VFS.
2078 * Complete the inode setup and flag it as linkable. nlink is already
2079 * zero, so we can skip the drop_nlink.
2081 xfs_setup_iops(tmpfile
);
2082 xfs_finish_inode_setup(tmpfile
);
2083 VFS_I(tmpfile
)->i_state
|= I_LINKABLE
;
2094 struct mnt_idmap
*idmap
,
2095 struct xfs_inode
*src_dp
,
2096 struct xfs_name
*src_name
,
2097 struct xfs_inode
*src_ip
,
2098 struct xfs_inode
*target_dp
,
2099 struct xfs_name
*target_name
,
2100 struct xfs_inode
*target_ip
,
2103 struct xfs_dir_update du_src
= {
2108 struct xfs_dir_update du_tgt
= {
2110 .name
= target_name
,
2113 struct xfs_dir_update du_wip
= { };
2114 struct xfs_mount
*mp
= src_dp
->i_mount
;
2115 struct xfs_trans
*tp
;
2116 struct xfs_inode
*inodes
[__XFS_SORT_INODES
];
2118 int num_inodes
= __XFS_SORT_INODES
;
2119 bool new_parent
= (src_dp
!= target_dp
);
2120 bool src_is_directory
= S_ISDIR(VFS_I(src_ip
)->i_mode
);
2122 bool retried
= false;
2123 int error
, nospace_error
= 0;
2125 trace_xfs_rename(src_dp
, target_dp
, src_name
, target_name
);
2127 if ((flags
& RENAME_EXCHANGE
) && !target_ip
)
2131 * If we are doing a whiteout operation, allocate the whiteout inode
2132 * we will be placing at the target and ensure the type is set
2135 if (flags
& RENAME_WHITEOUT
) {
2136 error
= xfs_rename_alloc_whiteout(idmap
, src_name
, target_dp
,
2141 /* setup target dirent info as whiteout */
2142 src_name
->type
= XFS_DIR3_FT_CHRDEV
;
2145 xfs_sort_for_rename(src_dp
, target_dp
, src_ip
, target_ip
, du_wip
.ip
,
2146 inodes
, &num_inodes
);
2148 error
= xfs_parent_start(mp
, &du_src
.ppargs
);
2150 goto out_release_wip
;
2153 error
= xfs_parent_start(mp
, &du_wip
.ppargs
);
2155 goto out_src_ppargs
;
2159 error
= xfs_parent_start(mp
, &du_tgt
.ppargs
);
2161 goto out_wip_ppargs
;
2166 spaceres
= xfs_rename_space_res(mp
, src_name
->len
, target_ip
!= NULL
,
2167 target_name
->len
, du_wip
.ip
!= NULL
);
2168 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_rename
, spaceres
, 0, 0, &tp
);
2169 if (error
== -ENOSPC
) {
2170 nospace_error
= error
;
2172 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_rename
, 0, 0, 0,
2176 goto out_tgt_ppargs
;
2179 * We don't allow reservationless renaming when parent pointers are
2180 * enabled because we can't back out if the xattrs must grow.
2182 if (du_src
.ppargs
&& nospace_error
) {
2183 error
= nospace_error
;
2184 xfs_trans_cancel(tp
);
2185 goto out_tgt_ppargs
;
2189 * Attach the dquots to the inodes
2191 error
= xfs_qm_vop_rename_dqattach(inodes
);
2193 xfs_trans_cancel(tp
);
2194 goto out_tgt_ppargs
;
2198 * Lock all the participating inodes. Depending upon whether
2199 * the target_name exists in the target directory, and
2200 * whether the target directory is the same as the source
2201 * directory, we can lock from 2 to 5 inodes.
2203 xfs_lock_inodes(inodes
, num_inodes
, XFS_ILOCK_EXCL
);
2206 * Join all the inodes to the transaction.
2208 xfs_trans_ijoin(tp
, src_dp
, 0);
2210 xfs_trans_ijoin(tp
, target_dp
, 0);
2211 xfs_trans_ijoin(tp
, src_ip
, 0);
2213 xfs_trans_ijoin(tp
, target_ip
, 0);
2215 xfs_trans_ijoin(tp
, du_wip
.ip
, 0);
2218 * If we are using project inheritance, we only allow renames
2219 * into our tree when the project IDs are the same; else the
2220 * tree quota mechanism would be circumvented.
2222 if (unlikely((target_dp
->i_diflags
& XFS_DIFLAG_PROJINHERIT
) &&
2223 target_dp
->i_projid
!= src_ip
->i_projid
)) {
2225 goto out_trans_cancel
;
2228 /* RENAME_EXCHANGE is unique from here on. */
2229 if (flags
& RENAME_EXCHANGE
) {
2230 error
= xfs_dir_exchange_children(tp
, &du_src
, &du_tgt
,
2233 goto out_trans_cancel
;
2238 * Try to reserve quota to handle an expansion of the target directory.
2239 * We'll allow the rename to continue in reservationless mode if we hit
2240 * a space usage constraint. If we trigger reservationless mode, save
2241 * the errno if there isn't any free space in the target directory.
2243 if (spaceres
!= 0) {
2244 error
= xfs_trans_reserve_quota_nblks(tp
, target_dp
, spaceres
,
2246 if (error
== -EDQUOT
|| error
== -ENOSPC
) {
2248 xfs_trans_cancel(tp
);
2249 xfs_iunlock_rename(inodes
, num_inodes
);
2250 xfs_blockgc_free_quota(target_dp
, 0);
2255 nospace_error
= error
;
2260 goto out_trans_cancel
;
2264 * We don't allow quotaless renaming when parent pointers are enabled
2265 * because we can't back out if the xattrs must grow.
2267 if (du_src
.ppargs
&& nospace_error
) {
2268 error
= nospace_error
;
2269 goto out_trans_cancel
;
2273 * Lock the AGI buffers we need to handle bumping the nlink of the
2274 * whiteout inode off the unlinked list and to handle dropping the
2275 * nlink of the target inode. Per locking order rules, do this in
2276 * increasing AG order and before directory block allocation tries to
2277 * grab AGFs because we grab AGIs before AGFs.
2279 * The (vfs) caller must ensure that if src is a directory then
2280 * target_ip is either null or an empty directory.
2282 for (i
= 0; i
< num_inodes
&& inodes
[i
] != NULL
; i
++) {
2283 if (inodes
[i
] == du_wip
.ip
||
2284 (inodes
[i
] == target_ip
&&
2285 (VFS_I(target_ip
)->i_nlink
== 1 || src_is_directory
))) {
2286 struct xfs_perag
*pag
;
2289 pag
= xfs_perag_get(mp
,
2290 XFS_INO_TO_AGNO(mp
, inodes
[i
]->i_ino
));
2291 error
= xfs_read_agi(pag
, tp
, 0, &bp
);
2294 goto out_trans_cancel
;
2298 error
= xfs_dir_rename_children(tp
, &du_src
, &du_tgt
, spaceres
,
2301 goto out_trans_cancel
;
2305 * Now we have a real link, clear the "I'm a tmpfile" state
2306 * flag from the inode so it doesn't accidentally get misused in
2309 VFS_I(du_wip
.ip
)->i_state
&= ~I_LINKABLE
;
2314 * If this is a synchronous mount, make sure that the rename
2315 * transaction goes to disk before returning to the user.
2317 if (xfs_has_wsync(tp
->t_mountp
) || xfs_has_dirsync(tp
->t_mountp
))
2318 xfs_trans_set_sync(tp
);
2320 error
= xfs_trans_commit(tp
);
2325 xfs_trans_cancel(tp
);
2327 xfs_iunlock_rename(inodes
, num_inodes
);
2329 xfs_parent_finish(mp
, du_tgt
.ppargs
);
2331 xfs_parent_finish(mp
, du_wip
.ppargs
);
2333 xfs_parent_finish(mp
, du_src
.ppargs
);
2336 xfs_irele(du_wip
.ip
);
2337 if (error
== -ENOSPC
&& nospace_error
)
2338 error
= nospace_error
;
2344 struct xfs_inode
*ip
,
2347 struct xfs_inode_log_item
*iip
= ip
->i_itemp
;
2348 struct xfs_dinode
*dip
;
2349 struct xfs_mount
*mp
= ip
->i_mount
;
2352 xfs_assert_ilocked(ip
, XFS_ILOCK_EXCL
| XFS_ILOCK_SHARED
);
2353 ASSERT(xfs_iflags_test(ip
, XFS_IFLUSHING
));
2354 ASSERT(ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
||
2355 ip
->i_df
.if_nextents
> XFS_IFORK_MAXEXT(ip
, XFS_DATA_FORK
));
2356 ASSERT(iip
->ili_item
.li_buf
== bp
);
2358 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
2361 * We don't flush the inode if any of the following checks fail, but we
2362 * do still update the log item and attach to the backing buffer as if
2363 * the flush happened. This is a formality to facilitate predictable
2364 * error handling as the caller will shutdown and fail the buffer.
2366 error
= -EFSCORRUPTED
;
2367 if (XFS_TEST_ERROR(dip
->di_magic
!= cpu_to_be16(XFS_DINODE_MAGIC
),
2368 mp
, XFS_ERRTAG_IFLUSH_1
)) {
2369 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2370 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT
,
2371 __func__
, ip
->i_ino
, be16_to_cpu(dip
->di_magic
), dip
);
2374 if (S_ISREG(VFS_I(ip
)->i_mode
)) {
2376 ip
->i_df
.if_format
!= XFS_DINODE_FMT_EXTENTS
&&
2377 ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
,
2378 mp
, XFS_ERRTAG_IFLUSH_3
)) {
2379 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2380 "%s: Bad regular inode %llu, ptr "PTR_FMT
,
2381 __func__
, ip
->i_ino
, ip
);
2384 } else if (S_ISDIR(VFS_I(ip
)->i_mode
)) {
2386 ip
->i_df
.if_format
!= XFS_DINODE_FMT_EXTENTS
&&
2387 ip
->i_df
.if_format
!= XFS_DINODE_FMT_BTREE
&&
2388 ip
->i_df
.if_format
!= XFS_DINODE_FMT_LOCAL
,
2389 mp
, XFS_ERRTAG_IFLUSH_4
)) {
2390 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2391 "%s: Bad directory inode %llu, ptr "PTR_FMT
,
2392 __func__
, ip
->i_ino
, ip
);
2396 if (XFS_TEST_ERROR(ip
->i_df
.if_nextents
+ xfs_ifork_nextents(&ip
->i_af
) >
2397 ip
->i_nblocks
, mp
, XFS_ERRTAG_IFLUSH_5
)) {
2398 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2399 "%s: detected corrupt incore inode %llu, "
2400 "total extents = %llu nblocks = %lld, ptr "PTR_FMT
,
2401 __func__
, ip
->i_ino
,
2402 ip
->i_df
.if_nextents
+ xfs_ifork_nextents(&ip
->i_af
),
2406 if (XFS_TEST_ERROR(ip
->i_forkoff
> mp
->m_sb
.sb_inodesize
,
2407 mp
, XFS_ERRTAG_IFLUSH_6
)) {
2408 xfs_alert_tag(mp
, XFS_PTAG_IFLUSH
,
2409 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT
,
2410 __func__
, ip
->i_ino
, ip
->i_forkoff
, ip
);
2415 * Inode item log recovery for v2 inodes is dependent on the flushiter
2416 * count for correct sequencing. We bump the flush iteration count so
2417 * we can detect flushes which postdate a log record during recovery.
2418 * This is redundant as we now log every change and hence this can't
2419 * happen but we need to still do it to ensure backwards compatibility
2420 * with old kernels that predate logging all inode changes.
2422 if (!xfs_has_v3inodes(mp
))
2426 * If there are inline format data / attr forks attached to this inode,
2427 * make sure they are not corrupt.
2429 if (ip
->i_df
.if_format
== XFS_DINODE_FMT_LOCAL
&&
2430 xfs_ifork_verify_local_data(ip
))
2432 if (xfs_inode_has_attr_fork(ip
) &&
2433 ip
->i_af
.if_format
== XFS_DINODE_FMT_LOCAL
&&
2434 xfs_ifork_verify_local_attr(ip
))
2438 * Copy the dirty parts of the inode into the on-disk inode. We always
2439 * copy out the core of the inode, because if the inode is dirty at all
2442 xfs_inode_to_disk(ip
, dip
, iip
->ili_item
.li_lsn
);
2444 /* Wrap, we never let the log put out DI_MAX_FLUSH */
2445 if (!xfs_has_v3inodes(mp
)) {
2446 if (ip
->i_flushiter
== DI_MAX_FLUSH
)
2447 ip
->i_flushiter
= 0;
2450 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
2451 if (xfs_inode_has_attr_fork(ip
))
2452 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
2455 * We've recorded everything logged in the inode, so we'd like to clear
2456 * the ili_fields bits so we don't log and flush things unnecessarily.
2457 * However, we can't stop logging all this information until the data
2458 * we've copied into the disk buffer is written to disk. If we did we
2459 * might overwrite the copy of the inode in the log with all the data
2460 * after re-logging only part of it, and in the face of a crash we
2461 * wouldn't have all the data we need to recover.
2463 * What we do is move the bits to the ili_last_fields field. When
2464 * logging the inode, these bits are moved back to the ili_fields field.
2465 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
2466 * we know that the information those bits represent is permanently on
2467 * disk. As long as the flush completes before the inode is logged
2468 * again, then both ili_fields and ili_last_fields will be cleared.
2472 spin_lock(&iip
->ili_lock
);
2473 iip
->ili_last_fields
= iip
->ili_fields
;
2474 iip
->ili_fields
= 0;
2475 iip
->ili_fsync_fields
= 0;
2476 set_bit(XFS_LI_FLUSHING
, &iip
->ili_item
.li_flags
);
2477 spin_unlock(&iip
->ili_lock
);
2480 * Store the current LSN of the inode so that we can tell whether the
2481 * item has moved in the AIL from xfs_buf_inode_iodone().
2483 xfs_trans_ail_copy_lsn(mp
->m_ail
, &iip
->ili_flush_lsn
,
2484 &iip
->ili_item
.li_lsn
);
2486 /* generate the checksum. */
2487 xfs_dinode_calc_crc(mp
, dip
);
2489 xfs_inode_mark_sick(ip
, XFS_SICK_INO_CORE
);
2494 * Non-blocking flush of dirty inode metadata into the backing buffer.
2496 * The caller must have a reference to the inode and hold the cluster buffer
2497 * locked. The function will walk across all the inodes on the cluster buffer it
2498 * can find and lock without blocking, and flush them to the cluster buffer.
2500 * On successful flushing of at least one inode, the caller must write out the
2501 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
2502 * the caller needs to release the buffer. On failure, the filesystem will be
2503 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
2510 struct xfs_mount
*mp
= bp
->b_mount
;
2511 struct xfs_log_item
*lip
, *n
;
2512 struct xfs_inode
*ip
;
2513 struct xfs_inode_log_item
*iip
;
2518 * We must use the safe variant here as on shutdown xfs_iflush_abort()
2519 * will remove itself from the list.
2521 list_for_each_entry_safe(lip
, n
, &bp
->b_li_list
, li_bio_list
) {
2522 iip
= (struct xfs_inode_log_item
*)lip
;
2523 ip
= iip
->ili_inode
;
2526 * Quick and dirty check to avoid locks if possible.
2528 if (__xfs_iflags_test(ip
, XFS_IRECLAIM
| XFS_IFLUSHING
))
2530 if (xfs_ipincount(ip
))
2534 * The inode is still attached to the buffer, which means it is
2535 * dirty but reclaim might try to grab it. Check carefully for
2536 * that, and grab the ilock while still holding the i_flags_lock
2537 * to guarantee reclaim will not be able to reclaim this inode
2538 * once we drop the i_flags_lock.
2540 spin_lock(&ip
->i_flags_lock
);
2541 ASSERT(!__xfs_iflags_test(ip
, XFS_ISTALE
));
2542 if (__xfs_iflags_test(ip
, XFS_IRECLAIM
| XFS_IFLUSHING
)) {
2543 spin_unlock(&ip
->i_flags_lock
);
2548 * ILOCK will pin the inode against reclaim and prevent
2549 * concurrent transactions modifying the inode while we are
2550 * flushing the inode. If we get the lock, set the flushing
2551 * state before we drop the i_flags_lock.
2553 if (!xfs_ilock_nowait(ip
, XFS_ILOCK_SHARED
)) {
2554 spin_unlock(&ip
->i_flags_lock
);
2557 __xfs_iflags_set(ip
, XFS_IFLUSHING
);
2558 spin_unlock(&ip
->i_flags_lock
);
2561 * Abort flushing this inode if we are shut down because the
2562 * inode may not currently be in the AIL. This can occur when
2563 * log I/O failure unpins the inode without inserting into the
2564 * AIL, leaving a dirty/unpinned inode attached to the buffer
2565 * that otherwise looks like it should be flushed.
2567 if (xlog_is_shutdown(mp
->m_log
)) {
2568 xfs_iunpin_wait(ip
);
2569 xfs_iflush_abort(ip
);
2570 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2575 /* don't block waiting on a log force to unpin dirty inodes */
2576 if (xfs_ipincount(ip
)) {
2577 xfs_iflags_clear(ip
, XFS_IFLUSHING
);
2578 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2582 if (!xfs_inode_clean(ip
))
2583 error
= xfs_iflush(ip
, bp
);
2585 xfs_iflags_clear(ip
, XFS_IFLUSHING
);
2586 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2594 * Shutdown first so we kill the log before we release this
2595 * buffer. If it is an INODE_ALLOC buffer and pins the tail
2596 * of the log, failing it before the _log_ is shut down can
2597 * result in the log tail being moved forward in the journal
2598 * on disk because log writes can still be taking place. Hence
2599 * unpinning the tail will allow the ICREATE intent to be
2600 * removed from the log and recovery will fail with uninitialised
2601 * inode cluster buffers.
2603 xfs_force_shutdown(mp
, SHUTDOWN_CORRUPT_INCORE
);
2604 bp
->b_flags
|= XBF_ASYNC
;
2605 xfs_buf_ioend_fail(bp
);
2612 XFS_STATS_INC(mp
, xs_icluster_flushcnt
);
2613 XFS_STATS_ADD(mp
, xs_icluster_flushinode
, clcount
);
2618 /* Release an inode. */
2621 struct xfs_inode
*ip
)
2623 trace_xfs_irele(ip
, _RET_IP_
);
2628 * Ensure all committed transactions touching the inode are written to the log.
2631 xfs_log_force_inode(
2632 struct xfs_inode
*ip
)
2636 xfs_ilock(ip
, XFS_ILOCK_SHARED
);
2637 if (xfs_ipincount(ip
))
2638 seq
= ip
->i_itemp
->ili_commit_seq
;
2639 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2643 return xfs_log_force_seq(ip
->i_mount
, seq
, XFS_LOG_SYNC
, NULL
);
2647 * Grab the exclusive iolock for a data copy from src to dest, making sure to
2648 * abide by vfs locking order (lowest pointer value goes first) and breaking the
2649 * layout leases before proceeding. The loop is needed because we cannot call
2650 * the blocking break_layout() with the iolocks held, and therefore have to
2651 * back out both locks.
2654 xfs_iolock_two_inodes_and_break_layout(
2664 /* Wait to break both inodes' layouts before we start locking. */
2665 error
= break_layout(src
, true);
2669 error
= break_layout(dest
, true);
2674 /* Lock one inode and make sure nobody got in and leased it. */
2676 error
= break_layout(src
, false);
2679 if (error
== -EWOULDBLOCK
)
2687 /* Lock the other inode and make sure nobody got in and leased it. */
2688 inode_lock_nested(dest
, I_MUTEX_NONDIR2
);
2689 error
= break_layout(dest
, false);
2693 if (error
== -EWOULDBLOCK
)
2702 xfs_mmaplock_two_inodes_and_break_dax_layout(
2703 struct xfs_inode
*ip1
,
2704 struct xfs_inode
*ip2
)
2710 if (ip1
->i_ino
> ip2
->i_ino
)
2715 /* Lock the first inode */
2716 xfs_ilock(ip1
, XFS_MMAPLOCK_EXCL
);
2717 error
= xfs_break_dax_layouts(VFS_I(ip1
), &retry
);
2718 if (error
|| retry
) {
2719 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2720 if (error
== 0 && retry
)
2728 /* Nested lock the second inode */
2729 xfs_ilock(ip2
, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL
, 1));
2731 * We cannot use xfs_break_dax_layouts() directly here because it may
2732 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
2733 * for this nested lock case.
2735 page
= dax_layout_busy_page(VFS_I(ip2
)->i_mapping
);
2736 if (page
&& page_ref_count(page
) != 1) {
2737 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2738 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2746 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
2751 struct xfs_inode
*ip1
,
2752 struct xfs_inode
*ip2
)
2756 ret
= xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1
), VFS_I(ip2
));
2760 if (IS_DAX(VFS_I(ip1
)) && IS_DAX(VFS_I(ip2
))) {
2761 ret
= xfs_mmaplock_two_inodes_and_break_dax_layout(ip1
, ip2
);
2763 inode_unlock(VFS_I(ip2
));
2765 inode_unlock(VFS_I(ip1
));
2769 filemap_invalidate_lock_two(VFS_I(ip1
)->i_mapping
,
2770 VFS_I(ip2
)->i_mapping
);
2775 /* Unlock both inodes to allow IO and mmap activity. */
2777 xfs_iunlock2_io_mmap(
2778 struct xfs_inode
*ip1
,
2779 struct xfs_inode
*ip2
)
2781 if (IS_DAX(VFS_I(ip1
)) && IS_DAX(VFS_I(ip2
))) {
2782 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2784 xfs_iunlock(ip1
, XFS_MMAPLOCK_EXCL
);
2786 filemap_invalidate_unlock_two(VFS_I(ip1
)->i_mapping
,
2787 VFS_I(ip2
)->i_mapping
);
2789 inode_unlock(VFS_I(ip2
));
2791 inode_unlock(VFS_I(ip1
));
2794 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
2796 xfs_iunlock2_remapping(
2797 struct xfs_inode
*ip1
,
2798 struct xfs_inode
*ip2
)
2800 xfs_iflags_clear(ip1
, XFS_IREMAPPING
);
2803 xfs_iunlock(ip1
, XFS_MMAPLOCK_SHARED
);
2804 xfs_iunlock(ip2
, XFS_MMAPLOCK_EXCL
);
2807 inode_unlock_shared(VFS_I(ip1
));
2808 inode_unlock(VFS_I(ip2
));
2812 * Reload the incore inode list for this inode. Caller should ensure that
2813 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
2814 * preventing other threads from executing.
2817 xfs_inode_reload_unlinked_bucket(
2818 struct xfs_trans
*tp
,
2819 struct xfs_inode
*ip
)
2821 struct xfs_mount
*mp
= tp
->t_mountp
;
2822 struct xfs_buf
*agibp
;
2823 struct xfs_agi
*agi
;
2824 struct xfs_perag
*pag
;
2825 xfs_agnumber_t agno
= XFS_INO_TO_AGNO(mp
, ip
->i_ino
);
2826 xfs_agino_t agino
= XFS_INO_TO_AGINO(mp
, ip
->i_ino
);
2827 xfs_agino_t prev_agino
, next_agino
;
2828 unsigned int bucket
;
2829 bool foundit
= false;
2832 /* Grab the first inode in the list */
2833 pag
= xfs_perag_get(mp
, agno
);
2834 error
= xfs_ialloc_read_agi(pag
, tp
, 0, &agibp
);
2840 * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
2841 * incore unlinked list pointers for this inode. Check once more to
2842 * see if we raced with anyone else to reload the unlinked list.
2844 if (!xfs_inode_unlinked_incomplete(ip
)) {
2849 bucket
= agino
% XFS_AGI_UNLINKED_BUCKETS
;
2850 agi
= agibp
->b_addr
;
2852 trace_xfs_inode_reload_unlinked_bucket(ip
);
2854 xfs_info_ratelimited(mp
,
2855 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
2858 prev_agino
= NULLAGINO
;
2859 next_agino
= be32_to_cpu(agi
->agi_unlinked
[bucket
]);
2860 while (next_agino
!= NULLAGINO
) {
2861 struct xfs_inode
*next_ip
= NULL
;
2863 /* Found this caller's inode, set its backlink. */
2864 if (next_agino
== agino
) {
2866 next_ip
->i_prev_unlinked
= prev_agino
;
2871 /* Try in-memory lookup first. */
2872 next_ip
= xfs_iunlink_lookup(pag
, next_agino
);
2876 /* Inode not in memory, try reloading it. */
2877 error
= xfs_iunlink_reload_next(tp
, agibp
, prev_agino
,
2882 /* Grab the reloaded inode. */
2883 next_ip
= xfs_iunlink_lookup(pag
, next_agino
);
2885 /* No incore inode at all? We reloaded it... */
2886 ASSERT(next_ip
!= NULL
);
2887 error
= -EFSCORRUPTED
;
2892 prev_agino
= next_agino
;
2893 next_agino
= next_ip
->i_next_unlinked
;
2897 xfs_trans_brelse(tp
, agibp
);
2898 /* Should have found this inode somewhere in the iunlinked bucket. */
2899 if (!error
&& !foundit
)
2900 error
= -EFSCORRUPTED
;
2904 /* Decide if this inode is missing its unlinked list and reload it. */
2906 xfs_inode_reload_unlinked(
2907 struct xfs_inode
*ip
)
2909 struct xfs_trans
*tp
;
2912 error
= xfs_trans_alloc_empty(ip
->i_mount
, &tp
);
2916 xfs_ilock(ip
, XFS_ILOCK_SHARED
);
2917 if (xfs_inode_unlinked_incomplete(ip
))
2918 error
= xfs_inode_reload_unlinked_bucket(tp
, ip
);
2919 xfs_iunlock(ip
, XFS_ILOCK_SHARED
);
2920 xfs_trans_cancel(tp
);
2925 /* Has this inode fork been zapped by repair? */
2928 const struct xfs_inode
*ip
,
2931 unsigned int datamask
= 0;
2933 switch (whichfork
) {
2935 switch (ip
->i_vnode
.i_mode
& S_IFMT
) {
2937 datamask
= XFS_SICK_INO_DIR_ZAPPED
;
2940 datamask
= XFS_SICK_INO_SYMLINK_ZAPPED
;
2943 return ip
->i_sick
& (XFS_SICK_INO_BMBTD_ZAPPED
| datamask
);
2945 return ip
->i_sick
& XFS_SICK_INO_BMBTA_ZAPPED
;
2951 /* Compute the number of data and realtime blocks used by a file. */
2953 xfs_inode_count_blocks(
2954 struct xfs_trans
*tp
,
2955 struct xfs_inode
*ip
,
2956 xfs_filblks_t
*dblocks
,
2957 xfs_filblks_t
*rblocks
)
2959 struct xfs_ifork
*ifp
= xfs_ifork_ptr(ip
, XFS_DATA_FORK
);
2962 if (XFS_IS_REALTIME_INODE(ip
))
2963 xfs_bmap_count_leaves(ifp
, rblocks
);
2964 *dblocks
= ip
->i_nblocks
- *rblocks
;
2969 struct inode
*inode
)
2971 struct xfs_inode
*ip
= XFS_I(inode
);
2973 xfs_iunlock(ip
, XFS_MMAPLOCK_EXCL
);
2975 xfs_ilock(ip
, XFS_MMAPLOCK_EXCL
);
2979 xfs_break_dax_layouts(
2980 struct inode
*inode
,
2985 xfs_assert_ilocked(XFS_I(inode
), XFS_MMAPLOCK_EXCL
);
2987 page
= dax_layout_busy_page(inode
->i_mapping
);
2992 return ___wait_var_event(&page
->_refcount
,
2993 atomic_read(&page
->_refcount
) == 1, TASK_INTERRUPTIBLE
,
2994 0, 0, xfs_wait_dax_page(inode
));
2999 struct inode
*inode
,
3001 enum layout_break_reason reason
)
3006 xfs_assert_ilocked(XFS_I(inode
), XFS_IOLOCK_SHARED
| XFS_IOLOCK_EXCL
);
3012 error
= xfs_break_dax_layouts(inode
, &retry
);
3017 error
= xfs_break_leased_layouts(inode
, iolock
, &retry
);
3023 } while (error
== 0 && retry
);
3028 /* Returns the size of the fundamental allocation unit for a file, in bytes. */
3030 xfs_inode_alloc_unitsize(
3031 struct xfs_inode
*ip
)
3033 unsigned int blocks
= 1;
3035 if (XFS_IS_REALTIME_INODE(ip
))
3036 blocks
= ip
->i_mount
->m_sb
.sb_rextsize
;
3038 return XFS_FSB_TO_B(ip
->i_mount
, blocks
);
3041 /* Should we always be using copy on write for file writes? */
3043 xfs_is_always_cow_inode(
3044 struct xfs_inode
*ip
)
3046 return ip
->i_mount
->m_always_cow
&& xfs_has_reflink(ip
->i_mount
);