1 // SPDX-License-Identifier: GPL-2.0
7 #include "btree_update.h"
14 #include "fs-common.h"
17 #include "fs-io-buffered.h"
18 #include "fs-io-direct.h"
19 #include "fs-io-pagecache.h"
31 #include <linux/aio.h>
32 #include <linux/backing-dev.h>
33 #include <linux/exportfs.h>
34 #include <linux/fiemap.h>
35 #include <linux/fs_context.h>
36 #include <linux/module.h>
37 #include <linux/pagemap.h>
38 #include <linux/posix_acl.h>
39 #include <linux/random.h>
40 #include <linux/seq_file.h>
41 #include <linux/statfs.h>
42 #include <linux/string.h>
43 #include <linux/xattr.h>
45 static struct kmem_cache
*bch2_inode_cache
;
47 static void bch2_vfs_inode_init(struct btree_trans
*, subvol_inum
,
48 struct bch_inode_info
*,
49 struct bch_inode_unpacked
*,
50 struct bch_subvolume
*);
52 void bch2_inode_update_after_write(struct btree_trans
*trans
,
53 struct bch_inode_info
*inode
,
54 struct bch_inode_unpacked
*bi
,
57 struct bch_fs
*c
= trans
->c
;
59 BUG_ON(bi
->bi_inum
!= inode
->v
.i_ino
);
61 bch2_assert_pos_locked(trans
, BTREE_ID_inodes
, POS(0, bi
->bi_inum
));
63 set_nlink(&inode
->v
, bch2_inode_nlink_get(bi
));
64 i_uid_write(&inode
->v
, bi
->bi_uid
);
65 i_gid_write(&inode
->v
, bi
->bi_gid
);
66 inode
->v
.i_mode
= bi
->bi_mode
;
68 if (fields
& ATTR_ATIME
)
69 inode_set_atime_to_ts(&inode
->v
, bch2_time_to_timespec(c
, bi
->bi_atime
));
70 if (fields
& ATTR_MTIME
)
71 inode_set_mtime_to_ts(&inode
->v
, bch2_time_to_timespec(c
, bi
->bi_mtime
));
72 if (fields
& ATTR_CTIME
)
73 inode_set_ctime_to_ts(&inode
->v
, bch2_time_to_timespec(c
, bi
->bi_ctime
));
75 inode
->ei_inode
= *bi
;
77 bch2_inode_flags_to_vfs(inode
);
80 int __must_check
bch2_write_inode(struct bch_fs
*c
,
81 struct bch_inode_info
*inode
,
83 void *p
, unsigned fields
)
85 struct btree_trans
*trans
= bch2_trans_get(c
);
86 struct btree_iter iter
= { NULL
};
87 struct bch_inode_unpacked inode_u
;
90 bch2_trans_begin(trans
);
92 ret
= bch2_inode_peek(trans
, &iter
, &inode_u
, inode_inum(inode
),
94 (set
? set(trans
, inode
, &inode_u
, p
) : 0) ?:
95 bch2_inode_write(trans
, &iter
, &inode_u
) ?:
96 bch2_trans_commit(trans
, NULL
, NULL
, BCH_TRANS_COMMIT_no_enospc
);
99 * the btree node lock protects inode->ei_inode, not ei_update_lock;
100 * this is important for inode updates via bchfs_write_index_update
103 bch2_inode_update_after_write(trans
, inode
, &inode_u
, fields
);
105 bch2_trans_iter_exit(trans
, &iter
);
107 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
110 bch2_fs_fatal_err_on(bch2_err_matches(ret
, ENOENT
), c
,
111 "%s: inode %llu:%llu not found when updating",
113 inode_inum(inode
).subvol
,
114 inode_inum(inode
).inum
);
116 bch2_trans_put(trans
);
117 return ret
< 0 ? ret
: 0;
120 int bch2_fs_quota_transfer(struct bch_fs
*c
,
121 struct bch_inode_info
*inode
,
122 struct bch_qid new_qid
,
124 enum quota_acct_mode mode
)
129 qtypes
&= enabled_qtypes(c
);
131 for (i
= 0; i
< QTYP_NR
; i
++)
132 if (new_qid
.q
[i
] == inode
->ei_qid
.q
[i
])
133 qtypes
&= ~(1U << i
);
138 mutex_lock(&inode
->ei_quota_lock
);
140 ret
= bch2_quota_transfer(c
, qtypes
, new_qid
,
143 inode
->ei_quota_reserved
,
146 for (i
= 0; i
< QTYP_NR
; i
++)
147 if (qtypes
& (1 << i
))
148 inode
->ei_qid
.q
[i
] = new_qid
.q
[i
];
150 mutex_unlock(&inode
->ei_quota_lock
);
155 static bool subvol_inum_eq(subvol_inum a
, subvol_inum b
)
157 return a
.subvol
== b
.subvol
&& a
.inum
== b
.inum
;
160 static u32
bch2_vfs_inode_hash_fn(const void *data
, u32 len
, u32 seed
)
162 const subvol_inum
*inum
= data
;
164 return jhash(&inum
->inum
, sizeof(inum
->inum
), seed
);
167 static u32
bch2_vfs_inode_obj_hash_fn(const void *data
, u32 len
, u32 seed
)
169 const struct bch_inode_info
*inode
= data
;
171 return bch2_vfs_inode_hash_fn(&inode
->ei_inum
, sizeof(inode
->ei_inum
), seed
);
174 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg
*arg
,
177 const struct bch_inode_info
*inode
= obj
;
178 const subvol_inum
*v
= arg
->key
;
180 return !subvol_inum_eq(inode
->ei_inum
, *v
);
183 static const struct rhashtable_params bch2_vfs_inodes_params
= {
184 .head_offset
= offsetof(struct bch_inode_info
, hash
),
185 .key_offset
= offsetof(struct bch_inode_info
, ei_inum
),
186 .key_len
= sizeof(subvol_inum
),
187 .hashfn
= bch2_vfs_inode_hash_fn
,
188 .obj_hashfn
= bch2_vfs_inode_obj_hash_fn
,
189 .obj_cmpfn
= bch2_vfs_inode_cmp_fn
,
190 .automatic_shrinking
= true,
193 int bch2_inode_or_descendents_is_open(struct btree_trans
*trans
, struct bpos p
)
195 struct bch_fs
*c
= trans
->c
;
196 struct rhashtable
*ht
= &c
->vfs_inodes_table
;
197 subvol_inum inum
= (subvol_inum
) { .inum
= p
.offset
};
201 if (!test_bit(BCH_FS_started
, &c
->flags
))
204 darray_init(&subvols
);
208 * Tweaked version of __rhashtable_lookup(); we need to get a list of
209 * subvolumes in which the given inode number is open.
211 * For this to work, we don't include the subvolume ID in the key that
212 * we hash - all inodes with the same inode number regardless of
213 * subvolume will hash to the same slot.
215 * This will be less than ideal if the same file is ever open
216 * simultaneously in many different snapshots:
219 struct rhash_lock_head __rcu
*const *bkt
;
220 struct rhash_head
*he
;
222 struct bucket_table
*tbl
= rht_dereference_rcu(ht
->tbl
, ht
);
224 hash
= rht_key_hashfn(ht
, tbl
, &inum
, bch2_vfs_inodes_params
);
225 bkt
= rht_bucket(tbl
, hash
);
227 struct bch_inode_info
*inode
;
229 rht_for_each_entry_rcu_from(inode
, he
, rht_ptr_rcu(bkt
), tbl
, hash
, hash
) {
230 if (inode
->ei_inum
.inum
== inum
.inum
) {
231 ret
= darray_push_gfp(&subvols
, inode
->ei_inum
.subvol
,
232 GFP_NOWAIT
|__GFP_NOWARN
);
235 ret
= darray_make_room(&subvols
, 1);
239 goto restart_from_top
;
243 /* An object might have been moved to a different hash chain,
244 * while we walk along it - better check and retry.
246 } while (he
!= RHT_NULLS_MARKER(bkt
));
248 /* Ensure we see any new tables. */
251 tbl
= rht_dereference_rcu(tbl
->future_tbl
, ht
);
256 darray_for_each(subvols
, i
) {
258 ret
= bch2_subvolume_get_snapshot(trans
, *i
, &snap
);
262 ret
= bch2_snapshot_is_ancestor(c
, snap
, p
.snapshot
);
267 darray_exit(&subvols
);
271 static struct bch_inode_info
*__bch2_inode_hash_find(struct bch_fs
*c
, subvol_inum inum
)
273 return rhashtable_lookup_fast(&c
->vfs_inodes_table
, &inum
, bch2_vfs_inodes_params
);
276 static void __wait_on_freeing_inode(struct bch_fs
*c
,
277 struct bch_inode_info
*inode
,
280 wait_queue_head_t
*wq
;
281 struct wait_bit_queue_entry wait
;
283 wq
= inode_bit_waitqueue(&wait
, &inode
->v
, __I_NEW
);
284 prepare_to_wait(wq
, &wait
.wq_entry
, TASK_UNINTERRUPTIBLE
);
285 spin_unlock(&inode
->v
.i_lock
);
287 if (__bch2_inode_hash_find(c
, inum
) == inode
)
288 schedule_timeout(HZ
* 10);
289 finish_wait(wq
, &wait
.wq_entry
);
292 static struct bch_inode_info
*bch2_inode_hash_find(struct bch_fs
*c
, struct btree_trans
*trans
,
295 struct bch_inode_info
*inode
;
297 inode
= __bch2_inode_hash_find(c
, inum
);
299 spin_lock(&inode
->v
.i_lock
);
300 if (!test_bit(EI_INODE_HASHED
, &inode
->ei_flags
)) {
301 spin_unlock(&inode
->v
.i_lock
);
304 if ((inode
->v
.i_state
& (I_FREEING
|I_WILL_FREE
))) {
306 __wait_on_freeing_inode(c
, inode
, inum
);
308 bch2_trans_unlock(trans
);
309 __wait_on_freeing_inode(c
, inode
, inum
);
310 int ret
= bch2_trans_relock(trans
);
317 spin_unlock(&inode
->v
.i_lock
);
323 static void bch2_inode_hash_remove(struct bch_fs
*c
, struct bch_inode_info
*inode
)
325 spin_lock(&inode
->v
.i_lock
);
326 bool remove
= test_and_clear_bit(EI_INODE_HASHED
, &inode
->ei_flags
);
327 spin_unlock(&inode
->v
.i_lock
);
330 int ret
= rhashtable_remove_fast(&c
->vfs_inodes_table
,
331 &inode
->hash
, bch2_vfs_inodes_params
);
333 inode
->v
.i_hash
.pprev
= NULL
;
335 * This pairs with the bch2_inode_hash_find() ->
336 * __wait_on_freeing_inode() path
338 inode_wake_up_bit(&inode
->v
, __I_NEW
);
342 static struct bch_inode_info
*bch2_inode_hash_insert(struct bch_fs
*c
,
343 struct btree_trans
*trans
,
344 struct bch_inode_info
*inode
)
346 struct bch_inode_info
*old
= inode
;
348 set_bit(EI_INODE_HASHED
, &inode
->ei_flags
);
350 if (unlikely(rhashtable_lookup_insert_key(&c
->vfs_inodes_table
,
353 bch2_vfs_inodes_params
))) {
354 old
= bch2_inode_hash_find(c
, trans
, inode
->ei_inum
);
358 clear_bit(EI_INODE_HASHED
, &inode
->ei_flags
);
361 * bcachefs doesn't use I_NEW; we have no use for it since we
362 * only insert fully created inodes in the inode hash table. But
363 * discard_new_inode() expects it to be set...
365 inode
->v
.i_state
|= I_NEW
;
367 * We don't want bch2_evict_inode() to delete the inode on disk,
368 * we just raced and had another inode in cache. Normally new
369 * inodes don't have nlink == 0 - except tmpfiles do...
371 set_nlink(&inode
->v
, 1);
372 discard_new_inode(&inode
->v
);
375 inode_fake_hash(&inode
->v
);
377 inode_sb_list_add(&inode
->v
);
379 mutex_lock(&c
->vfs_inodes_lock
);
380 list_add(&inode
->ei_vfs_inode_list
, &c
->vfs_inodes_list
);
381 mutex_unlock(&c
->vfs_inodes_lock
);
/*
 * Evaluate @_do with the memalloc flags @_flags applied, restore the previous
 * flags, and yield the value of @_do.
 *
 * NOTE(review): the statement-expression wrapper and the trailing "_ret;"
 * were restored from context - confirm against upstream.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})
394 static struct inode
*bch2_alloc_inode(struct super_block
*sb
)
399 static struct bch_inode_info
*__bch2_new_inode(struct bch_fs
*c
, gfp_t gfp
)
401 struct bch_inode_info
*inode
= alloc_inode_sb(c
->vfs_sb
,
402 bch2_inode_cache
, gfp
);
406 inode_init_once(&inode
->v
);
407 mutex_init(&inode
->ei_update_lock
);
408 two_state_lock_init(&inode
->ei_pagecache_lock
);
409 INIT_LIST_HEAD(&inode
->ei_vfs_inode_list
);
411 mutex_init(&inode
->ei_quota_lock
);
412 memset(&inode
->ei_devs_need_flush
, 0, sizeof(inode
->ei_devs_need_flush
));
414 if (unlikely(inode_init_always_gfp(c
->vfs_sb
, &inode
->v
, gfp
))) {
415 kmem_cache_free(bch2_inode_cache
, inode
);
423 * Allocate a new inode, dropping/retaking btree locks if necessary:
425 static struct bch_inode_info
*bch2_new_inode(struct btree_trans
*trans
)
427 struct bch_inode_info
*inode
= __bch2_new_inode(trans
->c
, GFP_NOWAIT
);
429 if (unlikely(!inode
)) {
430 int ret
= drop_locks_do(trans
, (inode
= __bch2_new_inode(trans
->c
, GFP_NOFS
)) ? 0 : -ENOMEM
);
432 __destroy_inode(&inode
->v
);
433 kmem_cache_free(bch2_inode_cache
, inode
);
442 static struct bch_inode_info
*bch2_inode_hash_init_insert(struct btree_trans
*trans
,
444 struct bch_inode_unpacked
*bi
,
445 struct bch_subvolume
*subvol
)
447 struct bch_inode_info
*inode
= bch2_new_inode(trans
);
451 bch2_vfs_inode_init(trans
, inum
, inode
, bi
, subvol
);
453 return bch2_inode_hash_insert(trans
->c
, trans
, inode
);
457 struct inode
*bch2_vfs_inode_get(struct bch_fs
*c
, subvol_inum inum
)
459 struct bch_inode_info
*inode
= bch2_inode_hash_find(c
, NULL
, inum
);
463 struct btree_trans
*trans
= bch2_trans_get(c
);
465 struct bch_inode_unpacked inode_u
;
466 struct bch_subvolume subvol
;
467 int ret
= lockrestart_do(trans
,
468 bch2_subvolume_get(trans
, inum
.subvol
, true, 0, &subvol
) ?:
469 bch2_inode_find_by_inum_trans(trans
, inum
, &inode_u
)) ?:
470 PTR_ERR_OR_ZERO(inode
= bch2_inode_hash_init_insert(trans
, inum
, &inode_u
, &subvol
));
471 bch2_trans_put(trans
);
473 return ret
? ERR_PTR(ret
) : &inode
->v
;
476 struct bch_inode_info
*
477 __bch2_create(struct mnt_idmap
*idmap
,
478 struct bch_inode_info
*dir
, struct dentry
*dentry
,
479 umode_t mode
, dev_t rdev
, subvol_inum snapshot_src
,
482 struct bch_fs
*c
= dir
->v
.i_sb
->s_fs_info
;
483 struct btree_trans
*trans
;
484 struct bch_inode_unpacked dir_u
;
485 struct bch_inode_info
*inode
;
486 struct bch_inode_unpacked inode_u
;
487 struct posix_acl
*default_acl
= NULL
, *acl
= NULL
;
489 struct bch_subvolume subvol
;
496 * preallocate acls + vfs inode before btree transaction, so that
497 * nothing can fail after the transaction succeeds:
499 #ifdef CONFIG_BCACHEFS_POSIX_ACL
500 ret
= posix_acl_create(&dir
->v
, &mode
, &default_acl
, &acl
);
504 inode
= __bch2_new_inode(c
, GFP_NOFS
);
505 if (unlikely(!inode
)) {
506 inode
= ERR_PTR(-ENOMEM
);
510 bch2_inode_init_early(c
, &inode_u
);
512 if (!(flags
& BCH_CREATE_TMPFILE
))
513 mutex_lock(&dir
->ei_update_lock
);
515 trans
= bch2_trans_get(c
);
517 bch2_trans_begin(trans
);
519 kuid
= mapped_fsuid(idmap
, i_user_ns(&dir
->v
));
520 kgid
= mapped_fsgid(idmap
, i_user_ns(&dir
->v
));
521 ret
= bch2_subvol_is_ro_trans(trans
, dir
->ei_inum
.subvol
) ?:
522 bch2_create_trans(trans
,
523 inode_inum(dir
), &dir_u
, &inode_u
,
524 !(flags
& BCH_CREATE_TMPFILE
)
525 ? &dentry
->d_name
: NULL
,
526 from_kuid(i_user_ns(&dir
->v
), kuid
),
527 from_kgid(i_user_ns(&dir
->v
), kgid
),
529 default_acl
, acl
, snapshot_src
, flags
) ?:
530 bch2_quota_acct(c
, bch_qid(&inode_u
), Q_INO
, 1,
531 KEY_TYPE_QUOTA_PREALLOC
);
533 goto err_before_quota
;
535 inum
.subvol
= inode_u
.bi_subvol
?: dir
->ei_inum
.subvol
;
536 inum
.inum
= inode_u
.bi_inum
;
538 ret
= bch2_subvolume_get(trans
, inum
.subvol
, true,
539 BTREE_ITER_with_updates
, &subvol
) ?:
540 bch2_trans_commit(trans
, NULL
, &journal_seq
, 0);
542 bch2_quota_acct(c
, bch_qid(&inode_u
), Q_INO
, -1,
543 KEY_TYPE_QUOTA_WARN
);
545 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
550 if (!(flags
& BCH_CREATE_TMPFILE
)) {
551 bch2_inode_update_after_write(trans
, dir
, &dir_u
,
552 ATTR_MTIME
|ATTR_CTIME
);
553 mutex_unlock(&dir
->ei_update_lock
);
556 bch2_vfs_inode_init(trans
, inum
, inode
, &inode_u
, &subvol
);
558 set_cached_acl(&inode
->v
, ACL_TYPE_ACCESS
, acl
);
559 set_cached_acl(&inode
->v
, ACL_TYPE_DEFAULT
, default_acl
);
562 * we must insert the new inode into the inode cache before calling
563 * bch2_trans_exit() and dropping locks, else we could race with another
564 * thread pulling the inode in and modifying it:
566 * also, calling bch2_inode_hash_insert() without passing in the
567 * transaction object is sketchy - if we could ever end up in
568 * __wait_on_freeing_inode(), we'd risk deadlock.
570 * But that shouldn't be possible, since we still have the inode locked
571 * that we just created, and we _really_ can't take a transaction
574 inode
= bch2_inode_hash_insert(c
, NULL
, inode
);
575 bch2_trans_put(trans
);
577 posix_acl_release(default_acl
);
578 posix_acl_release(acl
);
581 if (!(flags
& BCH_CREATE_TMPFILE
))
582 mutex_unlock(&dir
->ei_update_lock
);
584 bch2_trans_put(trans
);
585 make_bad_inode(&inode
->v
);
587 inode
= ERR_PTR(ret
);
593 static struct bch_inode_info
*bch2_lookup_trans(struct btree_trans
*trans
,
594 subvol_inum dir
, struct bch_hash_info
*dir_hash_info
,
595 const struct qstr
*name
)
597 struct bch_fs
*c
= trans
->c
;
598 struct btree_iter dirent_iter
= {};
599 subvol_inum inum
= {};
600 struct printbuf buf
= PRINTBUF
;
602 struct bkey_s_c k
= bch2_hash_lookup(trans
, &dirent_iter
, bch2_dirent_hash_desc
,
603 dir_hash_info
, dir
, name
, 0);
604 int ret
= bkey_err(k
);
608 ret
= bch2_dirent_read_target(trans
, dir
, bkey_s_c_to_dirent(k
), &inum
);
614 struct bch_inode_info
*inode
= bch2_inode_hash_find(c
, trans
, inum
);
618 struct bch_subvolume subvol
;
619 struct bch_inode_unpacked inode_u
;
620 ret
= bch2_subvolume_get(trans
, inum
.subvol
, true, 0, &subvol
) ?:
621 bch2_inode_find_by_inum_nowarn_trans(trans
, inum
, &inode_u
) ?:
622 PTR_ERR_OR_ZERO(inode
= bch2_inode_hash_init_insert(trans
, inum
, &inode_u
, &subvol
));
624 bch2_fs_inconsistent_on(bch2_err_matches(ret
, ENOENT
),
625 c
, "dirent to missing inode:\n %s",
626 (bch2_bkey_val_to_text(&buf
, c
, k
), buf
.buf
));
630 /* regular files may have hardlinks: */
631 if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u
) &&
632 !bkey_eq(k
.k
->p
, POS(inode_u
.bi_dir
, inode_u
.bi_dir_offset
)),
634 "dirent points to inode that does not point back:\n %s",
635 (bch2_bkey_val_to_text(&buf
, c
, k
),
636 prt_printf(&buf
, "\n "),
637 bch2_inode_unpacked_to_text(&buf
, &inode_u
),
643 bch2_trans_iter_exit(trans
, &dirent_iter
);
647 inode
= ERR_PTR(ret
);
651 static struct dentry
*bch2_lookup(struct inode
*vdir
, struct dentry
*dentry
,
654 struct bch_fs
*c
= vdir
->i_sb
->s_fs_info
;
655 struct bch_inode_info
*dir
= to_bch_ei(vdir
);
656 struct bch_hash_info hash
= bch2_hash_info_init(c
, &dir
->ei_inode
);
658 struct bch_inode_info
*inode
;
660 PTR_ERR_OR_ZERO(inode
= bch2_lookup_trans(trans
, inode_inum(dir
),
661 &hash
, &dentry
->d_name
)));
665 return d_splice_alias(&inode
->v
, dentry
);
668 static int bch2_mknod(struct mnt_idmap
*idmap
,
669 struct inode
*vdir
, struct dentry
*dentry
,
670 umode_t mode
, dev_t rdev
)
672 struct bch_inode_info
*inode
=
673 __bch2_create(idmap
, to_bch_ei(vdir
), dentry
, mode
, rdev
,
674 (subvol_inum
) { 0 }, 0);
677 return bch2_err_class(PTR_ERR(inode
));
679 d_instantiate(dentry
, &inode
->v
);
683 static int bch2_create(struct mnt_idmap
*idmap
,
684 struct inode
*vdir
, struct dentry
*dentry
,
685 umode_t mode
, bool excl
)
687 return bch2_mknod(idmap
, vdir
, dentry
, mode
|S_IFREG
, 0);
690 static int __bch2_link(struct bch_fs
*c
,
691 struct bch_inode_info
*inode
,
692 struct bch_inode_info
*dir
,
693 struct dentry
*dentry
)
695 struct bch_inode_unpacked dir_u
, inode_u
;
698 mutex_lock(&inode
->ei_update_lock
);
699 struct btree_trans
*trans
= bch2_trans_get(c
);
701 ret
= commit_do(trans
, NULL
, NULL
, 0,
702 bch2_link_trans(trans
,
703 inode_inum(dir
), &dir_u
,
704 inode_inum(inode
), &inode_u
,
708 bch2_inode_update_after_write(trans
, dir
, &dir_u
,
709 ATTR_MTIME
|ATTR_CTIME
);
710 bch2_inode_update_after_write(trans
, inode
, &inode_u
, ATTR_CTIME
);
713 bch2_trans_put(trans
);
714 mutex_unlock(&inode
->ei_update_lock
);
718 static int bch2_link(struct dentry
*old_dentry
, struct inode
*vdir
,
719 struct dentry
*dentry
)
721 struct bch_fs
*c
= vdir
->i_sb
->s_fs_info
;
722 struct bch_inode_info
*dir
= to_bch_ei(vdir
);
723 struct bch_inode_info
*inode
= to_bch_ei(old_dentry
->d_inode
);
726 lockdep_assert_held(&inode
->v
.i_rwsem
);
728 ret
= bch2_subvol_is_ro(c
, dir
->ei_inum
.subvol
) ?:
729 bch2_subvol_is_ro(c
, inode
->ei_inum
.subvol
) ?:
730 __bch2_link(c
, inode
, dir
, dentry
);
732 return bch2_err_class(ret
);
735 d_instantiate(dentry
, &inode
->v
);
739 int __bch2_unlink(struct inode
*vdir
, struct dentry
*dentry
,
740 bool deleting_snapshot
)
742 struct bch_fs
*c
= vdir
->i_sb
->s_fs_info
;
743 struct bch_inode_info
*dir
= to_bch_ei(vdir
);
744 struct bch_inode_info
*inode
= to_bch_ei(dentry
->d_inode
);
745 struct bch_inode_unpacked dir_u
, inode_u
;
748 bch2_lock_inodes(INODE_UPDATE_LOCK
, dir
, inode
);
750 struct btree_trans
*trans
= bch2_trans_get(c
);
752 ret
= commit_do(trans
, NULL
, NULL
,
753 BCH_TRANS_COMMIT_no_enospc
,
754 bch2_unlink_trans(trans
,
755 inode_inum(dir
), &dir_u
,
756 &inode_u
, &dentry
->d_name
,
761 bch2_inode_update_after_write(trans
, dir
, &dir_u
,
762 ATTR_MTIME
|ATTR_CTIME
);
763 bch2_inode_update_after_write(trans
, inode
, &inode_u
,
766 if (inode_u
.bi_subvol
) {
768 * Subvolume deletion is asynchronous, but we still want to tell
769 * the VFS that it's been deleted here:
771 set_nlink(&inode
->v
, 0);
774 bch2_trans_put(trans
);
775 bch2_unlock_inodes(INODE_UPDATE_LOCK
, dir
, inode
);
780 static int bch2_unlink(struct inode
*vdir
, struct dentry
*dentry
)
782 struct bch_inode_info
*dir
= to_bch_ei(vdir
);
783 struct bch_fs
*c
= dir
->v
.i_sb
->s_fs_info
;
785 int ret
= bch2_subvol_is_ro(c
, dir
->ei_inum
.subvol
) ?:
786 __bch2_unlink(vdir
, dentry
, false);
787 return bch2_err_class(ret
);
790 static int bch2_symlink(struct mnt_idmap
*idmap
,
791 struct inode
*vdir
, struct dentry
*dentry
,
794 struct bch_fs
*c
= vdir
->i_sb
->s_fs_info
;
795 struct bch_inode_info
*dir
= to_bch_ei(vdir
), *inode
;
798 inode
= __bch2_create(idmap
, dir
, dentry
, S_IFLNK
|S_IRWXUGO
, 0,
799 (subvol_inum
) { 0 }, BCH_CREATE_TMPFILE
);
801 return bch2_err_class(PTR_ERR(inode
));
803 inode_lock(&inode
->v
);
804 ret
= page_symlink(&inode
->v
, symname
, strlen(symname
) + 1);
805 inode_unlock(&inode
->v
);
810 ret
= filemap_write_and_wait_range(inode
->v
.i_mapping
, 0, LLONG_MAX
);
814 ret
= __bch2_link(c
, inode
, dir
, dentry
);
818 d_instantiate(dentry
, &inode
->v
);
822 return bch2_err_class(ret
);
825 static int bch2_mkdir(struct mnt_idmap
*idmap
,
826 struct inode
*vdir
, struct dentry
*dentry
, umode_t mode
)
828 return bch2_mknod(idmap
, vdir
, dentry
, mode
|S_IFDIR
, 0);
831 static int bch2_rename2(struct mnt_idmap
*idmap
,
832 struct inode
*src_vdir
, struct dentry
*src_dentry
,
833 struct inode
*dst_vdir
, struct dentry
*dst_dentry
,
836 struct bch_fs
*c
= src_vdir
->i_sb
->s_fs_info
;
837 struct bch_inode_info
*src_dir
= to_bch_ei(src_vdir
);
838 struct bch_inode_info
*dst_dir
= to_bch_ei(dst_vdir
);
839 struct bch_inode_info
*src_inode
= to_bch_ei(src_dentry
->d_inode
);
840 struct bch_inode_info
*dst_inode
= to_bch_ei(dst_dentry
->d_inode
);
841 struct bch_inode_unpacked dst_dir_u
, src_dir_u
;
842 struct bch_inode_unpacked src_inode_u
, dst_inode_u
, *whiteout_inode_u
;
843 struct btree_trans
*trans
;
844 enum bch_rename_mode mode
= flags
& RENAME_EXCHANGE
845 ? BCH_RENAME_EXCHANGE
846 : dst_dentry
->d_inode
847 ? BCH_RENAME_OVERWRITE
: BCH_RENAME
;
848 bool whiteout
= !!(flags
& RENAME_WHITEOUT
);
851 if (flags
& ~(RENAME_NOREPLACE
|RENAME_EXCHANGE
|RENAME_WHITEOUT
))
854 if (mode
== BCH_RENAME_OVERWRITE
) {
855 ret
= filemap_write_and_wait_range(src_inode
->v
.i_mapping
,
861 bch2_lock_inodes(INODE_UPDATE_LOCK
,
867 trans
= bch2_trans_get(c
);
869 ret
= bch2_subvol_is_ro_trans(trans
, src_dir
->ei_inum
.subvol
) ?:
870 bch2_subvol_is_ro_trans(trans
, dst_dir
->ei_inum
.subvol
);
874 if (inode_attr_changing(dst_dir
, src_inode
, Inode_opt_project
)) {
875 ret
= bch2_fs_quota_transfer(c
, src_inode
,
878 KEY_TYPE_QUOTA_PREALLOC
);
883 if (mode
== BCH_RENAME_EXCHANGE
&&
884 inode_attr_changing(src_dir
, dst_inode
, Inode_opt_project
)) {
885 ret
= bch2_fs_quota_transfer(c
, dst_inode
,
888 KEY_TYPE_QUOTA_PREALLOC
);
893 bch2_trans_begin(trans
);
895 ret
= bch2_rename_trans(trans
,
896 inode_inum(src_dir
), &src_dir_u
,
897 inode_inum(dst_dir
), &dst_dir_u
,
907 whiteout_inode_u
= bch2_trans_kmalloc_nomemzero(trans
, sizeof(*whiteout_inode_u
));
908 ret
= PTR_ERR_OR_ZERO(whiteout_inode_u
);
911 bch2_inode_init_early(c
, whiteout_inode_u
);
913 ret
= bch2_create_trans(trans
,
914 inode_inum(src_dir
), &src_dir_u
,
917 from_kuid(i_user_ns(&src_dir
->v
), current_fsuid()),
918 from_kgid(i_user_ns(&src_dir
->v
), current_fsgid()),
919 S_IFCHR
|WHITEOUT_MODE
, 0,
920 NULL
, NULL
, (subvol_inum
) { 0 }, 0) ?:
921 bch2_quota_acct(c
, bch_qid(whiteout_inode_u
), Q_INO
, 1,
922 KEY_TYPE_QUOTA_PREALLOC
);
927 ret
= bch2_trans_commit(trans
, NULL
, NULL
, 0);
930 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
935 BUG_ON(src_inode
->v
.i_ino
!= src_inode_u
.bi_inum
);
937 dst_inode
->v
.i_ino
!= dst_inode_u
.bi_inum
);
939 bch2_inode_update_after_write(trans
, src_dir
, &src_dir_u
,
940 ATTR_MTIME
|ATTR_CTIME
);
942 if (src_dir
!= dst_dir
)
943 bch2_inode_update_after_write(trans
, dst_dir
, &dst_dir_u
,
944 ATTR_MTIME
|ATTR_CTIME
);
946 bch2_inode_update_after_write(trans
, src_inode
, &src_inode_u
,
950 bch2_inode_update_after_write(trans
, dst_inode
, &dst_inode_u
,
953 bch2_trans_put(trans
);
955 bch2_fs_quota_transfer(c
, src_inode
,
956 bch_qid(&src_inode
->ei_inode
),
958 KEY_TYPE_QUOTA_NOCHECK
);
960 bch2_fs_quota_transfer(c
, dst_inode
,
961 bch_qid(&dst_inode
->ei_inode
),
963 KEY_TYPE_QUOTA_NOCHECK
);
965 bch2_unlock_inodes(INODE_UPDATE_LOCK
,
971 return bch2_err_class(ret
);
974 static void bch2_setattr_copy(struct mnt_idmap
*idmap
,
975 struct bch_inode_info
*inode
,
976 struct bch_inode_unpacked
*bi
,
979 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
980 unsigned int ia_valid
= attr
->ia_valid
;
984 if (ia_valid
& ATTR_UID
) {
985 kuid
= from_vfsuid(idmap
, i_user_ns(&inode
->v
), attr
->ia_vfsuid
);
986 bi
->bi_uid
= from_kuid(i_user_ns(&inode
->v
), kuid
);
988 if (ia_valid
& ATTR_GID
) {
989 kgid
= from_vfsgid(idmap
, i_user_ns(&inode
->v
), attr
->ia_vfsgid
);
990 bi
->bi_gid
= from_kgid(i_user_ns(&inode
->v
), kgid
);
993 if (ia_valid
& ATTR_SIZE
)
994 bi
->bi_size
= attr
->ia_size
;
996 if (ia_valid
& ATTR_ATIME
)
997 bi
->bi_atime
= timespec_to_bch2_time(c
, attr
->ia_atime
);
998 if (ia_valid
& ATTR_MTIME
)
999 bi
->bi_mtime
= timespec_to_bch2_time(c
, attr
->ia_mtime
);
1000 if (ia_valid
& ATTR_CTIME
)
1001 bi
->bi_ctime
= timespec_to_bch2_time(c
, attr
->ia_ctime
);
1003 if (ia_valid
& ATTR_MODE
) {
1004 umode_t mode
= attr
->ia_mode
;
1005 kgid_t gid
= ia_valid
& ATTR_GID
1009 if (!in_group_or_capable(idmap
, &inode
->v
,
1010 make_vfsgid(idmap
, i_user_ns(&inode
->v
), gid
)))
1016 int bch2_setattr_nonsize(struct mnt_idmap
*idmap
,
1017 struct bch_inode_info
*inode
,
1020 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1022 struct btree_trans
*trans
;
1023 struct btree_iter inode_iter
= { NULL
};
1024 struct bch_inode_unpacked inode_u
;
1025 struct posix_acl
*acl
= NULL
;
1030 mutex_lock(&inode
->ei_update_lock
);
1032 qid
= inode
->ei_qid
;
1034 if (attr
->ia_valid
& ATTR_UID
) {
1035 kuid
= from_vfsuid(idmap
, i_user_ns(&inode
->v
), attr
->ia_vfsuid
);
1036 qid
.q
[QTYP_USR
] = from_kuid(i_user_ns(&inode
->v
), kuid
);
1039 if (attr
->ia_valid
& ATTR_GID
) {
1040 kgid
= from_vfsgid(idmap
, i_user_ns(&inode
->v
), attr
->ia_vfsgid
);
1041 qid
.q
[QTYP_GRP
] = from_kgid(i_user_ns(&inode
->v
), kgid
);
1044 ret
= bch2_fs_quota_transfer(c
, inode
, qid
, ~0,
1045 KEY_TYPE_QUOTA_PREALLOC
);
1049 trans
= bch2_trans_get(c
);
1051 bch2_trans_begin(trans
);
1055 ret
= bch2_inode_peek(trans
, &inode_iter
, &inode_u
, inode_inum(inode
),
1060 bch2_setattr_copy(idmap
, inode
, &inode_u
, attr
);
1062 if (attr
->ia_valid
& ATTR_MODE
) {
1063 ret
= bch2_acl_chmod(trans
, inode_inum(inode
), &inode_u
,
1064 inode_u
.bi_mode
, &acl
);
1069 ret
= bch2_inode_write(trans
, &inode_iter
, &inode_u
) ?:
1070 bch2_trans_commit(trans
, NULL
, NULL
,
1071 BCH_TRANS_COMMIT_no_enospc
);
1073 bch2_trans_iter_exit(trans
, &inode_iter
);
1075 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
1080 bch2_inode_update_after_write(trans
, inode
, &inode_u
, attr
->ia_valid
);
1083 set_cached_acl(&inode
->v
, ACL_TYPE_ACCESS
, acl
);
1085 bch2_trans_put(trans
);
1087 mutex_unlock(&inode
->ei_update_lock
);
1089 return bch2_err_class(ret
);
1092 static int bch2_getattr(struct mnt_idmap
*idmap
,
1093 const struct path
*path
, struct kstat
*stat
,
1094 u32 request_mask
, unsigned query_flags
)
1096 struct bch_inode_info
*inode
= to_bch_ei(d_inode(path
->dentry
));
1097 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1098 vfsuid_t vfsuid
= i_uid_into_vfsuid(idmap
, &inode
->v
);
1099 vfsgid_t vfsgid
= i_gid_into_vfsgid(idmap
, &inode
->v
);
1101 stat
->dev
= inode
->v
.i_sb
->s_dev
;
1102 stat
->ino
= inode
->v
.i_ino
;
1103 stat
->mode
= inode
->v
.i_mode
;
1104 stat
->nlink
= inode
->v
.i_nlink
;
1105 stat
->uid
= vfsuid_into_kuid(vfsuid
);
1106 stat
->gid
= vfsgid_into_kgid(vfsgid
);
1107 stat
->rdev
= inode
->v
.i_rdev
;
1108 stat
->size
= i_size_read(&inode
->v
);
1109 stat
->atime
= inode_get_atime(&inode
->v
);
1110 stat
->mtime
= inode_get_mtime(&inode
->v
);
1111 stat
->ctime
= inode_get_ctime(&inode
->v
);
1112 stat
->blksize
= block_bytes(c
);
1113 stat
->blocks
= inode
->v
.i_blocks
;
1115 stat
->subvol
= inode
->ei_inum
.subvol
;
1116 stat
->result_mask
|= STATX_SUBVOL
;
1118 if ((request_mask
& STATX_DIOALIGN
) && S_ISREG(inode
->v
.i_mode
)) {
1119 stat
->result_mask
|= STATX_DIOALIGN
;
1121 * this is incorrect; we should be tracking this in superblock,
1122 * and checking the alignment of open devices
1124 stat
->dio_mem_align
= SECTOR_SIZE
;
1125 stat
->dio_offset_align
= block_bytes(c
);
1128 if (request_mask
& STATX_BTIME
) {
1129 stat
->result_mask
|= STATX_BTIME
;
1130 stat
->btime
= bch2_time_to_timespec(c
, inode
->ei_inode
.bi_otime
);
1133 if (inode
->ei_inode
.bi_flags
& BCH_INODE_immutable
)
1134 stat
->attributes
|= STATX_ATTR_IMMUTABLE
;
1135 stat
->attributes_mask
|= STATX_ATTR_IMMUTABLE
;
1137 if (inode
->ei_inode
.bi_flags
& BCH_INODE_append
)
1138 stat
->attributes
|= STATX_ATTR_APPEND
;
1139 stat
->attributes_mask
|= STATX_ATTR_APPEND
;
1141 if (inode
->ei_inode
.bi_flags
& BCH_INODE_nodump
)
1142 stat
->attributes
|= STATX_ATTR_NODUMP
;
1143 stat
->attributes_mask
|= STATX_ATTR_NODUMP
;
1148 static int bch2_setattr(struct mnt_idmap
*idmap
,
1149 struct dentry
*dentry
, struct iattr
*iattr
)
1151 struct bch_inode_info
*inode
= to_bch_ei(dentry
->d_inode
);
1152 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1155 lockdep_assert_held(&inode
->v
.i_rwsem
);
1157 ret
= bch2_subvol_is_ro(c
, inode
->ei_inum
.subvol
) ?:
1158 setattr_prepare(idmap
, dentry
, iattr
);
1162 return iattr
->ia_valid
& ATTR_SIZE
1163 ? bchfs_truncate(idmap
, inode
, iattr
)
1164 : bch2_setattr_nonsize(idmap
, inode
, iattr
);
1167 static int bch2_tmpfile(struct mnt_idmap
*idmap
,
1168 struct inode
*vdir
, struct file
*file
, umode_t mode
)
1170 struct bch_inode_info
*inode
=
1171 __bch2_create(idmap
, to_bch_ei(vdir
),
1172 file
->f_path
.dentry
, mode
, 0,
1173 (subvol_inum
) { 0 }, BCH_CREATE_TMPFILE
);
1176 return bch2_err_class(PTR_ERR(inode
));
1178 d_mark_tmpfile(file
, &inode
->v
);
1179 d_instantiate(file
->f_path
.dentry
, &inode
->v
);
1180 return finish_open_simple(file
, 0);
1183 static int bch2_fill_extent(struct bch_fs
*c
,
1184 struct fiemap_extent_info
*info
,
1185 struct bkey_s_c k
, unsigned flags
)
1187 if (bkey_extent_is_direct_data(k
.k
)) {
1188 struct bkey_ptrs_c ptrs
= bch2_bkey_ptrs_c(k
);
1189 const union bch_extent_entry
*entry
;
1190 struct extent_ptr_decoded p
;
1193 if (k
.k
->type
== KEY_TYPE_reflink_v
)
1194 flags
|= FIEMAP_EXTENT_SHARED
;
1196 bkey_for_each_ptr_decode(k
.k
, ptrs
, p
, entry
) {
1198 u64 offset
= p
.ptr
.offset
;
1200 if (p
.ptr
.unwritten
)
1201 flags2
|= FIEMAP_EXTENT_UNWRITTEN
;
1203 if (p
.crc
.compression_type
)
1204 flags2
|= FIEMAP_EXTENT_ENCODED
;
1206 offset
+= p
.crc
.offset
;
1208 if ((offset
& (block_sectors(c
) - 1)) ||
1209 (k
.k
->size
& (block_sectors(c
) - 1)))
1210 flags2
|= FIEMAP_EXTENT_NOT_ALIGNED
;
1212 ret
= fiemap_fill_next_extent(info
,
1213 bkey_start_offset(k
.k
) << 9,
1215 k
.k
->size
<< 9, flags
|flags2
);
1221 } else if (bkey_extent_is_inline_data(k
.k
)) {
1222 return fiemap_fill_next_extent(info
,
1223 bkey_start_offset(k
.k
) << 9,
1226 FIEMAP_EXTENT_DATA_INLINE
);
1227 } else if (k
.k
->type
== KEY_TYPE_reservation
) {
1228 return fiemap_fill_next_extent(info
,
1229 bkey_start_offset(k
.k
) << 9,
1232 FIEMAP_EXTENT_DELALLOC
|
1233 FIEMAP_EXTENT_UNWRITTEN
);
1239 static int bch2_fiemap(struct inode
*vinode
, struct fiemap_extent_info
*info
,
1242 struct bch_fs
*c
= vinode
->i_sb
->s_fs_info
;
1243 struct bch_inode_info
*ei
= to_bch_ei(vinode
);
1244 struct btree_trans
*trans
;
1245 struct btree_iter iter
;
1247 struct bkey_buf cur
, prev
;
1248 unsigned offset_into_extent
, sectors
;
1249 bool have_extent
= false;
1252 ret
= fiemap_prep(&ei
->v
, info
, start
, &len
, FIEMAP_FLAG_SYNC
);
1256 struct bpos end
= POS(ei
->v
.i_ino
, (start
+ len
) >> 9);
1257 if (start
+ len
< start
)
1262 bch2_bkey_buf_init(&cur
);
1263 bch2_bkey_buf_init(&prev
);
1264 trans
= bch2_trans_get(c
);
1266 bch2_trans_iter_init(trans
, &iter
, BTREE_ID_extents
,
1267 POS(ei
->v
.i_ino
, start
), 0);
1269 while (!ret
|| bch2_err_matches(ret
, BCH_ERR_transaction_restart
)) {
1270 enum btree_id data_btree
= BTREE_ID_extents
;
1272 bch2_trans_begin(trans
);
1275 ret
= bch2_subvolume_get_snapshot(trans
, ei
->ei_inum
.subvol
, &snapshot
);
1279 bch2_btree_iter_set_snapshot(&iter
, snapshot
);
1281 k
= bch2_btree_iter_peek_upto(&iter
, end
);
1289 if (!bkey_extent_is_data(k
.k
) &&
1290 k
.k
->type
!= KEY_TYPE_reservation
) {
1291 bch2_btree_iter_advance(&iter
);
1295 offset_into_extent
= iter
.pos
.offset
-
1296 bkey_start_offset(k
.k
);
1297 sectors
= k
.k
->size
- offset_into_extent
;
1299 bch2_bkey_buf_reassemble(&cur
, c
, k
);
1301 ret
= bch2_read_indirect_extent(trans
, &data_btree
,
1302 &offset_into_extent
, &cur
);
1306 k
= bkey_i_to_s_c(cur
.k
);
1307 bch2_bkey_buf_realloc(&prev
, c
, k
.k
->u64s
);
1309 sectors
= min(sectors
, k
.k
->size
- offset_into_extent
);
1311 bch2_cut_front(POS(k
.k
->p
.inode
,
1312 bkey_start_offset(k
.k
) +
1313 offset_into_extent
),
1315 bch2_key_resize(&cur
.k
->k
, sectors
);
1316 cur
.k
->k
.p
= iter
.pos
;
1317 cur
.k
->k
.p
.offset
+= cur
.k
->k
.size
;
1320 bch2_trans_unlock(trans
);
1321 ret
= bch2_fill_extent(c
, info
,
1322 bkey_i_to_s_c(prev
.k
), 0);
1327 bkey_copy(prev
.k
, cur
.k
);
1330 bch2_btree_iter_set_pos(&iter
,
1331 POS(iter
.pos
.inode
, iter
.pos
.offset
+ sectors
));
1333 bch2_trans_iter_exit(trans
, &iter
);
1335 if (!ret
&& have_extent
) {
1336 bch2_trans_unlock(trans
);
1337 ret
= bch2_fill_extent(c
, info
, bkey_i_to_s_c(prev
.k
),
1338 FIEMAP_EXTENT_LAST
);
1341 bch2_trans_put(trans
);
1342 bch2_bkey_buf_exit(&cur
, c
);
1343 bch2_bkey_buf_exit(&prev
, c
);
1344 return ret
< 0 ? ret
: 0;
1347 static const struct vm_operations_struct bch_vm_ops
= {
1348 .fault
= bch2_page_fault
,
1349 .map_pages
= filemap_map_pages
,
1350 .page_mkwrite
= bch2_page_mkwrite
,
1353 static int bch2_mmap(struct file
*file
, struct vm_area_struct
*vma
)
1355 file_accessed(file
);
1357 vma
->vm_ops
= &bch_vm_ops
;
1363 static loff_t
bch2_dir_llseek(struct file
*file
, loff_t offset
, int whence
)
1365 return generic_file_llseek_size(file
, offset
, whence
,
1369 static int bch2_vfs_readdir(struct file
*file
, struct dir_context
*ctx
)
1371 struct bch_inode_info
*inode
= file_bch_inode(file
);
1372 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1374 if (!dir_emit_dots(file
, ctx
))
1377 int ret
= bch2_readdir(c
, inode_inum(inode
), ctx
);
1380 return bch2_err_class(ret
);
1383 static int bch2_open(struct inode
*vinode
, struct file
*file
)
1385 if (file
->f_flags
& (O_WRONLY
|O_RDWR
)) {
1386 struct bch_inode_info
*inode
= to_bch_ei(vinode
);
1387 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1389 int ret
= bch2_subvol_is_ro(c
, inode
->ei_inum
.subvol
);
1394 file
->f_mode
|= FMODE_CAN_ODIRECT
;
1396 return generic_file_open(vinode
, file
);
1399 static const struct file_operations bch_file_operations
= {
1401 .llseek
= bch2_llseek
,
1402 .read_iter
= bch2_read_iter
,
1403 .write_iter
= bch2_write_iter
,
1405 .get_unmapped_area
= thp_get_unmapped_area
,
1406 .fsync
= bch2_fsync
,
1407 .splice_read
= filemap_splice_read
,
1408 .splice_write
= iter_file_splice_write
,
1409 .fallocate
= bch2_fallocate_dispatch
,
1410 .unlocked_ioctl
= bch2_fs_file_ioctl
,
1411 #ifdef CONFIG_COMPAT
1412 .compat_ioctl
= bch2_compat_fs_ioctl
,
1414 .remap_file_range
= bch2_remap_file_range
,
1417 static const struct inode_operations bch_file_inode_operations
= {
1418 .getattr
= bch2_getattr
,
1419 .setattr
= bch2_setattr
,
1420 .fiemap
= bch2_fiemap
,
1421 .listxattr
= bch2_xattr_list
,
1422 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1423 .get_inode_acl
= bch2_get_acl
,
1424 .set_acl
= bch2_set_acl
,
1428 static const struct inode_operations bch_dir_inode_operations
= {
1429 .lookup
= bch2_lookup
,
1430 .create
= bch2_create
,
1432 .unlink
= bch2_unlink
,
1433 .symlink
= bch2_symlink
,
1434 .mkdir
= bch2_mkdir
,
1435 .rmdir
= bch2_unlink
,
1436 .mknod
= bch2_mknod
,
1437 .rename
= bch2_rename2
,
1438 .getattr
= bch2_getattr
,
1439 .setattr
= bch2_setattr
,
1440 .tmpfile
= bch2_tmpfile
,
1441 .listxattr
= bch2_xattr_list
,
1442 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1443 .get_inode_acl
= bch2_get_acl
,
1444 .set_acl
= bch2_set_acl
,
1448 static const struct file_operations bch_dir_file_operations
= {
1449 .llseek
= bch2_dir_llseek
,
1450 .read
= generic_read_dir
,
1451 .iterate_shared
= bch2_vfs_readdir
,
1452 .fsync
= bch2_fsync
,
1453 .unlocked_ioctl
= bch2_fs_file_ioctl
,
1454 #ifdef CONFIG_COMPAT
1455 .compat_ioctl
= bch2_compat_fs_ioctl
,
1459 static const struct inode_operations bch_symlink_inode_operations
= {
1460 .get_link
= page_get_link
,
1461 .getattr
= bch2_getattr
,
1462 .setattr
= bch2_setattr
,
1463 .listxattr
= bch2_xattr_list
,
1464 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1465 .get_inode_acl
= bch2_get_acl
,
1466 .set_acl
= bch2_set_acl
,
1470 static const struct inode_operations bch_special_inode_operations
= {
1471 .getattr
= bch2_getattr
,
1472 .setattr
= bch2_setattr
,
1473 .listxattr
= bch2_xattr_list
,
1474 #ifdef CONFIG_BCACHEFS_POSIX_ACL
1475 .get_inode_acl
= bch2_get_acl
,
1476 .set_acl
= bch2_set_acl
,
1480 static const struct address_space_operations bch_address_space_operations
= {
1481 .read_folio
= bch2_read_folio
,
1482 .writepages
= bch2_writepages
,
1483 .readahead
= bch2_readahead
,
1484 .dirty_folio
= filemap_dirty_folio
,
1485 .write_begin
= bch2_write_begin
,
1486 .write_end
= bch2_write_end
,
1487 .invalidate_folio
= bch2_invalidate_folio
,
1488 .release_folio
= bch2_release_folio
,
1489 #ifdef CONFIG_MIGRATION
1490 .migrate_folio
= filemap_migrate_folio
,
1492 .error_remove_folio
= generic_error_remove_folio
,
1495 struct bcachefs_fid
{
1501 struct bcachefs_fid_with_parent
{
1502 struct bcachefs_fid fid
;
1503 struct bcachefs_fid dir
;
1506 static int bcachefs_fid_valid(int fh_len
, int fh_type
)
1509 case FILEID_BCACHEFS_WITHOUT_PARENT
:
1510 return fh_len
== sizeof(struct bcachefs_fid
) / sizeof(u32
);
1511 case FILEID_BCACHEFS_WITH_PARENT
:
1512 return fh_len
== sizeof(struct bcachefs_fid_with_parent
) / sizeof(u32
);
1518 static struct bcachefs_fid
bch2_inode_to_fid(struct bch_inode_info
*inode
)
1520 return (struct bcachefs_fid
) {
1521 .inum
= inode
->ei_inum
.inum
,
1522 .subvol
= inode
->ei_inum
.subvol
,
1523 .gen
= inode
->ei_inode
.bi_generation
,
1527 static int bch2_encode_fh(struct inode
*vinode
, u32
*fh
, int *len
,
1530 struct bch_inode_info
*inode
= to_bch_ei(vinode
);
1531 struct bch_inode_info
*dir
= to_bch_ei(vdir
);
1534 if (!S_ISDIR(inode
->v
.i_mode
) && dir
) {
1535 struct bcachefs_fid_with_parent
*fid
= (void *) fh
;
1537 min_len
= sizeof(*fid
) / sizeof(u32
);
1538 if (*len
< min_len
) {
1540 return FILEID_INVALID
;
1543 fid
->fid
= bch2_inode_to_fid(inode
);
1544 fid
->dir
= bch2_inode_to_fid(dir
);
1547 return FILEID_BCACHEFS_WITH_PARENT
;
1549 struct bcachefs_fid
*fid
= (void *) fh
;
1551 min_len
= sizeof(*fid
) / sizeof(u32
);
1552 if (*len
< min_len
) {
1554 return FILEID_INVALID
;
1556 *fid
= bch2_inode_to_fid(inode
);
1559 return FILEID_BCACHEFS_WITHOUT_PARENT
;
1563 static struct inode
*bch2_nfs_get_inode(struct super_block
*sb
,
1564 struct bcachefs_fid fid
)
1566 struct bch_fs
*c
= sb
->s_fs_info
;
1567 struct inode
*vinode
= bch2_vfs_inode_get(c
, (subvol_inum
) {
1568 .subvol
= fid
.subvol
,
1571 if (!IS_ERR(vinode
) && vinode
->i_generation
!= fid
.gen
) {
1573 vinode
= ERR_PTR(-ESTALE
);
1578 static struct dentry
*bch2_fh_to_dentry(struct super_block
*sb
, struct fid
*_fid
,
1579 int fh_len
, int fh_type
)
1581 struct bcachefs_fid
*fid
= (void *) _fid
;
1583 if (!bcachefs_fid_valid(fh_len
, fh_type
))
1586 return d_obtain_alias(bch2_nfs_get_inode(sb
, *fid
));
1589 static struct dentry
*bch2_fh_to_parent(struct super_block
*sb
, struct fid
*_fid
,
1590 int fh_len
, int fh_type
)
1592 struct bcachefs_fid_with_parent
*fid
= (void *) _fid
;
1594 if (!bcachefs_fid_valid(fh_len
, fh_type
) ||
1595 fh_type
!= FILEID_BCACHEFS_WITH_PARENT
)
1598 return d_obtain_alias(bch2_nfs_get_inode(sb
, fid
->dir
));
1601 static struct dentry
*bch2_get_parent(struct dentry
*child
)
1603 struct bch_inode_info
*inode
= to_bch_ei(child
->d_inode
);
1604 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1605 subvol_inum parent_inum
= {
1606 .subvol
= inode
->ei_inode
.bi_parent_subvol
?:
1607 inode
->ei_inum
.subvol
,
1608 .inum
= inode
->ei_inode
.bi_dir
,
1611 return d_obtain_alias(bch2_vfs_inode_get(c
, parent_inum
));
1614 static int bch2_get_name(struct dentry
*parent
, char *name
, struct dentry
*child
)
1616 struct bch_inode_info
*inode
= to_bch_ei(child
->d_inode
);
1617 struct bch_inode_info
*dir
= to_bch_ei(parent
->d_inode
);
1618 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1619 struct btree_trans
*trans
;
1620 struct btree_iter iter1
;
1621 struct btree_iter iter2
;
1623 struct bkey_s_c_dirent d
;
1624 struct bch_inode_unpacked inode_u
;
1627 struct qstr dirent_name
;
1628 unsigned name_len
= 0;
1631 if (!S_ISDIR(dir
->v
.i_mode
))
1634 trans
= bch2_trans_get(c
);
1636 bch2_trans_iter_init(trans
, &iter1
, BTREE_ID_dirents
,
1637 POS(dir
->ei_inode
.bi_inum
, 0), 0);
1638 bch2_trans_iter_init(trans
, &iter2
, BTREE_ID_dirents
,
1639 POS(dir
->ei_inode
.bi_inum
, 0), 0);
1641 bch2_trans_begin(trans
);
1643 ret
= bch2_subvolume_get_snapshot(trans
, dir
->ei_inum
.subvol
, &snapshot
);
1647 bch2_btree_iter_set_snapshot(&iter1
, snapshot
);
1648 bch2_btree_iter_set_snapshot(&iter2
, snapshot
);
1650 ret
= bch2_inode_find_by_inum_trans(trans
, inode_inum(inode
), &inode_u
);
1654 if (inode_u
.bi_dir
== dir
->ei_inode
.bi_inum
) {
1655 bch2_btree_iter_set_pos(&iter1
, POS(inode_u
.bi_dir
, inode_u
.bi_dir_offset
));
1657 k
= bch2_btree_iter_peek_slot(&iter1
);
1662 if (k
.k
->type
!= KEY_TYPE_dirent
) {
1663 ret
= -BCH_ERR_ENOENT_dirent_doesnt_match_inode
;
1667 d
= bkey_s_c_to_dirent(k
);
1668 ret
= bch2_dirent_read_target(trans
, inode_inum(dir
), d
, &target
);
1670 ret
= -BCH_ERR_ENOENT_dirent_doesnt_match_inode
;
1674 if (subvol_inum_eq(target
, inode
->ei_inum
))
1678 * File with multiple hardlinks and our backref is to the wrong
1679 * directory - linear search:
1681 for_each_btree_key_continue_norestart(iter2
, 0, k
, ret
) {
1682 if (k
.k
->p
.inode
> dir
->ei_inode
.bi_inum
)
1685 if (k
.k
->type
!= KEY_TYPE_dirent
)
1688 d
= bkey_s_c_to_dirent(k
);
1689 ret
= bch2_dirent_read_target(trans
, inode_inum(dir
), d
, &target
);
1695 if (subvol_inum_eq(target
, inode
->ei_inum
))
1703 dirent_name
= bch2_dirent_get_name(d
);
1705 name_len
= min_t(unsigned, dirent_name
.len
, NAME_MAX
);
1706 memcpy(name
, dirent_name
.name
, name_len
);
1707 name
[name_len
] = '\0';
1709 if (bch2_err_matches(ret
, BCH_ERR_transaction_restart
))
1712 bch2_trans_iter_exit(trans
, &iter1
);
1713 bch2_trans_iter_exit(trans
, &iter2
);
1714 bch2_trans_put(trans
);
1719 static const struct export_operations bch_export_ops
= {
1720 .encode_fh
= bch2_encode_fh
,
1721 .fh_to_dentry
= bch2_fh_to_dentry
,
1722 .fh_to_parent
= bch2_fh_to_parent
,
1723 .get_parent
= bch2_get_parent
,
1724 .get_name
= bch2_get_name
,
1727 static void bch2_vfs_inode_init(struct btree_trans
*trans
,
1729 struct bch_inode_info
*inode
,
1730 struct bch_inode_unpacked
*bi
,
1731 struct bch_subvolume
*subvol
)
1733 inode
->v
.i_ino
= inum
.inum
;
1734 inode
->ei_inum
= inum
;
1735 inode
->ei_inode
.bi_inum
= inum
.inum
;
1736 bch2_inode_update_after_write(trans
, inode
, bi
, ~0);
1738 inode
->v
.i_blocks
= bi
->bi_sectors
;
1739 inode
->v
.i_ino
= bi
->bi_inum
;
1740 inode
->v
.i_rdev
= bi
->bi_dev
;
1741 inode
->v
.i_generation
= bi
->bi_generation
;
1742 inode
->v
.i_size
= bi
->bi_size
;
1744 inode
->ei_flags
= 0;
1745 inode
->ei_quota_reserved
= 0;
1746 inode
->ei_qid
= bch_qid(bi
);
1748 if (BCH_SUBVOLUME_SNAP(subvol
))
1749 set_bit(EI_INODE_SNAPSHOT
, &inode
->ei_flags
);
1751 inode
->v
.i_mapping
->a_ops
= &bch_address_space_operations
;
1753 switch (inode
->v
.i_mode
& S_IFMT
) {
1755 inode
->v
.i_op
= &bch_file_inode_operations
;
1756 inode
->v
.i_fop
= &bch_file_operations
;
1759 inode
->v
.i_op
= &bch_dir_inode_operations
;
1760 inode
->v
.i_fop
= &bch_dir_file_operations
;
1763 inode_nohighmem(&inode
->v
);
1764 inode
->v
.i_op
= &bch_symlink_inode_operations
;
1767 init_special_inode(&inode
->v
, inode
->v
.i_mode
, inode
->v
.i_rdev
);
1768 inode
->v
.i_op
= &bch_special_inode_operations
;
1772 mapping_set_large_folios(inode
->v
.i_mapping
);
1775 static void bch2_free_inode(struct inode
*vinode
)
1777 kmem_cache_free(bch2_inode_cache
, to_bch_ei(vinode
));
1780 static int inode_update_times_fn(struct btree_trans
*trans
,
1781 struct bch_inode_info
*inode
,
1782 struct bch_inode_unpacked
*bi
,
1785 struct bch_fs
*c
= inode
->v
.i_sb
->s_fs_info
;
1787 bi
->bi_atime
= timespec_to_bch2_time(c
, inode_get_atime(&inode
->v
));
1788 bi
->bi_mtime
= timespec_to_bch2_time(c
, inode_get_mtime(&inode
->v
));
1789 bi
->bi_ctime
= timespec_to_bch2_time(c
, inode_get_ctime(&inode
->v
));
1794 static int bch2_vfs_write_inode(struct inode
*vinode
,
1795 struct writeback_control
*wbc
)
1797 struct bch_fs
*c
= vinode
->i_sb
->s_fs_info
;
1798 struct bch_inode_info
*inode
= to_bch_ei(vinode
);
1801 mutex_lock(&inode
->ei_update_lock
);
1802 ret
= bch2_write_inode(c
, inode
, inode_update_times_fn
, NULL
,
1803 ATTR_ATIME
|ATTR_MTIME
|ATTR_CTIME
);
1804 mutex_unlock(&inode
->ei_update_lock
);
1806 return bch2_err_class(ret
);
1809 static void bch2_evict_inode(struct inode
*vinode
)
1811 struct bch_fs
*c
= vinode
->i_sb
->s_fs_info
;
1812 struct bch_inode_info
*inode
= to_bch_ei(vinode
);
1813 bool delete = !inode
->v
.i_nlink
&& !is_bad_inode(&inode
->v
);
1816 * evict() has waited for outstanding writeback, we'll do no more IO
1817 * through this inode: it's safe to remove from VFS inode hashtable here
1819 * Do that now so that other threads aren't blocked from pulling it back
1820 * in, there's no reason for them to be:
1823 bch2_inode_hash_remove(c
, inode
);
1825 truncate_inode_pages_final(&inode
->v
.i_data
);
1827 clear_inode(&inode
->v
);
1829 BUG_ON(!is_bad_inode(&inode
->v
) && inode
->ei_quota_reserved
);
1832 bch2_quota_acct(c
, inode
->ei_qid
, Q_SPC
, -((s64
) inode
->v
.i_blocks
),
1833 KEY_TYPE_QUOTA_WARN
);
1834 bch2_quota_acct(c
, inode
->ei_qid
, Q_INO
, -1,
1835 KEY_TYPE_QUOTA_WARN
);
1836 bch2_inode_rm(c
, inode_inum(inode
));
1839 * If we are deleting, we need it present in the vfs hash table
1840 * so that fsck can check if unlinked inodes are still open:
1842 bch2_inode_hash_remove(c
, inode
);
1845 mutex_lock(&c
->vfs_inodes_lock
);
1846 list_del_init(&inode
->ei_vfs_inode_list
);
1847 mutex_unlock(&c
->vfs_inodes_lock
);
1850 void bch2_evict_subvolume_inodes(struct bch_fs
*c
, snapshot_id_list
*s
)
1852 struct bch_inode_info
*inode
;
1853 DARRAY(struct bch_inode_info
*) grabbed
;
1854 bool clean_pass
= false, this_pass_clean
;
1857 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
1858 * be pruned with d_mark_dontcache().
1860 * Once we've had a clean pass where we didn't find any inodes without
1861 * I_DONTCACHE, we wait for them to be freed:
1864 darray_init(&grabbed
);
1865 darray_make_room(&grabbed
, 1024);
1868 this_pass_clean
= true;
1870 mutex_lock(&c
->vfs_inodes_lock
);
1871 list_for_each_entry(inode
, &c
->vfs_inodes_list
, ei_vfs_inode_list
) {
1872 if (!snapshot_list_has_id(s
, inode
->ei_inum
.subvol
))
1875 if (!(inode
->v
.i_state
& I_DONTCACHE
) &&
1876 !(inode
->v
.i_state
& I_FREEING
) &&
1878 this_pass_clean
= false;
1880 if (darray_push_gfp(&grabbed
, inode
, GFP_ATOMIC
|__GFP_NOWARN
)) {
1884 } else if (clean_pass
&& this_pass_clean
) {
1885 struct wait_bit_queue_entry wqe
;
1886 struct wait_queue_head
*wq_head
;
1888 wq_head
= inode_bit_waitqueue(&wqe
, &inode
->v
, __I_NEW
);
1889 prepare_to_wait_event(wq_head
, &wqe
.wq_entry
,
1890 TASK_UNINTERRUPTIBLE
);
1891 mutex_unlock(&c
->vfs_inodes_lock
);
1894 finish_wait(wq_head
, &wqe
.wq_entry
);
1898 mutex_unlock(&c
->vfs_inodes_lock
);
1900 darray_for_each(grabbed
, i
) {
1902 d_mark_dontcache(&inode
->v
);
1903 d_prune_aliases(&inode
->v
);
1908 if (!clean_pass
|| !this_pass_clean
) {
1909 clean_pass
= this_pass_clean
;
1913 darray_exit(&grabbed
);
1916 static int bch2_statfs(struct dentry
*dentry
, struct kstatfs
*buf
)
1918 struct super_block
*sb
= dentry
->d_sb
;
1919 struct bch_fs
*c
= sb
->s_fs_info
;
1920 struct bch_fs_usage_short usage
= bch2_fs_usage_read_short(c
);
1921 unsigned shift
= sb
->s_blocksize_bits
- 9;
1923 * this assumes inodes take up 64 bytes, which is a decent average
1926 u64 avail_inodes
= ((usage
.capacity
- usage
.used
) << 3);
1928 buf
->f_type
= BCACHEFS_STATFS_MAGIC
;
1929 buf
->f_bsize
= sb
->s_blocksize
;
1930 buf
->f_blocks
= usage
.capacity
>> shift
;
1931 buf
->f_bfree
= usage
.free
>> shift
;
1932 buf
->f_bavail
= avail_factor(usage
.free
) >> shift
;
1934 buf
->f_files
= usage
.nr_inodes
+ avail_inodes
;
1935 buf
->f_ffree
= avail_inodes
;
1937 buf
->f_fsid
= uuid_to_fsid(c
->sb
.user_uuid
.b
);
1938 buf
->f_namelen
= BCH_NAME_MAX
;
1943 static int bch2_sync_fs(struct super_block
*sb
, int wait
)
1945 struct bch_fs
*c
= sb
->s_fs_info
;
1948 trace_bch2_sync_fs(sb
, wait
);
1950 if (c
->opts
.journal_flush_disabled
)
1954 bch2_journal_flush_async(&c
->journal
, NULL
);
1958 ret
= bch2_journal_flush(&c
->journal
);
1959 return bch2_err_class(ret
);
1962 static struct bch_fs
*bch2_path_to_fs(const char *path
)
1968 ret
= lookup_bdev(path
, &dev
);
1970 return ERR_PTR(ret
);
1972 c
= bch2_dev_to_fs(dev
);
1974 closure_put(&c
->cl
);
1975 return c
?: ERR_PTR(-ENOENT
);
1978 static int bch2_remount(struct super_block
*sb
, int *flags
,
1979 struct bch_opts opts
)
1981 struct bch_fs
*c
= sb
->s_fs_info
;
1984 opt_set(opts
, read_only
, (*flags
& SB_RDONLY
) != 0);
1986 if (opts
.read_only
!= c
->opts
.read_only
) {
1987 down_write(&c
->state_lock
);
1989 if (opts
.read_only
) {
1990 bch2_fs_read_only(c
);
1992 sb
->s_flags
|= SB_RDONLY
;
1994 ret
= bch2_fs_read_write(c
);
1996 bch_err(c
, "error going rw: %i", ret
);
1997 up_write(&c
->state_lock
);
2002 sb
->s_flags
&= ~SB_RDONLY
;
2005 c
->opts
.read_only
= opts
.read_only
;
2007 up_write(&c
->state_lock
);
2010 if (opt_defined(opts
, errors
))
2011 c
->opts
.errors
= opts
.errors
;
2013 return bch2_err_class(ret
);
2016 static int bch2_show_devname(struct seq_file
*seq
, struct dentry
*root
)
2018 struct bch_fs
*c
= root
->d_sb
->s_fs_info
;
2021 for_each_online_member(c
, ca
) {
2025 seq_puts(seq
, ca
->disk_sb
.sb_name
);
2031 static int bch2_show_options(struct seq_file
*seq
, struct dentry
*root
)
2033 struct bch_fs
*c
= root
->d_sb
->s_fs_info
;
2034 struct printbuf buf
= PRINTBUF
;
2036 bch2_opts_to_text(&buf
, c
->opts
, c
, c
->disk_sb
.sb
,
2037 OPT_MOUNT
, OPT_HIDDEN
, OPT_SHOW_MOUNT_STYLE
);
2038 printbuf_nul_terminate(&buf
);
2039 seq_printf(seq
, ",%s", buf
.buf
);
2041 int ret
= buf
.allocation_failure
? -ENOMEM
: 0;
2042 printbuf_exit(&buf
);
2046 static void bch2_put_super(struct super_block
*sb
)
2048 struct bch_fs
*c
= sb
->s_fs_info
;
2054 * bcachefs doesn't currently integrate intwrite freeze protection but the
2055 * internal write references serve the same purpose. Therefore reuse the
2056 * read-only transition code to perform the quiesce. The caveat is that we don't
2057 * currently have the ability to block tasks that want a write reference while
2058 * the superblock is frozen. This is fine for now, but we should either add
2059 * blocking support or find a way to integrate sb_start_intwrite() and friends.
2061 static int bch2_freeze(struct super_block
*sb
)
2063 struct bch_fs
*c
= sb
->s_fs_info
;
2065 down_write(&c
->state_lock
);
2066 bch2_fs_read_only(c
);
2067 up_write(&c
->state_lock
);
2071 static int bch2_unfreeze(struct super_block
*sb
)
2073 struct bch_fs
*c
= sb
->s_fs_info
;
2076 if (test_bit(BCH_FS_emergency_ro
, &c
->flags
))
2079 down_write(&c
->state_lock
);
2080 ret
= bch2_fs_read_write(c
);
2081 up_write(&c
->state_lock
);
2085 static const struct super_operations bch_super_operations
= {
2086 .alloc_inode
= bch2_alloc_inode
,
2087 .free_inode
= bch2_free_inode
,
2088 .write_inode
= bch2_vfs_write_inode
,
2089 .evict_inode
= bch2_evict_inode
,
2090 .sync_fs
= bch2_sync_fs
,
2091 .statfs
= bch2_statfs
,
2092 .show_devname
= bch2_show_devname
,
2093 .show_options
= bch2_show_options
,
2094 .put_super
= bch2_put_super
,
2095 .freeze_fs
= bch2_freeze
,
2096 .unfreeze_fs
= bch2_unfreeze
,
2099 static int bch2_set_super(struct super_block
*s
, void *data
)
2101 s
->s_fs_info
= data
;
2105 static int bch2_noset_super(struct super_block
*s
, void *data
)
2110 typedef DARRAY(struct bch_fs
*) darray_fs
;
2112 static int bch2_test_super(struct super_block
*s
, void *data
)
2114 struct bch_fs
*c
= s
->s_fs_info
;
2115 darray_fs
*d
= data
;
2120 darray_for_each(*d
, i
)
2126 static int bch2_fs_get_tree(struct fs_context
*fc
)
2129 struct super_block
*sb
;
2130 struct inode
*vinode
;
2131 struct bch2_opts_parse
*opts_parse
= fc
->fs_private
;
2132 struct bch_opts opts
= opts_parse
->opts
;
2134 darray_fs devs_to_fs
= {};
2137 opt_set(opts
, read_only
, (fc
->sb_flags
& SB_RDONLY
) != 0);
2138 opt_set(opts
, nostart
, true);
2140 if (!fc
->source
|| strlen(fc
->source
) == 0)
2143 ret
= bch2_split_devs(fc
->source
, &devs
);
2147 darray_for_each(devs
, i
) {
2148 ret
= darray_push(&devs_to_fs
, bch2_path_to_fs(*i
));
2153 sb
= sget(fc
->fs_type
, bch2_test_super
, bch2_noset_super
, fc
->sb_flags
|SB_NOSEC
, &devs_to_fs
);
2157 c
= bch2_fs_open(devs
.data
, devs
.nr
, opts
);
2158 ret
= PTR_ERR_OR_ZERO(c
);
2162 /* Some options can't be parsed until after the fs is started: */
2163 opts
= bch2_opts_empty();
2164 ret
= bch2_parse_mount_opts(c
, &opts
, NULL
, opts_parse
->parse_later
.buf
);
2168 bch2_opts_apply(&c
->opts
, opts
);
2170 ret
= bch2_fs_start(c
);
2174 sb
= sget(fc
->fs_type
, NULL
, bch2_set_super
, fc
->sb_flags
|SB_NOSEC
, c
);
2175 ret
= PTR_ERR_OR_ZERO(sb
);
2182 if ((fc
->sb_flags
^ sb
->s_flags
) & SB_RDONLY
) {
2189 sb
->s_blocksize
= block_bytes(c
);
2190 sb
->s_blocksize_bits
= ilog2(block_bytes(c
));
2191 sb
->s_maxbytes
= MAX_LFS_FILESIZE
;
2192 sb
->s_op
= &bch_super_operations
;
2193 sb
->s_export_op
= &bch_export_ops
;
2194 #ifdef CONFIG_BCACHEFS_QUOTA
2195 sb
->s_qcop
= &bch2_quotactl_operations
;
2196 sb
->s_quota_types
= QTYPE_MASK_USR
|QTYPE_MASK_GRP
|QTYPE_MASK_PRJ
;
2198 sb
->s_xattr
= bch2_xattr_handlers
;
2199 sb
->s_magic
= BCACHEFS_STATFS_MAGIC
;
2200 sb
->s_time_gran
= c
->sb
.nsec_per_time_unit
;
2201 sb
->s_time_min
= div_s64(S64_MIN
, c
->sb
.time_units_per_sec
) + 1;
2202 sb
->s_time_max
= div_s64(S64_MAX
, c
->sb
.time_units_per_sec
);
2203 sb
->s_uuid
= c
->sb
.user_uuid
;
2204 sb
->s_shrink
->seeks
= 0;
2206 strscpy(sb
->s_id
, c
->name
, sizeof(sb
->s_id
));
2208 ret
= super_setup_bdi(sb
);
2212 sb
->s_bdi
->ra_pages
= VM_READAHEAD_PAGES
;
2214 for_each_online_member(c
, ca
) {
2215 struct block_device
*bdev
= ca
->disk_sb
.bdev
;
2217 /* XXX: create an anonymous device for multi device filesystems */
2219 sb
->s_dev
= bdev
->bd_dev
;
2220 percpu_ref_put(&ca
->io_ref
);
2226 #ifdef CONFIG_BCACHEFS_POSIX_ACL
2228 sb
->s_flags
|= SB_POSIXACL
;
2231 sb
->s_shrink
->seeks
= 0;
2233 vinode
= bch2_vfs_inode_get(c
, BCACHEFS_ROOT_SUBVOL_INUM
);
2234 ret
= PTR_ERR_OR_ZERO(vinode
);
2235 bch_err_msg(c
, ret
, "mounting: error getting root inode");
2239 sb
->s_root
= d_make_root(vinode
);
2241 bch_err(c
, "error mounting: error allocating root dentry");
2246 sb
->s_flags
|= SB_ACTIVE
;
2248 fc
->root
= dget(sb
->s_root
);
2250 darray_exit(&devs_to_fs
);
2251 bch2_darray_str_exit(&devs
);
2253 pr_err("error: %s", bch2_err_str(ret
));
2255 * On an inconsistency error in recovery we might see an -EROFS derived
2256 * errorcode (from the journal), but we don't want to return that to
2257 * userspace as that causes util-linux to retry the mount RO - which is
2260 if (bch2_err_matches(ret
, EROFS
) && ret
!= -EROFS
)
2262 return bch2_err_class(ret
);
2270 deactivate_locked_super(sb
);
2274 static void bch2_kill_sb(struct super_block
*sb
)
2276 struct bch_fs
*c
= sb
->s_fs_info
;
2278 generic_shutdown_super(sb
);
2282 static void bch2_fs_context_free(struct fs_context
*fc
)
2284 struct bch2_opts_parse
*opts
= fc
->fs_private
;
2287 printbuf_exit(&opts
->parse_later
);
2292 static int bch2_fs_parse_param(struct fs_context
*fc
,
2293 struct fs_parameter
*param
)
2296 * the "source" param, i.e., the name of the device(s) to mount,
2297 * is handled by the VFS layer.
2299 if (!strcmp(param
->key
, "source"))
2302 struct bch2_opts_parse
*opts
= fc
->fs_private
;
2303 struct bch_fs
*c
= NULL
;
2305 /* for reconfigure, we already have a struct bch_fs */
2307 c
= fc
->root
->d_sb
->s_fs_info
;
2309 int ret
= bch2_parse_one_mount_opt(c
, &opts
->opts
,
2310 &opts
->parse_later
, param
->key
,
2313 return bch2_err_class(ret
);
2316 static int bch2_fs_reconfigure(struct fs_context
*fc
)
2318 struct super_block
*sb
= fc
->root
->d_sb
;
2319 struct bch2_opts_parse
*opts
= fc
->fs_private
;
2321 return bch2_remount(sb
, &fc
->sb_flags
, opts
->opts
);
2324 static const struct fs_context_operations bch2_context_ops
= {
2325 .free
= bch2_fs_context_free
,
2326 .parse_param
= bch2_fs_parse_param
,
2327 .get_tree
= bch2_fs_get_tree
,
2328 .reconfigure
= bch2_fs_reconfigure
,
2331 static int bch2_init_fs_context(struct fs_context
*fc
)
2333 struct bch2_opts_parse
*opts
= kzalloc(sizeof(*opts
), GFP_KERNEL
);
2338 opts
->parse_later
= PRINTBUF
;
2340 fc
->ops
= &bch2_context_ops
;
2341 fc
->fs_private
= opts
;
2346 void bch2_fs_vfs_exit(struct bch_fs
*c
)
2348 if (c
->vfs_inodes_table
.tbl
)
2349 rhashtable_destroy(&c
->vfs_inodes_table
);
2352 int bch2_fs_vfs_init(struct bch_fs
*c
)
2354 return rhashtable_init(&c
->vfs_inodes_table
, &bch2_vfs_inodes_params
);
2357 static struct file_system_type bcache_fs_type
= {
2358 .owner
= THIS_MODULE
,
2360 .init_fs_context
= bch2_init_fs_context
,
2361 .kill_sb
= bch2_kill_sb
,
2362 .fs_flags
= FS_REQUIRES_DEV
| FS_ALLOW_IDMAP
,
2365 MODULE_ALIAS_FS("bcachefs");
2367 void bch2_vfs_exit(void)
2369 unregister_filesystem(&bcache_fs_type
);
2370 kmem_cache_destroy(bch2_inode_cache
);
2373 int __init
bch2_vfs_init(void)
2377 bch2_inode_cache
= KMEM_CACHE(bch_inode_info
, SLAB_RECLAIM_ACCOUNT
|
2379 if (!bch2_inode_cache
)
2382 ret
= register_filesystem(&bcache_fs_type
);
2392 #endif /* NO_BCACHEFS_FS */