// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 *  (1) On first contact with a server (such as if it has just been rebooted),
 *      the server sends us a CB.InitCallBackState* request.
 *
 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *      calls, the server maintains a time-limited per-vnode promise that it
 *      will send us a CB.CallBack request if a third party alters the vnodes
 *      accessed.
 *
 *      Note that vnode-level callbacks may also be sent for other reasons,
 *      such as filelock release.
 *
 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *      calls, each server maintains a time-limited per-volume promise that it
 *      will send us a CB.CallBack request if the RO volume is updated to a
 *      snapshot of the RW volume ("vos release").  This is an atomic event
 *      that cuts over all instances of the RO volume across multiple servers
 *      simultaneously.
 *
 *      Note that volume-level callbacks may also be sent for other reasons,
 *      such as the volumeserver taking over control of the volume from the
 *      fileserver.
 *
 *      Note also that each server maintains an independent time limit on an
 *      independent callback.
 *
 *  (4) Certain RPC calls include a volume information record "VolSync" in
 *      their reply.  This contains a creation date for the volume that should
 *      remain unchanged for a RW volume (but will be changed if the volume is
 *      restored from backup) or will be bumped to the time of snapshotting
 *      when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of a
 *	vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *      volume and volume's server record.
 *
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *      callback break on all the volumes that have been using that server
 *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *      vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *      dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *      force reentry to the filesystem for revalidation.
 *
 *  (4) When entering the filesystem, we call afs_validate() to check the
 *      validity of a vnode.  This first checks to see if ->cb_v_check and
 *      ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *      exclusively and perform an FS.FetchStatus on the vnode.
 *
 *      After checking the volume, we check the vnode.  If there's a mismatch
 *      between the volume counters and the vnode's mirrors of those counters,
 *      we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *      parsed:
 *
 *	(A) If the Creation timestamp has changed on a RW volume or regressed
 *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	    RO volume, we assume "vos release" happened and try to increment
 *	    ->cb_ro_snapshot.
 *
 *	(B) If the Update timestamp has regressed, we try to increment
 *	    ->cb_scrub.
 *
 *      Note that in both of these cases, we only do the increment if we can
 *      cmpxchg the value of the timestamp from the value we noted before the
 *      op.  This tries to prevent parallel ops from fighting one another.
 *
 *	volume->cb_v_check is then set to ->cb_v_break.
 *
 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *      parsed and used to set the promise in ->cb_expires_at for the vnode,
 *      the volume and the volume's server record.
 *
 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *      the vnode.
 */

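/*
 * Illustration only, not part of the driver: a minimal sketch of how the
 * counters described above are meant to be compared.  The helper name is
 * hypothetical; the fields are the real ones used throughout this file.  A
 * volume needs requerying when the number of break events we've queried the
 * server about (->cb_v_check) lags the number of break events that have
 * occurred (->cb_v_break).
 */
static inline bool afs_example_volume_needs_check(const struct afs_volume *volume)
{
	/* Up to date iff the query counter has caught up with the event
	 * counter (see ->cb_v_check/->cb_v_break above).
	 */
	return atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break);
}
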
/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline ||
	    volume->cb_expires_at <= deadline ||
	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
	    vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
		_debug("inval");
		return false;
	}

	return true;
}

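/*
 * Illustration only, not part of the driver: a hypothetical caller showing
 * how the lockless fast path above pairs with the full revalidation in
 * afs_validate() below, which itself begins with exactly this check.
 */
static inline int afs_example_revalidate(struct afs_vnode *vnode, struct key *key)
{
	/* Fast path: all the promises are still good and nothing has broken. */
	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	/* Slow path: take vnode->validate_lock and consult the server. */
	return afs_validate(vnode, key);
}
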
/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}

	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

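/*
 * Illustration only (hypothetical caller): a return of 1 from
 * afs_update_volume_state() asks the caller to redo the operation from the
 * start, along the lines of:
 *
 *	ret = afs_update_volume_state(op);
 *	if (ret > 0)
 *		goto restart_operation;	// rerun the whole op from the top
 *	if (ret < 0)
 *		goto failed;
 */
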
/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}