// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 *  (1) On first contact with a server (such as if it has just been rebooted),
 *      the server sends us a CB.InitCallBackState* request.
 *
 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *      calls, the server maintains a time-limited per-vnode promise that it
 *      will send us a CB.CallBack request if a third party alters the vnodes
 *      accessed.
 *
 *      Note that vnode-level callbacks may also be sent for other reasons,
 *      such as filelock release.
 *
 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *      calls, each server maintains a time-limited per-volume promise that it
 *      will send us a CB.CallBack request if the RO volume is updated to a
 *      snapshot of the RW volume ("vos release").  This is an atomic event
 *      that cuts over all instances of the RO volume across multiple servers
 *      simultaneously.
 *
 *      Note that volume-level callbacks may also be sent for other reasons,
 *      such as the volumeserver taking over control of the volume from the
 *      fileserver.
 *
 *      Note also that each server maintains an independent time limit on an
 *      independent callback.
 *
 *  (4) Certain RPC calls include a volume information record "VolSync" in
 *      their reply.  This contains a creation date for the volume that should
 *      remain unchanged for a RW volume (but will be changed if the volume is
 *      restored from backup) or will be bumped to the time of snapshotting
 *      when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents of
 *	a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of a
 *	vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *      volume and volume's server record.
 *
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *      callback break on all the volumes that have been using that server
 *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *      vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *      dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *      force reentry to the filesystem for revalidation.
 *
 *  (4) When entering the filesystem, we call afs_validate() to check the
 *      validity of a vnode.  This first checks to see if ->cb_v_check and
 *      ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *      exclusively and perform an FS.FetchStatus on the vnode.
 *
 *      After checking the volume, we check the vnode.  If there's a mismatch
 *      between the volume counters and the vnode's mirrors of those counters,
 *      we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *      parsed:
 *
 *	(A) If the Creation timestamp has changed on a RW volume or regressed
 *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	    RO volume, we assume "vos release" happened and try to increment
 *	    ->cb_ro_snapshot.
 *
 *	(B) If the Update timestamp has regressed, we try to increment
 *	    ->cb_scrub.
 *
 *      Note that in both of these cases, we only do the increment if we can
 *      cmpxchg the value of the timestamp from the value we noted before the
 *      op.  This tries to prevent parallel ops from fighting one another.
 *
 *	volume->cb_v_check is then set to ->cb_v_break.
 *
 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *      parsed and used to set the promise in ->cb_expires_at for the vnode,
 *      the volume and the volume's server record.
 *
 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *      the vnode.
 */

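/*
 * Illustration only, not part of the driver: a minimal sketch of how the
 * counters described above are meant to be compared.  The helper name is
 * hypothetical; the fields are the real ones used throughout this file.  A
 * volume needs requerying when the number of break events we've queried the
 * server about (->cb_v_check) lags the number of break events that have
 * occurred (->cb_v_break).
 */
static inline bool afs_example_volume_needs_check(const struct afs_volume *volume)
{
	/* Up to date iff the query counter has caught up with the event
	 * counter (see ->cb_v_check/->cb_v_break above).
	 */
	return atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break);
}
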
/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline ||
	    volume->cb_expires_at <= deadline ||
	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
	    vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
		_debug("inval");
		return false;
	}

	return true;
}

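/*
 * Illustration only, not part of the driver: a hypothetical caller showing
 * how the lockless fast path above pairs with the full revalidation in
 * afs_validate() below, which itself begins with exactly this check.
 */
static inline int afs_example_revalidate(struct afs_vnode *vnode, struct key *key)
{
	/* Fast path: all the promises are still good and nothing has broken. */
	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	/* Slow path: take vnode->validate_lock and consult the server. */
	return afs_validate(vnode, key);
}
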
/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}

	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

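/*
 * Illustration only (hypothetical caller): a return of 1 from
 * afs_update_volume_state() asks the caller to redo the operation from the
 * start, along the lines of:
 *
 *	ret = afs_update_volume_state(op);
 *	if (ret > 0)
 *		goto restart_operation;	// rerun the whole op from the top
 *	if (ret < 0)
 *		goto failed;
 */
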
/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}