fs/ceph/locks.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/file.h>
   5 #include <linux/namei.h>
   6 #include <linux/random.h>
   7
   8 #include "super.h"
   9 #include "mds_client.h"
  10 #include <linux/ceph/pagelist.h>
  11
/*
 * Random value XORed into lock owner pointers by secure_addr(); filled in
 * by ceph_flock_init().
 */
static u64 lock_secret;
/*
 * Forward declaration: ceph_lock_message() installs this as the request's
 * r_wait_for_completion hook for blocking lock requests.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
					 struct ceph_mds_request *req);
  15
  16 static inline u64 secure_addr(void *addr)
  17 {
  18         u64 v = lock_secret ^ (u64)(unsigned long)addr;
  19         /*
  20          * Set the most significant bit, so that MDS knows the 'owner'
  21          * is sufficient to identify the owner of lock. (old code uses
  22          * both 'owner' and 'pid')
  23          */
  24         v |= (1ULL << 63);
  25         return v;
  26 }
  27
  28 void __init ceph_flock_init(void)
  29 {
  30         get_random_bytes(&lock_secret, sizeof(lock_secret));
  31 }
  32
  33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
  34 {
  35         struct inode *inode = file_inode(src->fl_file);
  36         atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  37 }
  38
  39 static void ceph_fl_release_lock(struct file_lock *fl)
  40 {
  41         struct inode *inode = file_inode(fl->fl_file);
  42         struct ceph_inode_info *ci = ceph_inode(inode);
  43         if (atomic_dec_and_test(&ci->i_filelock_ref)) {
  44                 /* clear error when all locks are released */
  45                 spin_lock(&ci->i_ceph_lock);
  46                 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
  47                 spin_unlock(&ci->i_ceph_lock);
  48         }
  49 }
  50
/*
 * Lock operations that ceph_lock() and ceph_flock() attach to a file_lock
 * (via fl->fl_ops) so that copies and releases of the lock keep
 * i_filelock_ref in step.
 */
static const struct file_lock_operations ceph_fl_lock_ops = {
	/* takes an i_filelock_ref for the copy */
	.fl_copy_lock = ceph_fl_copy_lock,
	/* drops an i_filelock_ref; clears the error flag on the last one */
	.fl_release_private = ceph_fl_release_lock,
};
  55
  56 /**
  57  * Implement fcntl and flock locking functions.
  58  */
  59 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
  60                              int cmd, u8 wait, struct file_lock *fl)
  61 {
  62         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  63         struct ceph_mds_request *req;
  64         int err;
  65         u64 length = 0;
  66         u64 owner;
  67
  68         if (operation == CEPH_MDS_OP_SETFILELOCK) {
  69                 /*
  70                  * increasing i_filelock_ref closes race window between
  71                  * handling request reply and adding file_lock struct to
  72                  * inode. Otherwise, auth caps may get trimmed in the
  73                  * window. Caller function will decrease the counter.
  74                  */
  75                 fl->fl_ops = &ceph_fl_lock_ops;
  76                 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  77         }
  78
  79         if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
  80                 wait = 0;
  81
  82         req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
  83         if (IS_ERR(req))
  84                 return PTR_ERR(req);
  85         req->r_inode = inode;
  86         ihold(inode);
  87         req->r_num_caps = 1;
  88
  89         /* mds requires start and length rather than start and end */
  90         if (LLONG_MAX == fl->fl_end)
  91                 length = 0;
  92         else
  93                 length = fl->fl_end - fl->fl_start + 1;
  94
  95         owner = secure_addr(fl->fl_owner);
  96
  97         dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
  98              "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
  99              (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
 100              wait, fl->fl_type);
 101
 102         req->r_args.filelock_change.rule = lock_type;
 103         req->r_args.filelock_change.type = cmd;
 104         req->r_args.filelock_change.owner = cpu_to_le64(owner);
 105         req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
 106         req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 107         req->r_args.filelock_change.length = cpu_to_le64(length);
 108         req->r_args.filelock_change.wait = wait;
 109
 110         if (wait)
 111                 req->r_wait_for_completion = ceph_lock_wait_for_completion;
 112
 113         err = ceph_mdsc_do_request(mdsc, inode, req);
 114         if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
 115                 fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 116                 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
 117                         fl->fl_type = F_RDLCK;
 118                 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
 119                         fl->fl_type = F_WRLCK;
 120                 else
 121                         fl->fl_type = F_UNLCK;
 122
 123                 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
 124                 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
 125                                                  le64_to_cpu(req->r_reply_info.filelock_reply->length);
 126                 if (length >= 1)
 127                         fl->fl_end = length -1;
 128                 else
 129                         fl->fl_end = 0;
 130
 131         }
 132         ceph_mdsc_put_request(req);
 133         dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 134              "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
 135              (int)operation, (u64)fl->fl_pid, fl->fl_start,
 136              length, wait, fl->fl_type, err);
 137         return err;
 138 }
 139
 140 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 141                                          struct ceph_mds_request *req)
 142 {
 143         struct ceph_mds_request *intr_req;
 144         struct inode *inode = req->r_inode;
 145         int err, lock_type;
 146
 147         BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
 148         if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
 149                 lock_type = CEPH_LOCK_FCNTL_INTR;
 150         else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
 151                 lock_type = CEPH_LOCK_FLOCK_INTR;
 152         else
 153                 BUG_ON(1);
 154         BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
 155
 156         err = wait_for_completion_interruptible(&req->r_completion);
 157         if (!err)
 158                 return 0;
 159
 160         dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 161              req->r_tid);
 162
 163         mutex_lock(&mdsc->mutex);
 164         if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 165                 err = 0;
 166         } else {
 167                 /*
 168                  * ensure we aren't running concurrently with
 169                  * ceph_fill_trace or ceph_readdir_prepopulate, which
 170                  * rely on locks (dir mutex) held by our caller.
 171                  */
 172                 mutex_lock(&req->r_fill_mutex);
 173                 req->r_err = err;
 174                 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
 175                 mutex_unlock(&req->r_fill_mutex);
 176
 177                 if (!req->r_session) {
 178                         // haven't sent the request
 179                         err = 0;
 180                 }
 181         }
 182         mutex_unlock(&mdsc->mutex);
 183         if (!err)
 184                 return 0;
 185
 186         intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 187                                             USE_AUTH_MDS);
 188         if (IS_ERR(intr_req))
 189                 return PTR_ERR(intr_req);
 190
 191         intr_req->r_inode = inode;
 192         ihold(inode);
 193         intr_req->r_num_caps = 1;
 194
 195         intr_req->r_args.filelock_change = req->r_args.filelock_change;
 196         intr_req->r_args.filelock_change.rule = lock_type;
 197         intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
 198
 199         err = ceph_mdsc_do_request(mdsc, inode, intr_req);
 200         ceph_mdsc_put_request(intr_req);
 201
 202         if (err && err != -ERESTARTSYS)
 203                 return err;
 204
 205         wait_for_completion_killable(&req->r_safe_completion);
 206         return 0;
 207 }
 208
 209 /**
 210  * Attempt to set an fcntl lock.
 211  * For now, this just goes away to the server. Later it may be more awesome.
 212  */
 213 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 214 {
 215         struct inode *inode = file_inode(file);
 216         struct ceph_inode_info *ci = ceph_inode(inode);
 217         int err = 0;
 218         u16 op = CEPH_MDS_OP_SETFILELOCK;
 219         u8 wait = 0;
 220         u8 lock_cmd;
 221
 222         if (!(fl->fl_flags & FL_POSIX))
 223                 return -ENOLCK;
 224         /* No mandatory locks */
 225         if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
 226                 return -ENOLCK;
 227
 228         dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
 229
 230         /* set wait bit as appropriate, then make command as Ceph expects it*/
 231         if (IS_GETLK(cmd))
 232                 op = CEPH_MDS_OP_GETFILELOCK;
 233         else if (IS_SETLKW(cmd))
 234                 wait = 1;
 235
 236         spin_lock(&ci->i_ceph_lock);
 237         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 238                 err = -EIO;
 239         } else if (op == CEPH_MDS_OP_SETFILELOCK) {
 240                 /*
 241                  * increasing i_filelock_ref closes race window between
 242                  * handling request reply and adding file_lock struct to
 243                  * inode. Otherwise, i_auth_cap may get trimmed in the
 244                  * window. Caller function will decrease the counter.
 245                  */
 246                 fl->fl_ops = &ceph_fl_lock_ops;
 247                 atomic_inc(&ci->i_filelock_ref);
 248         }
 249         spin_unlock(&ci->i_ceph_lock);
 250         if (err < 0) {
 251                 if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
 252                         posix_lock_file(file, fl, NULL);
 253                 return err;
 254         }
 255
 256         if (F_RDLCK == fl->fl_type)
 257                 lock_cmd = CEPH_LOCK_SHARED;
 258         else if (F_WRLCK == fl->fl_type)
 259                 lock_cmd = CEPH_LOCK_EXCL;
 260         else
 261                 lock_cmd = CEPH_LOCK_UNLOCK;
 262
 263         err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
 264         if (!err) {
 265                 if (op == CEPH_MDS_OP_SETFILELOCK) {
 266                         dout("mds locked, locking locally\n");
 267                         err = posix_lock_file(file, fl, NULL);
 268                         if (err) {
 269                                 /* undo! This should only happen if
 270                                  * the kernel detects local
 271                                  * deadlock. */
 272                                 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
 273                                                   CEPH_LOCK_UNLOCK, 0, fl);
 274                                 dout("got %d on posix_lock_file, undid lock\n",
 275                                      err);
 276                         }
 277                 }
 278         }
 279         return err;
 280 }
 281
 282 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 283 {
 284         struct inode *inode = file_inode(file);
 285         struct ceph_inode_info *ci = ceph_inode(inode);
 286         int err = 0;
 287         u8 wait = 0;
 288         u8 lock_cmd;
 289
 290         if (!(fl->fl_flags & FL_FLOCK))
 291                 return -ENOLCK;
 292         /* No mandatory locks */
 293         if (fl->fl_type & LOCK_MAND)
 294                 return -EOPNOTSUPP;
 295
 296         dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 297
 298         spin_lock(&ci->i_ceph_lock);
 299         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 300                 err = -EIO;
 301         } else {
 302                 /* see comment in ceph_lock */
 303                 fl->fl_ops = &ceph_fl_lock_ops;
 304                 atomic_inc(&ci->i_filelock_ref);
 305         }
 306         spin_unlock(&ci->i_ceph_lock);
 307         if (err < 0) {
 308                 if (F_UNLCK == fl->fl_type)
 309                         locks_lock_file_wait(file, fl);
 310                 return err;
 311         }
 312
 313         if (IS_SETLKW(cmd))
 314                 wait = 1;
 315
 316         if (F_RDLCK == fl->fl_type)
 317                 lock_cmd = CEPH_LOCK_SHARED;
 318         else if (F_WRLCK == fl->fl_type)
 319                 lock_cmd = CEPH_LOCK_EXCL;
 320         else
 321                 lock_cmd = CEPH_LOCK_UNLOCK;
 322
 323         err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 324                                 inode, lock_cmd, wait, fl);
 325         if (!err) {
 326                 err = locks_lock_file_wait(file, fl);
 327                 if (err) {
 328                         ceph_lock_message(CEPH_LOCK_FLOCK,
 329                                           CEPH_MDS_OP_SETFILELOCK,
 330                                           inode, CEPH_LOCK_UNLOCK, 0, fl);
 331                         dout("got %d on locks_lock_file_wait, undid lock\n", err);
 332                 }
 333         }
 334         return err;
 335 }
 336
 337 /*
 338  * Fills in the passed counter variables, so you can prepare pagelist metadata
 339  * before calling ceph_encode_locks.
 340  */
 341 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 342 {
 343         struct file_lock *lock;
 344         struct file_lock_context *ctx;
 345
 346         *fcntl_count = 0;
 347         *flock_count = 0;
 348
 349         ctx = inode->i_flctx;
 350         if (ctx) {
 351                 spin_lock(&ctx->flc_lock);
 352                 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
 353                         ++(*fcntl_count);
 354                 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
 355                         ++(*flock_count);
 356                 spin_unlock(&ctx->flc_lock);
 357         }
 358         dout("counted %d flock locks and %d fcntl locks\n",
 359              *flock_count, *fcntl_count);
 360 }
 361
 362 /*
 363  * Given a pointer to a lock, convert it to a ceph filelock
 364  */
 365 static int lock_to_ceph_filelock(struct file_lock *lock,
 366                                  struct ceph_filelock *cephlock)
 367 {
 368         int err = 0;
 369         cephlock->start = cpu_to_le64(lock->fl_start);
 370         cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 371         cephlock->client = cpu_to_le64(0);
 372         cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
 373         cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
 374
 375         switch (lock->fl_type) {
 376         case F_RDLCK:
 377                 cephlock->type = CEPH_LOCK_SHARED;
 378                 break;
 379         case F_WRLCK:
 380                 cephlock->type = CEPH_LOCK_EXCL;
 381                 break;
 382         case F_UNLCK:
 383                 cephlock->type = CEPH_LOCK_UNLOCK;
 384                 break;
 385         default:
 386                 dout("Have unknown lock type %d\n", lock->fl_type);
 387                 err = -EINVAL;
 388         }
 389
 390         return err;
 391 }
 392
 393 /**
 394  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 395  * array. Must be called with inode->i_lock already held.
 396  * If we encounter more of a specific lock type than expected, return -ENOSPC.
 397  */
 398 int ceph_encode_locks_to_buffer(struct inode *inode,
 399                                 struct ceph_filelock *flocks,
 400                                 int num_fcntl_locks, int num_flock_locks)
 401 {
 402         struct file_lock *lock;
 403         struct file_lock_context *ctx = inode->i_flctx;
 404         int err = 0;
 405         int seen_fcntl = 0;
 406         int seen_flock = 0;
 407         int l = 0;
 408
 409         dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
 410              num_fcntl_locks);
 411
 412         if (!ctx)
 413                 return 0;
 414
 415         spin_lock(&ctx->flc_lock);
 416         list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
 417                 ++seen_fcntl;
 418                 if (seen_fcntl > num_fcntl_locks) {
 419                         err = -ENOSPC;
 420                         goto fail;
 421                 }
 422                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 423                 if (err)
 424                         goto fail;
 425                 ++l;
 426         }
 427         list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
 428                 ++seen_flock;
 429                 if (seen_flock > num_flock_locks) {
 430                         err = -ENOSPC;
 431                         goto fail;
 432                 }
 433                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 434                 if (err)
 435                         goto fail;
 436                 ++l;
 437         }
 438 fail:
 439         spin_unlock(&ctx->flc_lock);
 440         return err;
 441 }
 442
 443 /**
 444  * Copy the encoded flock and fcntl locks into the pagelist.
 445  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 446  * sequential flock locks.
 447  * Returns zero on success.
 448  */
 449 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
 450                            struct ceph_pagelist *pagelist,
 451                            int num_fcntl_locks, int num_flock_locks)
 452 {
 453         int err = 0;
 454         __le32 nlocks;
 455
 456         nlocks = cpu_to_le32(num_fcntl_locks);
 457         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 458         if (err)
 459                 goto out_fail;
 460
 461         if (num_fcntl_locks > 0) {
 462                 err = ceph_pagelist_append(pagelist, flocks,
 463                                            num_fcntl_locks * sizeof(*flocks));
 464                 if (err)
 465                         goto out_fail;
 466         }
 467
 468         nlocks = cpu_to_le32(num_flock_locks);
 469         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 470         if (err)
 471                 goto out_fail;
 472
 473         if (num_flock_locks > 0) {
 474                 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
 475                                            num_flock_locks * sizeof(*flocks));
 476         }
 477 out_fail:
 478         return err;
 479 }