fs/ceph/locks.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/file.h>
   5 #include <linux/namei.h>
   6 #include <linux/random.h>
   7
   8 #include "super.h"
   9 #include "mds_client.h"
  10 #include <linux/ceph/pagelist.h>
  11
  12 static u64 lock_secret;
  13 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
  14                                          struct ceph_mds_request *req);
  15
  16 static inline u64 secure_addr(void *addr)
  17 {
  18         u64 v = lock_secret ^ (u64)(unsigned long)addr;
  19         /*
  20          * Set the most significant bit, so that MDS knows the 'owner'
  21          * is sufficient to identify the owner of lock. (old code uses
  22          * both 'owner' and 'pid')
  23          */
  24         v |= (1ULL << 63);
  25         return v;
  26 }
  27
  28 void __init ceph_flock_init(void)
  29 {
  30         get_random_bytes(&lock_secret, sizeof(lock_secret));
  31 }
  32
  33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
  34 {
  35         struct inode *inode = file_inode(src->fl_file);
  36         atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  37 }
  38
  39 static void ceph_fl_release_lock(struct file_lock *fl)
  40 {
  41         struct inode *inode = file_inode(fl->fl_file);
  42         struct ceph_inode_info *ci = ceph_inode(inode);
  43         if (atomic_dec_and_test(&ci->i_filelock_ref)) {
  44                 /* clear error when all locks are released */
  45                 spin_lock(&ci->i_ceph_lock);
  46                 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
  47                 spin_unlock(&ci->i_ceph_lock);
  48         }
  49 }
  50
  51 static const struct file_lock_operations ceph_fl_lock_ops = {
  52         .fl_copy_lock = ceph_fl_copy_lock,
  53         .fl_release_private = ceph_fl_release_lock,
  54 };
  55
  56 /**
  57  * Implement fcntl and flock locking functions.
  58  */
  59 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
  60                              int cmd, u8 wait, struct file_lock *fl)
  61 {
  62         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  63         struct ceph_mds_request *req;
  64         int err;
  65         u64 length = 0;
  66         u64 owner;
  67
  68         if (operation == CEPH_MDS_OP_SETFILELOCK) {
  69                 /*
  70                  * increasing i_filelock_ref closes race window between
  71                  * handling request reply and adding file_lock struct to
  72                  * inode. Otherwise, auth caps may get trimmed in the
  73                  * window. Caller function will decrease the counter.
  74                  */
  75                 fl->fl_ops = &ceph_fl_lock_ops;
  76                 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  77         }
  78
  79         if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
  80                 wait = 0;
  81
  82         req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
  83         if (IS_ERR(req))
  84                 return PTR_ERR(req);
  85         req->r_inode = inode;
  86         ihold(inode);
  87         req->r_num_caps = 1;
  88
  89         /* mds requires start and length rather than start and end */
  90         if (LLONG_MAX == fl->fl_end)
  91                 length = 0;
  92         else
  93                 length = fl->fl_end - fl->fl_start + 1;
  94
  95         owner = secure_addr(fl->fl_owner);
  96
  97         dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
  98              "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
  99              (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
 100              wait, fl->fl_type);
 101
 102         req->r_args.filelock_change.rule = lock_type;
 103         req->r_args.filelock_change.type = cmd;
 104         req->r_args.filelock_change.owner = cpu_to_le64(owner);
 105         req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
 106         req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 107         req->r_args.filelock_change.length = cpu_to_le64(length);
 108         req->r_args.filelock_change.wait = wait;
 109
 110         if (wait)
 111                 req->r_wait_for_completion = ceph_lock_wait_for_completion;
 112
 113         err = ceph_mdsc_do_request(mdsc, inode, req);
 114
 115         if (operation == CEPH_MDS_OP_GETFILELOCK) {
 116                 fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 117                 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
 118                         fl->fl_type = F_RDLCK;
 119                 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
 120                         fl->fl_type = F_WRLCK;
 121                 else
 122                         fl->fl_type = F_UNLCK;
 123
 124                 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
 125                 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
 126                                                  le64_to_cpu(req->r_reply_info.filelock_reply->length);
 127                 if (length >= 1)
 128                         fl->fl_end = length -1;
 129                 else
 130                         fl->fl_end = 0;
 131
 132         }
 133         ceph_mdsc_put_request(req);
 134         dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 135              "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
 136              (int)operation, (u64)fl->fl_pid, fl->fl_start,
 137              length, wait, fl->fl_type, err);
 138         return err;
 139 }
 140
 141 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 142                                          struct ceph_mds_request *req)
 143 {
 144         struct ceph_mds_request *intr_req;
 145         struct inode *inode = req->r_inode;
 146         int err, lock_type;
 147
 148         BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
 149         if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
 150                 lock_type = CEPH_LOCK_FCNTL_INTR;
 151         else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
 152                 lock_type = CEPH_LOCK_FLOCK_INTR;
 153         else
 154                 BUG_ON(1);
 155         BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
 156
 157         err = wait_for_completion_interruptible(&req->r_completion);
 158         if (!err)
 159                 return 0;
 160
 161         dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 162              req->r_tid);
 163
 164         mutex_lock(&mdsc->mutex);
 165         if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 166                 err = 0;
 167         } else {
 168                 /*
 169                  * ensure we aren't running concurrently with
 170                  * ceph_fill_trace or ceph_readdir_prepopulate, which
 171                  * rely on locks (dir mutex) held by our caller.
 172                  */
 173                 mutex_lock(&req->r_fill_mutex);
 174                 req->r_err = err;
 175                 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
 176                 mutex_unlock(&req->r_fill_mutex);
 177
 178                 if (!req->r_session) {
 179                         // haven't sent the request
 180                         err = 0;
 181                 }
 182         }
 183         mutex_unlock(&mdsc->mutex);
 184         if (!err)
 185                 return 0;
 186
 187         intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 188                                             USE_AUTH_MDS);
 189         if (IS_ERR(intr_req))
 190                 return PTR_ERR(intr_req);
 191
 192         intr_req->r_inode = inode;
 193         ihold(inode);
 194         intr_req->r_num_caps = 1;
 195
 196         intr_req->r_args.filelock_change = req->r_args.filelock_change;
 197         intr_req->r_args.filelock_change.rule = lock_type;
 198         intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
 199
 200         err = ceph_mdsc_do_request(mdsc, inode, intr_req);
 201         ceph_mdsc_put_request(intr_req);
 202
 203         if (err && err != -ERESTARTSYS)
 204                 return err;
 205
 206         wait_for_completion_killable(&req->r_safe_completion);
 207         return 0;
 208 }
 209
 210 /**
 211  * Attempt to set an fcntl lock.
 212  * For now, this just goes away to the server. Later it may be more awesome.
 213  */
 214 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 215 {
 216         struct inode *inode = file_inode(file);
 217         struct ceph_inode_info *ci = ceph_inode(inode);
 218         int err = 0;
 219         u16 op = CEPH_MDS_OP_SETFILELOCK;
 220         u8 wait = 0;
 221         u8 lock_cmd;
 222
 223         if (!(fl->fl_flags & FL_POSIX))
 224                 return -ENOLCK;
 225         /* No mandatory locks */
 226         if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
 227                 return -ENOLCK;
 228
 229         dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
 230
 231         /* set wait bit as appropriate, then make command as Ceph expects it*/
 232         if (IS_GETLK(cmd))
 233                 op = CEPH_MDS_OP_GETFILELOCK;
 234         else if (IS_SETLKW(cmd))
 235                 wait = 1;
 236
 237         spin_lock(&ci->i_ceph_lock);
 238         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 239                 err = -EIO;
 240         } else if (op == CEPH_MDS_OP_SETFILELOCK) {
 241                 /*
 242                  * increasing i_filelock_ref closes race window between
 243                  * handling request reply and adding file_lock struct to
 244                  * inode. Otherwise, i_auth_cap may get trimmed in the
 245                  * window. Caller function will decrease the counter.
 246                  */
 247                 fl->fl_ops = &ceph_fl_lock_ops;
 248                 atomic_inc(&ci->i_filelock_ref);
 249         }
 250         spin_unlock(&ci->i_ceph_lock);
 251         if (err < 0) {
 252                 if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
 253                         posix_lock_file(file, fl, NULL);
 254                 return err;
 255         }
 256
 257         if (F_RDLCK == fl->fl_type)
 258                 lock_cmd = CEPH_LOCK_SHARED;
 259         else if (F_WRLCK == fl->fl_type)
 260                 lock_cmd = CEPH_LOCK_EXCL;
 261         else
 262                 lock_cmd = CEPH_LOCK_UNLOCK;
 263
 264         err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
 265         if (!err) {
 266                 if (op == CEPH_MDS_OP_SETFILELOCK) {
 267                         dout("mds locked, locking locally\n");
 268                         err = posix_lock_file(file, fl, NULL);
 269                         if (err) {
 270                                 /* undo! This should only happen if
 271                                  * the kernel detects local
 272                                  * deadlock. */
 273                                 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
 274                                                   CEPH_LOCK_UNLOCK, 0, fl);
 275                                 dout("got %d on posix_lock_file, undid lock\n",
 276                                      err);
 277                         }
 278                 }
 279         }
 280         return err;
 281 }
 282
 283 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 284 {
 285         struct inode *inode = file_inode(file);
 286         struct ceph_inode_info *ci = ceph_inode(inode);
 287         int err = 0;
 288         u8 wait = 0;
 289         u8 lock_cmd;
 290
 291         if (!(fl->fl_flags & FL_FLOCK))
 292                 return -ENOLCK;
 293         /* No mandatory locks */
 294         if (fl->fl_type & LOCK_MAND)
 295                 return -EOPNOTSUPP;
 296
 297         dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 298
 299         spin_lock(&ci->i_ceph_lock);
 300         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 301                 err = -EIO;
 302         } else {
 303                 /* see comment in ceph_lock */
 304                 fl->fl_ops = &ceph_fl_lock_ops;
 305                 atomic_inc(&ci->i_filelock_ref);
 306         }
 307         spin_unlock(&ci->i_ceph_lock);
 308         if (err < 0) {
 309                 if (F_UNLCK == fl->fl_type)
 310                         locks_lock_file_wait(file, fl);
 311                 return err;
 312         }
 313
 314         if (IS_SETLKW(cmd))
 315                 wait = 1;
 316
 317         if (F_RDLCK == fl->fl_type)
 318                 lock_cmd = CEPH_LOCK_SHARED;
 319         else if (F_WRLCK == fl->fl_type)
 320                 lock_cmd = CEPH_LOCK_EXCL;
 321         else
 322                 lock_cmd = CEPH_LOCK_UNLOCK;
 323
 324         err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 325                                 inode, lock_cmd, wait, fl);
 326         if (!err) {
 327                 err = locks_lock_file_wait(file, fl);
 328                 if (err) {
 329                         ceph_lock_message(CEPH_LOCK_FLOCK,
 330                                           CEPH_MDS_OP_SETFILELOCK,
 331                                           inode, CEPH_LOCK_UNLOCK, 0, fl);
 332                         dout("got %d on locks_lock_file_wait, undid lock\n", err);
 333                 }
 334         }
 335         return err;
 336 }
 337
 338 /*
 339  * Fills in the passed counter variables, so you can prepare pagelist metadata
 340  * before calling ceph_encode_locks.
 341  */
 342 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 343 {
 344         struct file_lock *lock;
 345         struct file_lock_context *ctx;
 346
 347         *fcntl_count = 0;
 348         *flock_count = 0;
 349
 350         ctx = inode->i_flctx;
 351         if (ctx) {
 352                 spin_lock(&ctx->flc_lock);
 353                 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
 354                         ++(*fcntl_count);
 355                 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
 356                         ++(*flock_count);
 357                 spin_unlock(&ctx->flc_lock);
 358         }
 359         dout("counted %d flock locks and %d fcntl locks\n",
 360              *flock_count, *fcntl_count);
 361 }
 362
 363 /*
 364  * Given a pointer to a lock, convert it to a ceph filelock
 365  */
 366 static int lock_to_ceph_filelock(struct file_lock *lock,
 367                                  struct ceph_filelock *cephlock)
 368 {
 369         int err = 0;
 370         cephlock->start = cpu_to_le64(lock->fl_start);
 371         cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 372         cephlock->client = cpu_to_le64(0);
 373         cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
 374         cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
 375
 376         switch (lock->fl_type) {
 377         case F_RDLCK:
 378                 cephlock->type = CEPH_LOCK_SHARED;
 379                 break;
 380         case F_WRLCK:
 381                 cephlock->type = CEPH_LOCK_EXCL;
 382                 break;
 383         case F_UNLCK:
 384                 cephlock->type = CEPH_LOCK_UNLOCK;
 385                 break;
 386         default:
 387                 dout("Have unknown lock type %d\n", lock->fl_type);
 388                 err = -EINVAL;
 389         }
 390
 391         return err;
 392 }
 393
 394 /**
 395  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 396  * array. Must be called with inode->i_lock already held.
 397  * If we encounter more of a specific lock type than expected, return -ENOSPC.
 398  */
 399 int ceph_encode_locks_to_buffer(struct inode *inode,
 400                                 struct ceph_filelock *flocks,
 401                                 int num_fcntl_locks, int num_flock_locks)
 402 {
 403         struct file_lock *lock;
 404         struct file_lock_context *ctx = inode->i_flctx;
 405         int err = 0;
 406         int seen_fcntl = 0;
 407         int seen_flock = 0;
 408         int l = 0;
 409
 410         dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
 411              num_fcntl_locks);
 412
 413         if (!ctx)
 414                 return 0;
 415
 416         spin_lock(&ctx->flc_lock);
 417         list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
 418                 ++seen_fcntl;
 419                 if (seen_fcntl > num_fcntl_locks) {
 420                         err = -ENOSPC;
 421                         goto fail;
 422                 }
 423                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 424                 if (err)
 425                         goto fail;
 426                 ++l;
 427         }
 428         list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
 429                 ++seen_flock;
 430                 if (seen_flock > num_flock_locks) {
 431                         err = -ENOSPC;
 432                         goto fail;
 433                 }
 434                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 435                 if (err)
 436                         goto fail;
 437                 ++l;
 438         }
 439 fail:
 440         spin_unlock(&ctx->flc_lock);
 441         return err;
 442 }
 443
 444 /**
 445  * Copy the encoded flock and fcntl locks into the pagelist.
 446  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 447  * sequential flock locks.
 448  * Returns zero on success.
 449  */
 450 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
 451                            struct ceph_pagelist *pagelist,
 452                            int num_fcntl_locks, int num_flock_locks)
 453 {
 454         int err = 0;
 455         __le32 nlocks;
 456
 457         nlocks = cpu_to_le32(num_fcntl_locks);
 458         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 459         if (err)
 460                 goto out_fail;
 461
 462         if (num_fcntl_locks > 0) {
 463                 err = ceph_pagelist_append(pagelist, flocks,
 464                                            num_fcntl_locks * sizeof(*flocks));
 465                 if (err)
 466                         goto out_fail;
 467         }
 468
 469         nlocks = cpu_to_le32(num_flock_locks);
 470         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 471         if (err)
 472                 goto out_fail;
 473
 474         if (num_flock_locks > 0) {
 475                 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
 476                                            num_flock_locks * sizeof(*flocks));
 477         }
 478 out_fail:
 479         return err;
 480 }