1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/ceph/ceph_debug.h>
4 #include <linux/file.h>
5 #include <linux/namei.h>
6 #include <linux/random.h>
9 #include "mds_client.h"
10 #include <linux/filelock.h>
11 #include <linux/ceph/pagelist.h>
13 static u64 lock_secret
;
14 static int ceph_lock_wait_for_completion(struct ceph_mds_client
*mdsc
,
15 struct ceph_mds_request
*req
);
17 static inline u64
secure_addr(void *addr
)
19 u64 v
= lock_secret
^ (u64
)(unsigned long)addr
;
21 * Set the most significant bit, so that MDS knows the 'owner'
22 * is sufficient to identify the owner of lock. (old code uses
23 * both 'owner' and 'pid')
29 void __init
ceph_flock_init(void)
31 get_random_bytes(&lock_secret
, sizeof(lock_secret
));
34 static void ceph_fl_copy_lock(struct file_lock
*dst
, struct file_lock
*src
)
36 struct inode
*inode
= file_inode(dst
->c
.flc_file
);
37 atomic_inc(&ceph_inode(inode
)->i_filelock_ref
);
38 dst
->fl_u
.ceph
.inode
= igrab(inode
);
42 * Do not use the 'fl->fl_file' in release function, which
43 * is possibly already released by another thread.
45 static void ceph_fl_release_lock(struct file_lock
*fl
)
47 struct inode
*inode
= fl
->fl_u
.ceph
.inode
;
48 struct ceph_inode_info
*ci
;
51 * If inode is NULL it should be a request file_lock,
57 ci
= ceph_inode(inode
);
58 if (atomic_dec_and_test(&ci
->i_filelock_ref
)) {
59 /* clear error when all locks are released */
60 spin_lock(&ci
->i_ceph_lock
);
61 ci
->i_ceph_flags
&= ~CEPH_I_ERROR_FILELOCK
;
62 spin_unlock(&ci
->i_ceph_lock
);
64 fl
->fl_u
.ceph
.inode
= NULL
;
68 static const struct file_lock_operations ceph_fl_lock_ops
= {
69 .fl_copy_lock
= ceph_fl_copy_lock
,
70 .fl_release_private
= ceph_fl_release_lock
,
74 * Implement fcntl and flock locking functions.
76 static int ceph_lock_message(u8 lock_type
, u16 operation
, struct inode
*inode
,
77 int cmd
, u8 wait
, struct file_lock
*fl
)
79 struct ceph_mds_client
*mdsc
= ceph_sb_to_mdsc(inode
->i_sb
);
80 struct ceph_client
*cl
= mdsc
->fsc
->client
;
81 struct ceph_mds_request
*req
;
86 if (operation
== CEPH_MDS_OP_SETFILELOCK
) {
88 * increasing i_filelock_ref closes race window between
89 * handling request reply and adding file_lock struct to
90 * inode. Otherwise, auth caps may get trimmed in the
91 * window. Caller function will decrease the counter.
93 fl
->fl_ops
= &ceph_fl_lock_ops
;
94 fl
->fl_ops
->fl_copy_lock(fl
, NULL
);
97 if (operation
!= CEPH_MDS_OP_SETFILELOCK
|| cmd
== CEPH_LOCK_UNLOCK
)
100 req
= ceph_mdsc_create_request(mdsc
, operation
, USE_AUTH_MDS
);
103 req
->r_inode
= inode
;
107 /* mds requires start and length rather than start and end */
108 if (LLONG_MAX
== fl
->fl_end
)
111 length
= fl
->fl_end
- fl
->fl_start
+ 1;
113 owner
= secure_addr(fl
->c
.flc_owner
);
115 doutc(cl
, "rule: %d, op: %d, owner: %llx, pid: %llu, "
116 "start: %llu, length: %llu, wait: %d, type: %d\n",
117 (int)lock_type
, (int)operation
, owner
,
119 fl
->fl_start
, length
, wait
, fl
->c
.flc_type
);
121 req
->r_args
.filelock_change
.rule
= lock_type
;
122 req
->r_args
.filelock_change
.type
= cmd
;
123 req
->r_args
.filelock_change
.owner
= cpu_to_le64(owner
);
124 req
->r_args
.filelock_change
.pid
= cpu_to_le64((u64
) fl
->c
.flc_pid
);
125 req
->r_args
.filelock_change
.start
= cpu_to_le64(fl
->fl_start
);
126 req
->r_args
.filelock_change
.length
= cpu_to_le64(length
);
127 req
->r_args
.filelock_change
.wait
= wait
;
129 err
= ceph_mdsc_submit_request(mdsc
, inode
, req
);
131 err
= ceph_mdsc_wait_request(mdsc
, req
, wait
?
132 ceph_lock_wait_for_completion
: NULL
);
133 if (!err
&& operation
== CEPH_MDS_OP_GETFILELOCK
) {
134 fl
->c
.flc_pid
= -le64_to_cpu(req
->r_reply_info
.filelock_reply
->pid
);
135 if (CEPH_LOCK_SHARED
== req
->r_reply_info
.filelock_reply
->type
)
136 fl
->c
.flc_type
= F_RDLCK
;
137 else if (CEPH_LOCK_EXCL
== req
->r_reply_info
.filelock_reply
->type
)
138 fl
->c
.flc_type
= F_WRLCK
;
140 fl
->c
.flc_type
= F_UNLCK
;
142 fl
->fl_start
= le64_to_cpu(req
->r_reply_info
.filelock_reply
->start
);
143 length
= le64_to_cpu(req
->r_reply_info
.filelock_reply
->start
) +
144 le64_to_cpu(req
->r_reply_info
.filelock_reply
->length
);
146 fl
->fl_end
= length
-1;
151 ceph_mdsc_put_request(req
);
152 doutc(cl
, "rule: %d, op: %d, pid: %llu, start: %llu, "
153 "length: %llu, wait: %d, type: %d, err code %d\n",
154 (int)lock_type
, (int)operation
, (u64
) fl
->c
.flc_pid
,
155 fl
->fl_start
, length
, wait
, fl
->c
.flc_type
, err
);
159 static int ceph_lock_wait_for_completion(struct ceph_mds_client
*mdsc
,
160 struct ceph_mds_request
*req
)
162 struct ceph_client
*cl
= mdsc
->fsc
->client
;
163 struct ceph_mds_request
*intr_req
;
164 struct inode
*inode
= req
->r_inode
;
167 BUG_ON(req
->r_op
!= CEPH_MDS_OP_SETFILELOCK
);
168 if (req
->r_args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
169 lock_type
= CEPH_LOCK_FCNTL_INTR
;
170 else if (req
->r_args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
171 lock_type
= CEPH_LOCK_FLOCK_INTR
;
174 BUG_ON(req
->r_args
.filelock_change
.type
== CEPH_LOCK_UNLOCK
);
176 err
= wait_for_completion_interruptible(&req
->r_completion
);
180 doutc(cl
, "request %llu was interrupted\n", req
->r_tid
);
182 mutex_lock(&mdsc
->mutex
);
183 if (test_bit(CEPH_MDS_R_GOT_RESULT
, &req
->r_req_flags
)) {
187 * ensure we aren't running concurrently with
188 * ceph_fill_trace or ceph_readdir_prepopulate, which
189 * rely on locks (dir mutex) held by our caller.
191 mutex_lock(&req
->r_fill_mutex
);
193 set_bit(CEPH_MDS_R_ABORTED
, &req
->r_req_flags
);
194 mutex_unlock(&req
->r_fill_mutex
);
196 if (!req
->r_session
) {
197 // haven't sent the request
201 mutex_unlock(&mdsc
->mutex
);
205 intr_req
= ceph_mdsc_create_request(mdsc
, CEPH_MDS_OP_SETFILELOCK
,
207 if (IS_ERR(intr_req
))
208 return PTR_ERR(intr_req
);
210 intr_req
->r_inode
= inode
;
212 intr_req
->r_num_caps
= 1;
214 intr_req
->r_args
.filelock_change
= req
->r_args
.filelock_change
;
215 intr_req
->r_args
.filelock_change
.rule
= lock_type
;
216 intr_req
->r_args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
218 err
= ceph_mdsc_do_request(mdsc
, inode
, intr_req
);
219 ceph_mdsc_put_request(intr_req
);
221 if (err
&& err
!= -ERESTARTSYS
)
224 wait_for_completion_killable(&req
->r_safe_completion
);
228 static int try_unlock_file(struct file
*file
, struct file_lock
*fl
)
231 unsigned int orig_flags
= fl
->c
.flc_flags
;
232 fl
->c
.flc_flags
|= FL_EXISTS
;
233 err
= locks_lock_file_wait(file
, fl
);
234 fl
->c
.flc_flags
= orig_flags
;
235 if (err
== -ENOENT
) {
236 if (!(orig_flags
& FL_EXISTS
))
244 * Attempt to set an fcntl lock.
245 * For now, this just goes away to the server. Later it may be more awesome.
247 int ceph_lock(struct file
*file
, int cmd
, struct file_lock
*fl
)
249 struct inode
*inode
= file_inode(file
);
250 struct ceph_inode_info
*ci
= ceph_inode(inode
);
251 struct ceph_client
*cl
= ceph_inode_to_client(inode
);
253 u16 op
= CEPH_MDS_OP_SETFILELOCK
;
257 if (!(fl
->c
.flc_flags
& FL_POSIX
))
260 if (ceph_inode_is_shutdown(inode
))
263 doutc(cl
, "fl_owner: %p\n", fl
->c
.flc_owner
);
265 /* set wait bit as appropriate, then make command as Ceph expects it*/
267 op
= CEPH_MDS_OP_GETFILELOCK
;
268 else if (IS_SETLKW(cmd
))
271 spin_lock(&ci
->i_ceph_lock
);
272 if (ci
->i_ceph_flags
& CEPH_I_ERROR_FILELOCK
) {
275 spin_unlock(&ci
->i_ceph_lock
);
277 if (op
== CEPH_MDS_OP_SETFILELOCK
&& lock_is_unlock(fl
))
278 posix_lock_file(file
, fl
, NULL
);
282 if (lock_is_read(fl
))
283 lock_cmd
= CEPH_LOCK_SHARED
;
284 else if (lock_is_write(fl
))
285 lock_cmd
= CEPH_LOCK_EXCL
;
287 lock_cmd
= CEPH_LOCK_UNLOCK
;
289 if (op
== CEPH_MDS_OP_SETFILELOCK
&& lock_is_unlock(fl
)) {
290 err
= try_unlock_file(file
, fl
);
295 err
= ceph_lock_message(CEPH_LOCK_FCNTL
, op
, inode
, lock_cmd
, wait
, fl
);
297 if (op
== CEPH_MDS_OP_SETFILELOCK
&& F_UNLCK
!= fl
->c
.flc_type
) {
298 doutc(cl
, "locking locally\n");
299 err
= posix_lock_file(file
, fl
, NULL
);
301 /* undo! This should only happen if
302 * the kernel detects local
304 ceph_lock_message(CEPH_LOCK_FCNTL
, op
, inode
,
305 CEPH_LOCK_UNLOCK
, 0, fl
);
306 doutc(cl
, "got %d on posix_lock_file, undid lock\n",
314 int ceph_flock(struct file
*file
, int cmd
, struct file_lock
*fl
)
316 struct inode
*inode
= file_inode(file
);
317 struct ceph_inode_info
*ci
= ceph_inode(inode
);
318 struct ceph_client
*cl
= ceph_inode_to_client(inode
);
323 if (!(fl
->c
.flc_flags
& FL_FLOCK
))
326 if (ceph_inode_is_shutdown(inode
))
329 doutc(cl
, "fl_file: %p\n", fl
->c
.flc_file
);
331 spin_lock(&ci
->i_ceph_lock
);
332 if (ci
->i_ceph_flags
& CEPH_I_ERROR_FILELOCK
) {
335 spin_unlock(&ci
->i_ceph_lock
);
337 if (lock_is_unlock(fl
))
338 locks_lock_file_wait(file
, fl
);
345 if (lock_is_read(fl
))
346 lock_cmd
= CEPH_LOCK_SHARED
;
347 else if (lock_is_write(fl
))
348 lock_cmd
= CEPH_LOCK_EXCL
;
350 lock_cmd
= CEPH_LOCK_UNLOCK
;
352 if (lock_is_unlock(fl
)) {
353 err
= try_unlock_file(file
, fl
);
358 err
= ceph_lock_message(CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
,
359 inode
, lock_cmd
, wait
, fl
);
360 if (!err
&& F_UNLCK
!= fl
->c
.flc_type
) {
361 err
= locks_lock_file_wait(file
, fl
);
363 ceph_lock_message(CEPH_LOCK_FLOCK
,
364 CEPH_MDS_OP_SETFILELOCK
,
365 inode
, CEPH_LOCK_UNLOCK
, 0, fl
);
366 doutc(cl
, "got %d on locks_lock_file_wait, undid lock\n",
374 * Fills in the passed counter variables, so you can prepare pagelist metadata
375 * before calling ceph_encode_locks.
377 void ceph_count_locks(struct inode
*inode
, int *fcntl_count
, int *flock_count
)
379 struct ceph_client
*cl
= ceph_inode_to_client(inode
);
380 struct file_lock
*lock
;
381 struct file_lock_context
*ctx
;
386 ctx
= locks_inode_context(inode
);
388 spin_lock(&ctx
->flc_lock
);
389 for_each_file_lock(lock
, &ctx
->flc_posix
)
391 for_each_file_lock(lock
, &ctx
->flc_flock
)
393 spin_unlock(&ctx
->flc_lock
);
395 doutc(cl
, "counted %d flock locks and %d fcntl locks\n",
396 *flock_count
, *fcntl_count
);
400 * Given a pointer to a lock, convert it to a ceph filelock
402 static int lock_to_ceph_filelock(struct inode
*inode
,
403 struct file_lock
*lock
,
404 struct ceph_filelock
*cephlock
)
406 struct ceph_client
*cl
= ceph_inode_to_client(inode
);
409 cephlock
->start
= cpu_to_le64(lock
->fl_start
);
410 cephlock
->length
= cpu_to_le64(lock
->fl_end
- lock
->fl_start
+ 1);
411 cephlock
->client
= cpu_to_le64(0);
412 cephlock
->pid
= cpu_to_le64((u64
) lock
->c
.flc_pid
);
413 cephlock
->owner
= cpu_to_le64(secure_addr(lock
->c
.flc_owner
));
415 switch (lock
->c
.flc_type
) {
417 cephlock
->type
= CEPH_LOCK_SHARED
;
420 cephlock
->type
= CEPH_LOCK_EXCL
;
423 cephlock
->type
= CEPH_LOCK_UNLOCK
;
426 doutc(cl
, "Have unknown lock type %d\n",
435 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
436 * array. Must be called with inode->i_lock already held.
437 * If we encounter more of a specific lock type than expected, return -ENOSPC.
439 int ceph_encode_locks_to_buffer(struct inode
*inode
,
440 struct ceph_filelock
*flocks
,
441 int num_fcntl_locks
, int num_flock_locks
)
443 struct file_lock
*lock
;
444 struct file_lock_context
*ctx
= locks_inode_context(inode
);
445 struct ceph_client
*cl
= ceph_inode_to_client(inode
);
451 doutc(cl
, "encoding %d flock and %d fcntl locks\n", num_flock_locks
,
457 spin_lock(&ctx
->flc_lock
);
458 for_each_file_lock(lock
, &ctx
->flc_posix
) {
460 if (seen_fcntl
> num_fcntl_locks
) {
464 err
= lock_to_ceph_filelock(inode
, lock
, &flocks
[l
]);
469 for_each_file_lock(lock
, &ctx
->flc_flock
) {
471 if (seen_flock
> num_flock_locks
) {
475 err
= lock_to_ceph_filelock(inode
, lock
, &flocks
[l
]);
481 spin_unlock(&ctx
->flc_lock
);
486 * Copy the encoded flock and fcntl locks into the pagelist.
487 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
488 * sequential flock locks.
489 * Returns zero on success.
491 int ceph_locks_to_pagelist(struct ceph_filelock
*flocks
,
492 struct ceph_pagelist
*pagelist
,
493 int num_fcntl_locks
, int num_flock_locks
)
498 nlocks
= cpu_to_le32(num_fcntl_locks
);
499 err
= ceph_pagelist_append(pagelist
, &nlocks
, sizeof(nlocks
));
503 if (num_fcntl_locks
> 0) {
504 err
= ceph_pagelist_append(pagelist
, flocks
,
505 num_fcntl_locks
* sizeof(*flocks
));
510 nlocks
= cpu_to_le32(num_flock_locks
);
511 err
= ceph_pagelist_append(pagelist
, &nlocks
, sizeof(nlocks
));
515 if (num_flock_locks
> 0) {
516 err
= ceph_pagelist_append(pagelist
, &flocks
[num_fcntl_locks
],
517 num_flock_locks
* sizeof(*flocks
));