/*
   CTDB mutex helper using Ceph librados locks

   Copyright (C) David Disseldorp 2016-2020

   Based on ctdb_mutex_fcntl_helper.c, which is:
   Copyright (C) Martin Schwenke 2015

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

#include <talloc.h>
#include <tevent.h>

#include "rados/librados.h"
/* fixed lock name/cookie used for the exclusive RADOS advisory lock */
#define CTDB_MUTEX_CEPH_LOCK_NAME	"ctdb_reclock_mutex"
#define CTDB_MUTEX_CEPH_LOCK_COOKIE	CTDB_MUTEX_CEPH_LOCK_NAME
#define CTDB_MUTEX_CEPH_LOCK_DESC	"CTDB cluster lock"
/*
 * During failover it may take up to <lock duration> seconds before the
 * newly elected recovery master can obtain the lock.
 */
#define CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT	10

/* status bytes written to stdout for the parent (ctdb) to consume */
#define CTDB_MUTEX_STATUS_HOLDING "0"
#define CTDB_MUTEX_STATUS_CONTENDED "1"
#define CTDB_MUTEX_STATUS_TIMEOUT "2"
#define CTDB_MUTEX_STATUS_ERROR "3"
43 static char *progname
= NULL
;
45 static void usage(void)
47 fprintf(stderr
, "Usage: %s <Ceph Cluster> <Ceph user> "
48 "<RADOS pool> <RADOS object> "
49 "[lock duration secs] [-n RADOS namespace]\n",
53 static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name
,
54 const char *ceph_auth_name
,
55 const char *pool_name
,
56 const char *namespace,
57 rados_t
*_ceph_cluster
,
58 rados_ioctx_t
*_ioctx
)
60 rados_t ceph_cluster
= NULL
;
61 rados_ioctx_t ioctx
= NULL
;
64 ret
= rados_create2(&ceph_cluster
, ceph_cluster_name
, ceph_auth_name
, 0);
66 fprintf(stderr
, "%s: failed to initialise Ceph cluster %s as %s"
67 " - (%s)\n", progname
, ceph_cluster_name
, ceph_auth_name
,
72 /* path=NULL tells librados to use default locations */
73 ret
= rados_conf_read_file(ceph_cluster
, NULL
);
75 fprintf(stderr
, "%s: failed to parse Ceph cluster config"
76 " - (%s)\n", progname
, strerror(-ret
));
77 rados_shutdown(ceph_cluster
);
81 ret
= rados_connect(ceph_cluster
);
83 fprintf(stderr
, "%s: failed to connect to Ceph cluster %s as %s"
84 " - (%s)\n", progname
, ceph_cluster_name
, ceph_auth_name
,
86 rados_shutdown(ceph_cluster
);
91 ret
= rados_ioctx_create(ceph_cluster
, pool_name
, &ioctx
);
93 fprintf(stderr
, "%s: failed to create Ceph ioctx for pool %s"
94 " - (%s)\n", progname
, pool_name
, strerror(-ret
));
95 rados_shutdown(ceph_cluster
);
99 if (namespace != NULL
) {
100 rados_ioctx_set_namespace(ioctx
, namespace);
103 *_ceph_cluster
= ceph_cluster
;
109 static int ctdb_mutex_rados_lock(rados_ioctx_t
*ioctx
,
111 uint64_t lock_duration_s
,
115 struct timeval tv
= { lock_duration_s
, 0 };
117 ret
= rados_lock_exclusive(ioctx
, oid
,
118 CTDB_MUTEX_CEPH_LOCK_NAME
,
119 CTDB_MUTEX_CEPH_LOCK_COOKIE
,
120 CTDB_MUTEX_CEPH_LOCK_DESC
,
121 lock_duration_s
== 0 ? NULL
: &tv
,
123 if ((ret
== -EEXIST
) || (ret
== -EBUSY
)) {
124 /* lock contention */
126 } else if (ret
< 0) {
127 /* unexpected failure */
129 "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
130 progname
, oid
, strerror(-ret
));
138 static int ctdb_mutex_rados_unlock(rados_ioctx_t
*ioctx
,
143 ret
= rados_unlock(ioctx
, oid
,
144 CTDB_MUTEX_CEPH_LOCK_NAME
,
145 CTDB_MUTEX_CEPH_LOCK_COOKIE
);
148 "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
149 progname
, oid
, strerror(-ret
));
156 struct ctdb_mutex_rados_state
{
158 const char *ceph_cluster_name
;
159 const char *ceph_auth_name
;
160 const char *pool_name
;
161 const char *namespace;
163 uint64_t lock_duration_s
;
165 struct tevent_context
*ev
;
166 struct tevent_signal
*sigterm_ev
;
167 struct tevent_signal
*sigint_ev
;
168 struct tevent_timer
*ppid_timer_ev
;
169 struct tevent_timer
*renew_timer_ev
;
170 rados_t ceph_cluster
;
174 static void ctdb_mutex_rados_sigterm_cb(struct tevent_context
*ev
,
175 struct tevent_signal
*se
,
181 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
184 if (!cmr_state
->holding_mutex
) {
185 fprintf(stderr
, "Sigterm callback invoked without mutex!\n");
189 talloc_free(cmr_state
);
193 static void ctdb_mutex_rados_ppid_timer_cb(struct tevent_context
*ev
,
194 struct tevent_timer
*te
,
195 struct timeval current_time
,
198 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
201 if (!cmr_state
->holding_mutex
) {
202 fprintf(stderr
, "Timer callback invoked without mutex!\n");
204 goto err_ctx_cleanup
;
207 if ((kill(cmr_state
->ppid
, 0) == 0) || (errno
!= ESRCH
)) {
208 /* parent still around, keep waiting */
209 cmr_state
->ppid_timer_ev
= tevent_add_timer(cmr_state
->ev
,
211 tevent_timeval_current_ofs(5, 0),
212 ctdb_mutex_rados_ppid_timer_cb
,
214 if (cmr_state
->ppid_timer_ev
== NULL
) {
215 fprintf(stderr
, "Failed to create timer event\n");
216 /* rely on signal cb */
221 /* parent ended, drop lock (via destructor) and exit */
223 talloc_free(cmr_state
);
227 #define USECS_IN_SEC 1000000
229 static void ctdb_mutex_rados_lock_renew_timer_cb(struct tevent_context
*ev
,
230 struct tevent_timer
*te
,
231 struct timeval current_time
,
234 struct ctdb_mutex_rados_state
*cmr_state
= private_data
;
238 ret
= ctdb_mutex_rados_lock(cmr_state
->ioctx
, cmr_state
->object
,
239 cmr_state
->lock_duration_s
,
240 LIBRADOS_LOCK_FLAG_RENEW
);
242 /* should never get -EEXIST on renewal */
243 fprintf(stderr
, "Lock contention during renew: %d\n", ret
);
244 goto err_ctx_cleanup
;
245 } else if (ret
< 0) {
246 fprintf(stderr
, "Lock renew failed\n");
247 goto err_ctx_cleanup
;
250 tv
= tevent_timeval_current_ofs(0,
251 cmr_state
->lock_duration_s
* (USECS_IN_SEC
/ 2));
252 cmr_state
->renew_timer_ev
= tevent_add_timer(cmr_state
->ev
,
255 ctdb_mutex_rados_lock_renew_timer_cb
,
257 if (cmr_state
->renew_timer_ev
== NULL
) {
258 fprintf(stderr
, "Failed to create timer event\n");
259 goto err_ctx_cleanup
;
265 /* drop lock (via destructor) and exit */
266 talloc_free(cmr_state
);
270 static int ctdb_mutex_rados_state_destroy(struct ctdb_mutex_rados_state
*cmr_state
)
272 if (cmr_state
->holding_mutex
) {
273 ctdb_mutex_rados_unlock(cmr_state
->ioctx
, cmr_state
->object
);
275 if (cmr_state
->ioctx
!= NULL
) {
276 rados_ioctx_destroy(cmr_state
->ioctx
);
278 if (cmr_state
->ceph_cluster
!= NULL
) {
279 rados_shutdown(cmr_state
->ceph_cluster
);
284 /* register this host+service with ceph-mgr for visibility */
285 static int ctdb_mutex_rados_mgr_reg(rados_t ceph_cluster
)
288 uint64_t instance_guid
;
291 instance_guid
= rados_get_instance_id(ceph_cluster
);
292 ret
= snprintf(id_buf
, sizeof(id_buf
), "%s:0x%016llx",
293 "ctdb_mutex_ceph_rados_helper",
294 (unsigned long long)instance_guid
);
295 if (ret
< 0 || ret
>= sizeof(id_buf
)) {
296 fprintf(stderr
, "Ceph instance name too long\n");
297 return -ENAMETOOLONG
;
300 ret
= rados_service_register(ceph_cluster
, "ctdb", id_buf
, "");
302 fprintf(stderr
, "failed to register service with ceph-mgr\n");
309 int main(int argc
, char *argv
[])
313 struct ctdb_mutex_rados_state
*cmr_state
;
323 ret
= setvbuf(stdout
, NULL
, _IONBF
, 0);
325 fprintf(stderr
, "Failed to configure unbuffered stdout I/O\n");
328 cmr_state
= talloc_zero(NULL
, struct ctdb_mutex_rados_state
);
329 if (cmr_state
== NULL
) {
330 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
335 talloc_set_destructor(cmr_state
, ctdb_mutex_rados_state_destroy
);
336 cmr_state
->ceph_cluster_name
= argv
[1];
337 cmr_state
->ceph_auth_name
= argv
[2];
338 cmr_state
->pool_name
= argv
[3];
339 cmr_state
->object
= argv
[4];
342 while ((opt
= getopt(argc
, argv
, "n:")) != -1) {
345 cmr_state
->namespace = optarg
;
350 goto err_ctx_cleanup
;
354 if (argv
[optind
] != NULL
) {
355 /* optional lock duration provided */
357 cmr_state
->lock_duration_s
= strtoull(argv
[optind
], &endptr
, 0);
358 if ((endptr
== argv
[optind
]) || (*endptr
!= '\0')) {
359 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
361 goto err_ctx_cleanup
;
363 if (argv
[++optind
] != NULL
) {
364 /* incorrect count or format for optional arguments */
367 goto err_ctx_cleanup
;
371 cmr_state
->lock_duration_s
372 = CTDB_MUTEX_CEPH_LOCK_DURATION_SECS_DEFAULT
;
375 cmr_state
->ppid
= getppid();
376 if (cmr_state
->ppid
== 1) {
378 * The original parent is gone and the process has
379 * been reparented to init. This can happen if the
380 * helper is started just as the parent is killed
381 * during shutdown. The error message doesn't need to
382 * be stellar, since there won't be anything around to
383 * capture and log it...
385 fprintf(stderr
, "%s: PPID == 1\n", progname
);
387 goto err_ctx_cleanup
;
390 cmr_state
->ev
= tevent_context_init(cmr_state
);
391 if (cmr_state
->ev
== NULL
) {
392 fprintf(stderr
, "tevent_context_init failed\n");
393 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
395 goto err_ctx_cleanup
;
398 /* wait for sigterm */
399 cmr_state
->sigterm_ev
= tevent_add_signal(cmr_state
->ev
, cmr_state
, SIGTERM
, 0,
400 ctdb_mutex_rados_sigterm_cb
,
402 if (cmr_state
->sigterm_ev
== NULL
) {
403 fprintf(stderr
, "Failed to create term signal event\n");
404 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
406 goto err_ctx_cleanup
;
409 cmr_state
->sigint_ev
= tevent_add_signal(cmr_state
->ev
, cmr_state
, SIGINT
, 0,
410 ctdb_mutex_rados_sigterm_cb
,
412 if (cmr_state
->sigint_ev
== NULL
) {
413 fprintf(stderr
, "Failed to create int signal event\n");
414 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
416 goto err_ctx_cleanup
;
419 /* periodically check parent */
420 cmr_state
->ppid_timer_ev
= tevent_add_timer(cmr_state
->ev
, cmr_state
,
421 tevent_timeval_current_ofs(5, 0),
422 ctdb_mutex_rados_ppid_timer_cb
,
424 if (cmr_state
->ppid_timer_ev
== NULL
) {
425 fprintf(stderr
, "Failed to create timer event\n");
426 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
428 goto err_ctx_cleanup
;
431 ret
= ctdb_mutex_rados_ctx_create(cmr_state
->ceph_cluster_name
,
432 cmr_state
->ceph_auth_name
,
433 cmr_state
->pool_name
,
434 cmr_state
->namespace,
435 &cmr_state
->ceph_cluster
,
438 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
439 goto err_ctx_cleanup
;
442 ret
= ctdb_mutex_rados_mgr_reg(cmr_state
->ceph_cluster
);
444 fprintf(stderr
, "Failed to register with ceph-mgr\n");
445 /* ignore: ceph-mgr service registration is informational */
448 ret
= ctdb_mutex_rados_lock(cmr_state
->ioctx
, cmr_state
->object
,
449 cmr_state
->lock_duration_s
,
451 if ((ret
== -EEXIST
) || (ret
== -EBUSY
)) {
452 fprintf(stdout
, CTDB_MUTEX_STATUS_CONTENDED
);
453 goto err_ctx_cleanup
;
454 } else if (ret
< 0) {
455 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
456 goto err_ctx_cleanup
;
458 cmr_state
->holding_mutex
= true;
460 if (cmr_state
->lock_duration_s
!= 0) {
462 * renew (reobtain) the lock, using a period of half the lock
463 * duration. Convert to usecs to avoid rounding.
465 struct timeval tv
= tevent_timeval_current_ofs(0,
466 cmr_state
->lock_duration_s
* (USECS_IN_SEC
/ 2));
467 cmr_state
->renew_timer_ev
= tevent_add_timer(cmr_state
->ev
,
470 ctdb_mutex_rados_lock_renew_timer_cb
,
472 if (cmr_state
->renew_timer_ev
== NULL
) {
473 fprintf(stderr
, "Failed to create timer event\n");
474 fprintf(stdout
, CTDB_MUTEX_STATUS_ERROR
);
476 goto err_ctx_cleanup
;
480 fprintf(stdout
, CTDB_MUTEX_STATUS_HOLDING
);
482 /* wait for the signal / timer events to do their work */
483 ret
= tevent_loop_wait(cmr_state
->ev
);
485 goto err_ctx_cleanup
;
488 talloc_free(cmr_state
);