s3:utils: Fix 'Usage:' for 'net ads enctypes'
[samba4-gss.git] / ctdb / server / ctdb_daemon.c
blob46bc324ae8795dd5488007ddd4accf038fcd13f9
1 /*
2 ctdb daemon code
4 Copyright (C) Andrew Tridgell 2006
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "replace.h"
21 #include "system/network.h"
22 #include "system/filesys.h"
23 #include "system/wait.h"
24 #include "system/time.h"
26 #include <talloc.h>
27 /* Allow use of deprecated function tevent_loop_allow_nesting() */
28 #define TEVENT_DEPRECATED
29 #include <tevent.h>
30 #include <tdb.h>
32 #include "lib/tdb_wrap/tdb_wrap.h"
33 #include "lib/util/dlinklist.h"
34 #include "lib/util/debug.h"
35 #include "lib/util/time.h"
36 #include "lib/util/blocking.h"
37 #include "lib/util/become_daemon.h"
39 #include "version.h"
40 #include "ctdb_private.h"
41 #include "ctdb_client.h"
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
46 #include "common/rb_tree.h"
47 #include "common/reqid.h"
48 #include "common/system.h"
49 #include "common/common.h"
50 #include "common/logging.h"
51 #include "common/pidfile.h"
52 #include "common/sock_io.h"
54 #include "conf/node.h"
/*
 * Per-client PID tracking record, linked into ctdb->client_pids so the
 * daemon can map a process ID back to its client connection.
 */
struct ctdb_client_pid_list {
	struct ctdb_client_pid_list *next, *prev; /* dlinklist linkage */
	struct ctdb_context *ctdb;   /* owning daemon context */
	pid_t pid;                   /* peer pid of the connected client */
	struct ctdb_client *client;  /* the client connection itself */
};
/* Path of the daemon's PID file; set up elsewhere before startup */
const char *ctdbd_pidfile = NULL;
/* Context holding the created PID file so it is cleaned up on exit */
static struct pidfile_context *ctdbd_pidfile_ctx = NULL;

static void daemon_incoming_packet(void *, struct ctdb_req_header *);

/* PID of the main ctdbd process; used to keep forked children from
 * running main-process-only exit logic (see print_exit_message()) */
static pid_t __ctdbd_pid;
70 static void print_exit_message(void)
72 if (getpid() == __ctdbd_pid) {
73 DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
75 /* Wait a second to allow pending log messages to be flushed */
76 sleep(1);
#ifdef HAVE_GETRUSAGE

/*
 * Snapshot of CPU accounting taken by ctdb_cpu_check_threshold();
 * utilisation is computed from the difference of two snapshots.
 */
struct cpu_check_threshold_data {
	unsigned short percent;   /* utilisation over the last interval, % */
	struct timeval timeofday; /* wall-clock time of the snapshot */
	struct timeval ru_time;   /* cumulative user+system CPU time */
};
/*
 * Periodic self-check of the daemon's CPU utilisation.
 *
 * Every 60s, computes (user+system CPU time) / (wall-clock time) since
 * the previous invocation.  Logs a warning when utilisation reaches the
 * threshold (default 90%, overridable for testing via the
 * CTDB_TEST_CPU_USAGE_THRESHOLD environment variable) and again when it
 * drops back below it.  Always reschedules itself at the end.
 */
static void ctdb_cpu_check_threshold(struct tevent_context *ev,
				     struct tevent_timer *te,
				     struct timeval tv,
				     void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type_abort(
		private_data, struct ctdb_context);
	uint32_t interval = 60;	/* seconds between checks */

	/* static: threshold is parsed once; prev carries state between runs */
	static unsigned short threshold = 0;
	static struct cpu_check_threshold_data prev = {
		.percent = 0,
		.timeofday = { .tv_sec = 0 },
		.ru_time = { .tv_sec = 0 },
	};

	struct rusage usage;
	struct cpu_check_threshold_data curr = {
		.percent = 0,
	};
	int64_t ru_time_diff, timeofday_diff;
	bool first;
	int ret;

	/*
	 * Cache the threshold so that we don't waste time checking
	 * the environment variable every time
	 */
	if (threshold == 0) {
		const char *t;

		threshold = 90;

		t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
		if (t != NULL) {
			int th;

			th = atoi(t);
			if (th <= 0 || th > 100) {
				DBG_WARNING("Failed to parse env var: %s\n", t);
			} else {
				threshold = th;
			}
		}
	}

	ret = getrusage(RUSAGE_SELF, &usage);
	if (ret != 0) {
		DBG_WARNING("rusage() failed: %d\n", ret);
		goto next;	/* skip update of prev, just reschedule */
	}

	/* Sum the system and user CPU usage */
	curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);

	curr.timeofday = tv;

	first = timeval_is_zero(&prev.timeofday);
	if (first) {
		/* No previous values recorded so no calculation to do */
		goto done;
	}

	timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
	if (timeofday_diff <= 0) {
		/*
		 * Time went backwards or didn't progress so no (sane)
		 * calculation can be done
		 */
		goto done;
	}

	ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);

	curr.percent = ru_time_diff * 100 / timeofday_diff;

	if (curr.percent >= threshold) {
		/* Log only if the utilisation changes */
		if (curr.percent != prev.percent) {
			D_WARNING("WARNING: CPU utilisation %hu%% >= "
				  "threshold (%hu%%)\n",
				  curr.percent,
				  threshold);
		}
	} else {
		/* Log if the utilisation falls below the threshold */
		if (prev.percent >= threshold) {
			D_WARNING("WARNING: CPU utilisation %hu%% < "
				  "threshold (%hu%%)\n",
				  curr.percent,
				  threshold);
		}
	}

done:
	prev = curr;

next:
	/* self-rescheduling timer */
	tevent_add_timer(ctdb->ev, ctdb,
			 timeval_current_ofs(interval, 0),
			 ctdb_cpu_check_threshold,
			 ctdb);
}
192 static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
194 tevent_add_timer(ctdb->ev, ctdb,
195 timeval_current(),
196 ctdb_cpu_check_threshold,
197 ctdb);
199 #endif /* HAVE_GETRUSAGE */
201 static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
202 struct timeval t, void *private_data)
204 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
206 if (getpid() != ctdb->ctdbd_pid) {
207 return;
210 tevent_add_timer(ctdb->ev, ctdb,
211 timeval_current_ofs(1, 0),
212 ctdb_time_tick, ctdb);
/* Used to trigger a dummy event once per second, to make
 * detection of hangs more reliable.
 */
static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
{
	/* first tick fires in one second; ctdb_time_tick() reschedules */
	tevent_add_timer(ctdb->ev, ctdb,
			 timeval_current_ofs(1, 0),
			 ctdb_time_tick, ctdb);
}
/* Kick off all of the daemon's recurring maintenance events. */
static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
{
	/* start monitoring for connected/disconnected nodes */
	ctdb_start_keepalive(ctdb);

	/* start periodic update of tcp tickle lists */
	ctdb_start_tcp_tickle_update(ctdb);

	/* start listening for recovery daemon pings */
	ctdb_control_recd_ping(ctdb);

	/* start listening to timer ticks */
	ctdb_start_time_tickd(ctdb);

#ifdef HAVE_GETRUSAGE
	ctdb_start_cpu_check_threshold(ctdb);
#endif /* HAVE_GETRUSAGE */
}
244 static void ignore_signal(int signum)
246 struct sigaction act;
248 memset(&act, 0, sizeof(act));
250 act.sa_handler = SIG_IGN;
251 sigemptyset(&act.sa_mask);
252 sigaddset(&act.sa_mask, signum);
253 sigaction(signum, &act, NULL);
258 send a packet to a client
260 static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
262 CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
263 if (hdr->operation == CTDB_REQ_MESSAGE) {
264 if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
265 DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
266 talloc_free(client);
267 return -1;
270 return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
/*
  message handler for when we are in daemon mode. This redirects the message
  to the right client
 */
static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
				   void *private_data)
{
	struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
	struct ctdb_req_message_old *r;
	int len;

	/* construct a message to send to the client containing the data */
	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
	r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
			       len, struct ctdb_req_message_old);
	CTDB_NO_MEMORY_VOID(client->ctdb, r);

	talloc_set_name_const(r, "req_message packet");

	r->srvid = srvid;
	r->datalen = data.dsize;
	memcpy(&r->data[0], data.dptr, data.dsize);

	/* the queue copies the packet, so it can be freed afterwards */
	daemon_queue_send(client, &r->hdr);

	talloc_free(r);
}
/*
  this is called when the ctdb daemon received a ctdb request to
  set the srvid from the client
 */
int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
{
	struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
	int res;

	if (client == NULL) {
		DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
		return -1;
	}

	/* future messages for srvid will be forwarded to this client */
	res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
			     client);
	if (res != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
				 (unsigned long long)srvid));
	} else {
		DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
				  (unsigned long long)srvid));
	}

	return res;
}
327 this is called when the ctdb daemon received a ctdb request to
328 remove a srvid from the client
330 int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
332 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
333 if (client == NULL) {
334 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
335 return -1;
337 return srvid_deregister(ctdb->srv, srvid, client);
/* Forward a tunnel message to the local client that registered for
 * this tunnel_id: copy the incoming packet into a freshly allocated
 * CTDB_REQ_TUNNEL packet and queue it to the client. */
void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
			   void *private_data)
{
	struct ctdb_client *client =
		talloc_get_type_abort(private_data, struct ctdb_client);
	struct ctdb_req_tunnel_old *c, *pkt;
	size_t len;

	/* data holds a complete serialized tunnel packet */
	pkt = (struct ctdb_req_tunnel_old *)data.dptr;

	len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
	c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
			       len, struct ctdb_req_tunnel_old);
	if (c == NULL) {
		DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
		return;
	}

	talloc_set_name_const(c, "req_tunnel packet");

	c->tunnel_id = tunnel_id;
	c->flags = pkt->flags;
	c->datalen = pkt->datalen;
	memcpy(c->data, pkt->data, pkt->datalen);

	/* the queue copies the packet, so free our copy afterwards */
	daemon_queue_send(client, &c->hdr);

	talloc_free(c);
}
/*
  destroy a ctdb_client
 */
static int ctdb_client_destructor(struct ctdb_client *client)
{
	struct ctdb_db_context *ctdb_db;

	ctdb_takeover_client_destructor_hook(client);
	reqid_remove(client->ctdb->idr, client->client_id);
	client->ctdb->num_clients--;

	/* a client dying mid-update would leave the cluster inconsistent,
	 * so force a recovery in that case */
	if (client->num_persistent_updates != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
		client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
	}
	ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
	if (ctdb_db) {
		DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
				  "commit active. Forcing recovery.\n"));
		client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;

		/*
		 * trans3 transaction state:
		 *
		 * The destructor sets the pointer to NULL.
		 */
		talloc_free(ctdb_db->persistent_state);
	}

	return 0;
}
/*
  this is called when the ctdb daemon received a ctdb request message
  from a local client over the unix domain socket
 */
static void daemon_request_message_from_client(struct ctdb_client *client,
					       struct ctdb_req_message_old *c)
{
	TDB_DATA data;
	int res;

	/* CTDB_CURRENT_NODE is resolved to our own pnn before routing */
	if (c->hdr.destnode == CTDB_CURRENT_NODE) {
		c->hdr.destnode = ctdb_get_pnn(client->ctdb);
	}

	/* maybe the message is for another client on this node */
	if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
		ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
		return;
	}

	/* its for a remote node */
	data.dptr = &c->data[0];
	data.dsize = c->datalen;
	res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
				       c->srvid, data);
	if (res != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
				 c->hdr.destnode));
	}
}
/* Per-call bookkeeping carried from daemon_request_call_from_client()
 * to daemon_call_from_client_callback(). */
struct daemon_call_state {
	struct ctdb_client *client;	/* client the reply goes back to */
	uint32_t reqid;			/* request id echoed in the reply */
	struct ctdb_call *call;
	struct timeval start_time;	/* for call latency accounting */

	/* readonly request ? */
	uint32_t readonly_fetch;	/* 1 if FETCH was remapped to
					 * FETCH_WITH_HEADER */
	uint32_t client_callid;		/* call id the client originally sent */
};
/*
  complete a call from a client

  Builds a CTDB_REPLY_CALL packet from the finished call state and
  queues it back to the originating client.  If the client had asked
  for a plain FETCH that was remapped to FETCH_WITH_HEADER, the extra
  ltdb header is stripped from the reply data first.
 */
static void daemon_call_from_client_callback(struct ctdb_call_state *state)
{
	struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
							   struct daemon_call_state);
	struct ctdb_reply_call_old *r;
	int res;
	uint32_t length;
	struct ctdb_client *client = dstate->client;
	struct ctdb_db_context *ctdb_db = state->ctdb_db;

	/* reparent so dstate (and the call) survive the state teardown */
	talloc_steal(client, dstate);
	talloc_steal(dstate, dstate->call);

	res = ctdb_daemon_call_recv(state, dstate->call);
	if (res != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
		CTDB_DECREMENT_STAT(client->ctdb, pending_calls);

		CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
		return;
	}

	length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
	/* If the client asked for readonly FETCH, we remapped this to
	   FETCH_WITH_HEADER when calling the daemon. So we must
	   strip the extra header off the reply data before passing
	   it back to the client.
	*/
	if (dstate->readonly_fetch
	    && dstate->client_callid == CTDB_FETCH_FUNC) {
		length -= sizeof(struct ctdb_ltdb_header);
	}

	r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
			       length, struct ctdb_reply_call_old);
	if (r == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
		CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
		CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
		return;
	}
	r->hdr.reqid = dstate->reqid;
	r->status = dstate->call->status;

	if (dstate->readonly_fetch
	    && dstate->client_callid == CTDB_FETCH_FUNC) {
		/* client only asked for a FETCH so we must strip off
		   the extra ctdb_ltdb header
		*/
		r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
		memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
	} else {
		r->datalen = dstate->call->reply_data.dsize;
		memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
	}

	res = daemon_queue_send(client, &r->hdr);
	if (res == -1) {
		/* client is dead - return immediately */
		return;
	}
	if (res != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
	}
	CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
	CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
	talloc_free(dstate);
}
/* Identifies the originating client by id (not pointer) so a requeued
 * packet can detect that the client has since disconnected. */
struct ctdb_daemon_packet_wrap {
	struct ctdb_context *ctdb;
	uint32_t client_id;
};
/*
  a wrapper to catch disconnected clients
 */
static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
{
	struct ctdb_client *client;
	struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
							    struct ctdb_daemon_packet_wrap);
	if (w == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
		return;
	}

	/* look the client up by id; it may have gone away while the
	 * packet was queued */
	client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
	if (client == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
				 w->client_id));
		talloc_free(w);
		return;
	}
	talloc_free(w);

	/* process it */
	daemon_incoming_packet(client, hdr);
}
/* One deferred duplicate fetch request, queued until the in-flight
 * fetch for the same key completes. */
struct ctdb_deferred_fetch_call {
	struct ctdb_deferred_fetch_call *next, *prev;
	struct ctdb_req_call_old *c;		/* the deferred request */
	struct ctdb_daemon_packet_wrap *w;	/* origin client id + ctdb */
};

/* FIFO of deferred calls for one key; freeing it (destructor) requeues
 * every pending call. */
struct ctdb_deferred_fetch_queue {
	struct ctdb_deferred_fetch_call *deferred_calls;
};

/* Pairs a deferred call with its (still connected) client for the
 * zero-timeout requeue event. */
struct ctdb_deferred_requeue {
	struct ctdb_deferred_fetch_call *dfc;
	struct ctdb_client *client;
};
564 /* called from a timer event and starts reprocessing the deferred call.*/
565 static void reprocess_deferred_call(struct tevent_context *ev,
566 struct tevent_timer *te,
567 struct timeval t, void *private_data)
569 struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
570 struct ctdb_client *client = dfr->client;
572 talloc_steal(client, dfr->dfc->c);
573 daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
574 talloc_free(dfr);
/* the referral context is destroyed either after a timeout or when the initial
   fetch-lock has finished.
   at this stage, immediately start reprocessing the queued up deferred
   calls so they get reprocessed immediately (and since we are dmaster at
   this stage, trigger the waiting smbd processes to pick up and acquire the
   record right away.
*/
static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
{
	/* need to reprocess the packets from the queue explicitly instead of
	   just using a normal destructor since we need to
	   call the clients in the same order as the requests queued up
	*/
	while (dfq->deferred_calls != NULL) {
		struct ctdb_client *client;
		struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
		struct ctdb_deferred_requeue *dfr;

		DLIST_REMOVE(dfq->deferred_calls, dfc);

		/* the client may have disconnected while its call was
		 * deferred; drop those silently */
		client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
		if (client == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
					 dfc->w->client_id));
			continue;
		}

		/* process it by pushing it back onto the eventloop */
		dfr = talloc(client, struct ctdb_deferred_requeue);
		if (dfr == NULL) {
			DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
			continue;
		}

		dfr->dfc = talloc_steal(dfr, dfc);
		dfr->client = client;

		/* zero timeout: fires on the next event-loop iteration */
		tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
				 reprocess_deferred_call, dfr);
	}

	return 0;
}
622 /* insert the new deferral context into the rb tree.
623 there should never be a pre-existing context here, but check for it
624 warn and destroy the previous context if there is already a deferral context
625 for this key.
627 static void *insert_dfq_callback(void *parm, void *data)
629 if (data) {
630 DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
631 talloc_free(data);
633 return parm;
/* if the original fetch-lock did not complete within a reasonable time,
   free the context and context for all deferred requests to cause them to be
   re-inserted into the event system.
*/
static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
			struct timeval t, void *private_data)
{
	/* freeing the queue runs deferred_fetch_queue_destructor(),
	 * which requeues all the deferred calls */
	talloc_free(private_data);
}
/* This function is used in the local daemon to register a KEY in a database
   for being "fetched"
   While the remote fetch is in-flight, any further attempts to re-fetch the
   same record will be deferred until the fetch completes.
*/
static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
{
	uint32_t *k;
	struct ctdb_deferred_fetch_queue *dfq;

	k = ctdb_key_to_idkey(call, call->key);
	if (k == NULL) {
		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
		return -1;
	}

	dfq = talloc(call, struct ctdb_deferred_fetch_queue);
	if (dfq == NULL) {
		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
		talloc_free(k);
		return -1;
	}
	dfq->deferred_calls = NULL;

	/* register the (empty) queue under this key; the callback warns
	 * about and replaces any stale pre-existing queue */
	trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);

	/* freeing dfq (timeout or fetch completion) requeues deferrals */
	talloc_set_destructor(dfq, deferred_fetch_queue_destructor);

	/* If the fetch hasn't completed in 30 seconds, just tear it all down
	   and let it try again as the events are reissued */
	tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
			 dfq_timeout, dfq);

	talloc_free(k);
	return 0;
}
/* check if this is a duplicate request to a fetch already in-flight
   if it is, make this call deferred to be reprocessed later when
   the in-flight fetch completes.

   Returns 0 when the call was deferred, -1 when no fetch is in flight
   for this key (or on allocation failure) and the caller should
   process the call normally.
*/
static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
{
	uint32_t *k;
	struct ctdb_deferred_fetch_queue *dfq;
	struct ctdb_deferred_fetch_call *dfc;

	k = ctdb_key_to_idkey(c, key);
	if (k == NULL) {
		DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
		return -1;
	}

	/* no queue registered means no fetch in flight for this key */
	dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
	if (dfq == NULL) {
		talloc_free(k);
		return -1;
	}

	talloc_free(k);

	dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
	if (dfc == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
		return -1;
	}

	dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
	if (dfc->w == NULL) {
		DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
		talloc_free(dfc);
		return -1;
	}

	/* take ownership of the request packet; it is replayed later */
	dfc->c = talloc_steal(dfc, c);
	dfc->w->ctdb = ctdb_db->ctdb;
	dfc->w->client_id = client->client_id;

	DLIST_ADD_END(dfq->deferred_calls, dfc);

	return 0;
}
/*
  this is called when the ctdb daemon received a ctdb request call
  from a local client over the unix domain socket

  Looks up the database, takes the record lock (possibly requeueing the
  packet until the lock is available), handles read-only delegation
  revocation, collapses duplicate fetches, and finally dispatches the
  call either locally (we are dmaster) or to the remote dmaster.
 */
static void daemon_request_call_from_client(struct ctdb_client *client,
					    struct ctdb_req_call_old *c)
{
	struct ctdb_call_state *state;
	struct ctdb_db_context *ctdb_db;
	struct daemon_call_state *dstate;
	struct ctdb_call *call;
	struct ctdb_ltdb_header header;
	TDB_DATA key, data;
	int ret;
	struct ctdb_context *ctdb = client->ctdb;
	struct ctdb_daemon_packet_wrap *w;

	CTDB_INCREMENT_STAT(ctdb, total_calls);
	CTDB_INCREMENT_STAT(ctdb, pending_calls);

	ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
	if (!ctdb_db) {
		DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
				  c->db_id));
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}

	if (ctdb_db->unhealthy_reason) {
		/*
		 * this is just a warning, as the tdb should be empty anyway,
		 * and only persistent databases can be unhealthy, which doesn't
		 * use this code patch
		 */
		DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
				     ctdb_db->db_name, ctdb_db->unhealthy_reason));
	}

	key.dptr = c->data;
	key.dsize = c->keylen;

	w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
	CTDB_NO_MEMORY_VOID(ctdb, w);

	w->ctdb = ctdb;
	w->client_id = client->client_id;

	/* ret == -2 means the lock is busy and the packet was requeued;
	 * the wrapper (owning w) will re-run it later */
	ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
					   (struct ctdb_req_header *)c, &data,
					   daemon_incoming_packet_wrap, w, true);
	if (ret == -2) {
		/* will retry later */
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}

	talloc_free(w);

	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}

	/* check if this fetch request is a duplicate for a
	   request we already have in flight. If so defer it until
	   the first request completes.
	*/
	if (ctdb->tunable.fetch_collapse == 1) {
		if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
			ret = ctdb_ltdb_unlock(ctdb_db, key);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
			}
			CTDB_DECREMENT_STAT(ctdb, pending_calls);
			talloc_free(data.dptr);
			return;
		}
	}

	/* Dont do READONLY if we don't have a tracking database */
	if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
		c->flags &= ~CTDB_WANT_READONLY;
	}

	/* a completed revoke: clear the read-only flags and drop the
	 * tracking record */
	if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
		header.flags &= ~CTDB_REC_RO_FLAGS;
		CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
		CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
			ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
		}
		/* and clear out the tracking data */
		if (tdb_delete(ctdb_db->rottdb, key) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
		}
	}

	/* if we are revoking, we must defer all other calls until the revoke
	 * had completed.
	 */
	if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
		talloc_free(data.dptr);
		ret = ctdb_ltdb_unlock(ctdb_db, key);

		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
		}
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}

	/* we are dmaster, the client wants a writable copy, and read-only
	 * delegations exist: start revoking them and defer this call */
	if ((header.dmaster == ctdb->pnn)
	    && (!(c->flags & CTDB_WANT_READONLY))
	    && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
		header.flags |= CTDB_REC_RO_REVOKING_READONLY;
		if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
			ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
		}
		ret = ctdb_ltdb_unlock(ctdb_db, key);

		if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
			ctdb_fatal(ctdb, "Failed to start record revoke");
		}
		talloc_free(data.dptr);

		if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
			ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
		}

		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}

	dstate = talloc(client, struct daemon_call_state);
	if (dstate == NULL) {
		ret = ctdb_ltdb_unlock(ctdb_db, key);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
		}

		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		return;
	}
	dstate->start_time = timeval_current();
	dstate->client = client;
	dstate->reqid = c->hdr.reqid;
	talloc_steal(dstate, data.dptr);

	call = dstate->call = talloc_zero(dstate, struct ctdb_call);
	if (call == NULL) {
		ret = ctdb_ltdb_unlock(ctdb_db, key);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
		}

		DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
		return;
	}

	dstate->readonly_fetch = 0;
	call->call_id = c->callid;
	call->key = key;
	call->call_data.dptr = c->data + c->keylen;
	call->call_data.dsize = c->calldatalen;
	call->flags = c->flags;

	if (c->flags & CTDB_WANT_READONLY) {
		/* client wants readonly record, so translate this into a
		   fetch with header. remember what the client asked for
		   so we can remap the reply back to the proper format for
		   the client in the reply
		 */
		dstate->client_callid = call->call_id;
		call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
		dstate->readonly_fetch = 1;
	}

	if (header.dmaster == ctdb->pnn) {
		state = ctdb_call_local_send(ctdb_db, call, &header, &data);
	} else {
		state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
		if (ctdb->tunable.fetch_collapse == 1) {
			/* This request triggered a remote fetch-lock.
			   set up a deferral for this key so any additional
			   fetch-locks are deferred until the current one
			   finishes.
			 */
			setup_deferred_fetch_locks(ctdb_db, call);
		}
	}

	ret = ctdb_ltdb_unlock(ctdb_db, key);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
	}

	if (state == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
		CTDB_DECREMENT_STAT(ctdb, pending_calls);
		CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
		return;
	}
	talloc_steal(state, dstate);
	talloc_steal(client, state);

	/* completion is handled in daemon_call_from_client_callback() */
	state->async.fn = daemon_call_from_client_callback;
	state->async.private_data = dstate;
}
946 static void daemon_request_control_from_client(struct ctdb_client *client,
947 struct ctdb_req_control_old *c);
948 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
949 struct ctdb_req_tunnel_old *c);
/* data contains a packet from the client: validate magic/version and
 * dispatch on the operation code. */
static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
{
	struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
	TALLOC_CTX *tmp_ctx;
	struct ctdb_context *ctdb = client->ctdb;

	/* place the packet as a child of a tmp_ctx. We then use
	   talloc_free() below to free it. If any of the calls want
	   to keep it, then they will steal it somewhere else, and the
	   talloc_free() will be a no-op */
	tmp_ctx = talloc_new(client);
	talloc_steal(tmp_ctx, hdr);

	if (hdr->ctdb_magic != CTDB_MAGIC) {
		ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
		goto done;
	}

	if (hdr->ctdb_version != CTDB_PROTOCOL) {
		ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
		goto done;
	}

	switch (hdr->operation) {
	case CTDB_REQ_CALL:
		CTDB_INCREMENT_STAT(ctdb, client.req_call);
		daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
		break;

	case CTDB_REQ_MESSAGE:
		CTDB_INCREMENT_STAT(ctdb, client.req_message);
		daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
		break;

	case CTDB_REQ_CONTROL:
		CTDB_INCREMENT_STAT(ctdb, client.req_control);
		daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
		break;

	case CTDB_REQ_TUNNEL:
		CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
		daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
		break;

	default:
		DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
				  hdr->operation));
	}

done:
	talloc_free(tmp_ctx);
}
/*
  called when the daemon gets a incoming packet
 */
static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
{
	struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
	struct ctdb_req_header *hdr;

	/* cnt == 0 means EOF: the client hung up */
	if (cnt == 0) {
		talloc_free(client);
		return;
	}

	CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);

	if (cnt < sizeof(*hdr)) {
		/* NOTE(review): this path returns without freeing 'data',
		 * unlike err_out below — presumably a leak on malformed
		 * input; confirm ownership contract of the read callback */
		ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
			       (unsigned)cnt);
		return;
	}
	hdr = (struct ctdb_req_header *)data;

	if (hdr->ctdb_magic != CTDB_MAGIC) {
		ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
		goto err_out;
	}

	if (hdr->ctdb_version != CTDB_PROTOCOL) {
		ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
		goto err_out;
	}

	DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
		 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
		 hdr->srcnode, hdr->destnode));

	/* it is the responsibility of the incoming packet function to free 'data' */
	daemon_incoming_packet(client, hdr);
	return;

err_out:
	TALLOC_FREE(data);
}
1050 static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
1052 if (client_pid->ctdb->client_pids != NULL) {
1053 DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
1056 return 0;
/* Allocate a new client id for 'client', guaranteeing it is neither 0
 * nor REQID_INVALID.  Returns 0 on success, EINVAL when the id space is
 * exhausted. */
static int get_new_client_id(struct reqid_context *idr,
			     struct ctdb_client *client,
			     uint32_t *out)
{
	uint32_t client_id;

	client_id = reqid_new(idr, client);
	/*
	 * Some places in the code (e.g. ctdb_control_db_attach(),
	 * ctdb_control_db_detach()) assign a special meaning to
	 * client_id 0.  The assumption is that if client_id is 0 then
	 * the control has come from another daemon.  Therefore, we
	 * should never return client_id == 0.
	 */
	if (client_id == 0) {
		/*
		 * Don't leak ID 0.  This is safe because the ID keeps
		 * increasing.  A test will be added to ensure that
		 * this doesn't change.
		 */
		reqid_remove(idr, 0);

		client_id = reqid_new(idr, client);
	}

	if (client_id == REQID_INVALID) {
		return EINVAL;
	}

	if (client_id == 0) {
		/* Every other ID must have been used and we can't use 0 */
		reqid_remove(idr, 0);
		return EINVAL;
	}

	*out = client_id;
	return 0;
}
1098 static void ctdb_accept_client(struct tevent_context *ev,
1099 struct tevent_fd *fde, uint16_t flags,
1100 void *private_data)
1102 struct sockaddr_un addr;
1103 socklen_t len;
1104 int fd;
1105 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1106 struct ctdb_client *client;
1107 struct ctdb_client_pid_list *client_pid;
1108 pid_t peer_pid = 0;
1109 int ret;
1111 memset(&addr, 0, sizeof(addr));
1112 len = sizeof(addr);
1113 fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
1114 if (fd == -1) {
1115 return;
1117 smb_set_close_on_exec(fd);
1119 ret = set_blocking(fd, false);
1120 if (ret != 0) {
1121 DEBUG(DEBUG_ERR,
1122 (__location__
1123 " failed to set socket non-blocking (%s)\n",
1124 strerror(errno)));
1125 close(fd);
1126 return;
1129 set_close_on_exec(fd);
1131 DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
1133 client = talloc_zero(ctdb, struct ctdb_client);
1134 if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
1135 DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
1138 client->ctdb = ctdb;
1139 client->fd = fd;
1141 ret = get_new_client_id(ctdb->idr, client, &client->client_id);
1142 if (ret != 0) {
1143 DBG_ERR("Unable to get client ID (%d)\n", ret);
1144 close(fd);
1145 talloc_free(client);
1146 return;
1149 client->pid = peer_pid;
1151 client_pid = talloc(client, struct ctdb_client_pid_list);
1152 if (client_pid == NULL) {
1153 DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
1154 close(fd);
1155 talloc_free(client);
1156 return;
1158 client_pid->ctdb = ctdb;
1159 client_pid->pid = peer_pid;
1160 client_pid->client = client;
1162 DLIST_ADD(ctdb->client_pids, client_pid);
1164 client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
1165 ctdb_daemon_read_cb, client,
1166 "client-%u", client->pid);
1168 talloc_set_destructor(client, ctdb_client_destructor);
1169 talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
1170 ctdb->num_clients++;
1176 * Create a unix domain socket, bind it, secure it and listen. Return
1177 * the file descriptor for the socket.
1179 static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled)
1181 struct sockaddr_un addr = { .sun_family = AF_UNIX };
1182 int ret;
1184 ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
1185 if (ctdb->daemon.sd == -1) {
1186 return -1;
1189 strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
1191 if (! sock_clean(ctdb->daemon.name)) {
1192 return -1;
1195 set_close_on_exec(ctdb->daemon.sd);
1197 ret = set_blocking(ctdb->daemon.sd, false);
1198 if (ret != 0) {
1199 DBG_ERR("Failed to set socket non-blocking (%s)\n",
1200 strerror(errno));
1201 goto failed;
1204 ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr));
1205 if (ret == -1) {
1206 D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name);
1207 goto failed;
1210 if (!test_mode_enabled) {
1211 ret = chown(ctdb->daemon.name, geteuid(), getegid());
1212 if (ret != 0 && !test_mode_enabled) {
1213 D_ERR("Unable to secure (chown) ctdb socket '%s'\n",
1214 ctdb->daemon.name);
1215 goto failed;
1219 ret = chmod(ctdb->daemon.name, 0700);
1220 if (ret != 0) {
1221 D_ERR("Unable to secure (chmod) ctdb socket '%s'\n",
1222 ctdb->daemon.name);
1223 goto failed;
1227 ret = listen(ctdb->daemon.sd, 100);
1228 if (ret != 0) {
1229 D_ERR("Unable to listen on ctdb socket '%s'\n",
1230 ctdb->daemon.name);
1231 goto failed;
1234 D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name);
1235 return 0;
1237 failed:
1238 close(ctdb->daemon.sd);
1239 ctdb->daemon.sd = -1;
1240 return -1;
1243 struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn)
1245 struct ctdb_node *node = NULL;
1246 unsigned int i;
1248 if (pnn == CTDB_CURRENT_NODE) {
1249 pnn = ctdb->pnn;
1252 /* Always found: PNN correctly set just before this is called */
1253 for (i = 0; i < ctdb->num_nodes; i++) {
1254 node = ctdb->nodes[i];
1255 if (pnn == node->pnn) {
1256 return node;
1260 return NULL;
1263 static void initialise_node_flags (struct ctdb_context *ctdb)
1265 struct ctdb_node *node = NULL;
1267 node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
1269 * PNN correctly set just before this is called so always
1270 * found but keep static analysers happy...
1272 if (node == NULL) {
1273 DBG_ERR("Unable to find current node\n");
1274 return;
1277 node->flags &= ~NODE_FLAGS_DISCONNECTED;
1279 /* do we start out in DISABLED mode? */
1280 if (ctdb->start_as_disabled != 0) {
1281 D_ERR("This node is configured to start in DISABLED state\n");
1282 node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
1284 /* do we start out in STOPPED mode? */
1285 if (ctdb->start_as_stopped != 0) {
1286 D_ERR("This node is configured to start in STOPPED state\n");
1287 node->flags |= NODE_FLAGS_STOPPED;
/*
 * Completion callback for the "setup" event.
 *
 * A failed setup event is fatal.  On success the "setup" notification
 * script is run, the recovery daemon is started (failure to start it
 * exits the daemon), periodic events are enabled, and
 * ctdb_wait_for_first_recovery() is invoked.
 */
static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
				      void *private_data)
{
	if (status != 0) {
		ctdb_die(ctdb, "Failed to run setup event");
	}
	ctdb_run_notification_script(ctdb, "setup");

	/* Start the recovery daemon */
	if (ctdb_start_recoverd(ctdb) != 0) {
		DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
		exit(11);
	}

	ctdb_start_periodic_events(ctdb);

	ctdb_wait_for_first_recovery(ctdb);
}
/* Timestamps bracketing the tevent wait, used by ctdb_tevent_trace()
 * to detect both long-running event handlers and long periods with no
 * events at all. */
static struct timeval tevent_before_wait_ts;
static struct timeval tevent_after_wait_ts;
1313 static void ctdb_tevent_trace_init(void)
1315 struct timeval now;
1317 now = timeval_current();
1319 tevent_before_wait_ts = now;
1320 tevent_after_wait_ts = now;
1323 static void ctdb_tevent_trace(enum tevent_trace_point tp,
1324 void *private_data)
1326 struct timeval diff;
1327 struct timeval now;
1328 struct ctdb_context *ctdb =
1329 talloc_get_type(private_data, struct ctdb_context);
1331 if (getpid() != ctdb->ctdbd_pid) {
1332 return;
1335 now = timeval_current();
1337 switch (tp) {
1338 case TEVENT_TRACE_BEFORE_WAIT:
1339 diff = tevent_timeval_until(&tevent_after_wait_ts, &now);
1340 if (diff.tv_sec > 3) {
1341 DEBUG(DEBUG_ERR,
1342 ("Handling event took %ld seconds!\n",
1343 (long)diff.tv_sec));
1345 tevent_before_wait_ts = now;
1346 break;
1348 case TEVENT_TRACE_AFTER_WAIT:
1349 diff = tevent_timeval_until(&tevent_before_wait_ts, &now);
1350 if (diff.tv_sec > 3) {
1351 DEBUG(DEBUG_ERR,
1352 ("No event for %ld seconds!\n",
1353 (long)diff.tv_sec));
1355 tevent_after_wait_ts = now;
1356 break;
1358 default:
1359 /* Do nothing for future tevent trace points */ ;
/* atexit() handler: release the pidfile context, which removes the
 * PID file on normal daemon exit */
static void ctdb_remove_pidfile(void)
{
	TALLOC_FREE(ctdbd_pidfile_ctx);
}
1368 static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
1370 if (ctdbd_pidfile != NULL) {
1371 int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
1372 &ctdbd_pidfile_ctx);
1373 if (ret != 0) {
1374 DEBUG(DEBUG_ERR,
1375 ("Failed to create PID file %s\n",
1376 ctdbd_pidfile));
1377 exit(11);
1380 DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
1381 atexit(ctdb_remove_pidfile);
1385 static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
1387 unsigned int i, j, count;
1389 /* initialize the vnn mapping table, skipping any deleted nodes */
1390 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
1391 CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
1393 count = 0;
1394 for (i = 0; i < ctdb->num_nodes; i++) {
1395 if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
1396 count++;
1400 ctdb->vnn_map->generation = INVALID_GENERATION;
1401 ctdb->vnn_map->size = count;
1402 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
1403 CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
1405 for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
1406 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1407 continue;
1409 ctdb->vnn_map->map[j] = i;
1410 j++;
/*
 * Determine this node's PNN from its node address.  Fatal if the
 * address has not been set or is not found in the nodes list.
 */
static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
{
	if (ctdb->address == NULL) {
		ctdb_fatal(ctdb,
			   "Can not determine PNN - node address is not set\n");
	}

	ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
	if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
		ctdb_fatal(ctdb,
			   "Can not determine PNN - unknown node address\n");
	}

	D_NOTICE("PNN is %u\n", ctdb->pnn);
}
1430 static void stdin_handler(struct tevent_context *ev,
1431 struct tevent_fd *fde,
1432 uint16_t flags,
1433 void *private_data)
1435 struct ctdb_context *ctdb = talloc_get_type_abort(
1436 private_data, struct ctdb_context);
1437 ssize_t nread;
1438 char c;
1440 nread = read(STDIN_FILENO, &c, 1);
1441 if (nread != 1) {
1442 D_ERR("stdin closed, exiting\n");
1443 talloc_free(fde);
1444 ctdb_shutdown_sequence(ctdb, EPIPE);
1448 static int setup_stdin_handler(struct ctdb_context *ctdb)
1450 struct tevent_fd *fde;
1451 struct stat st;
1452 int ret;
1454 ret = fstat(STDIN_FILENO, &st);
1455 if (ret != 0) {
1456 /* Problem with stdin, ignore... */
1457 DBG_INFO("Can't fstat() stdin\n");
1458 return 0;
1461 if (!S_ISFIFO(st.st_mode)) {
1462 DBG_INFO("Not a pipe...\n");
1463 return 0;
1466 fde = tevent_add_fd(ctdb->ev,
1467 ctdb,
1468 STDIN_FILENO,
1469 TEVENT_FD_READ,
1470 stdin_handler,
1471 ctdb);
1472 if (fde == NULL) {
1473 return ENOMEM;
1476 DBG_INFO("Set up stdin handler\n");
1477 return 0;
1480 static void fork_only(void)
1482 pid_t pid;
1484 pid = fork();
1485 if (pid == -1) {
1486 D_ERR("Fork failed (errno=%d)\n", errno);
1487 exit(1);
1490 if (pid != 0) {
1491 /* Parent simply exits... */
1492 exit(0);
/* SIGHUP hook: forward the signal to the recovery daemon (if one is
 * running) and reopen the log files */
static void sighup_hook(void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type_abort(private_data,
							  struct ctdb_context);

	if (ctdb->recoverd_pid > 0) {
		kill(ctdb->recoverd_pid, SIGHUP);
	}
	ctdb_event_reopen_logs(ctdb);
}
/*
  start the protocol going as a daemon

  Performs the full, strictly ordered startup sequence: daemonise,
  bind the client socket, set up event/signal handling, run the
  "init" event, initialise the transport, freeze the databases and
  finally enter the event loop.  Most failures are fatal (exit).
 */
int ctdb_start_daemon(struct ctdb_context *ctdb,
		      bool interactive,
		      bool test_mode_enabled)
{
	bool status;
	int ret;
	struct tevent_fd *fde;

	/* Fork if not interactive */
	if (!interactive) {
		if (test_mode_enabled) {
			/* Keep stdin open */
			fork_only();
		} else {
			/* Fork, close stdin, start a session */
			become_daemon(true, false, false);
		}
	}

	ignore_signal(SIGPIPE);
	ignore_signal(SIGUSR1);

	ctdb->ctdbd_pid = getpid();
	DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
			  SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
	ctdb_create_pidfile(ctdb);

	/* create a unix domain stream socket to listen to */
	ret = ux_socket_bind(ctdb, test_mode_enabled);
	if (ret != 0) {
		D_ERR("Cannot continue. Exiting!\n");
		exit(10);
	}

	/* Make sure we log something when the daemon terminates.
	 * This must be the first exit handler to run (so the last to
	 * be registered).
	 */
	__ctdbd_pid = getpid();
	atexit(print_exit_message);

	if (ctdb->do_setsched) {
		/* try to set us up as realtime */
		if (!set_scheduler()) {
			exit(1);
		}
		DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
	}

	ctdb->ev = tevent_context_init(NULL);
	if (ctdb->ev == NULL) {
		DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
		exit(1);
	}
	tevent_loop_allow_nesting(ctdb->ev);
	ctdb_tevent_trace_init();
	tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);

	/* Reopen logs (and poke recoverd) on SIGHUP */
	status = logging_setup_sighup_handler(ctdb->ev,
					      ctdb,
					      sighup_hook,
					      ctdb);
	if (!status) {
		D_ERR("Failed to set up signal handler for SIGHUP\n");
		exit(1);
	}

	/* set up a handler to pick up sigchld */
	if (ctdb_init_sigchld(ctdb) == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
		exit(1);
	}

	if (!interactive) {
		ctdb_set_child_logging(ctdb);
	}

	/* Exit if stdin is closed */
	if (test_mode_enabled) {
		ret = setup_stdin_handler(ctdb);
		if (ret != 0) {
			DBG_ERR("Failed to setup stdin handler\n");
			exit(1);
		}
	}

	TALLOC_FREE(ctdb->srv);
	if (srvid_init(ctdb, &ctdb->srv) != 0) {
		DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
		exit(1);
	}

	TALLOC_FREE(ctdb->tunnels);
	if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
		exit(1);
	}

	/* initialize statistics collection */
	ctdb_statistics_init(ctdb);

	/* force initial recovery for election */
	ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;

	if (ctdb_start_eventd(ctdb) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
		exit(1);
	}

	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
	ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
	if (ret != 0) {
		ctdb_die(ctdb, "Failed to run init event\n");
	}
	ctdb_run_notification_script(ctdb, "init");

	if (strcmp(ctdb->transport, "tcp") == 0) {
		ret = ctdb_tcp_init(ctdb);
	}
#ifdef USE_INFINIBAND
	if (strcmp(ctdb->transport, "ib") == 0) {
		ret = ctdb_ibw_init(ctdb);
	}
#endif
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
		return -1;
	}

	if (ctdb->methods == NULL) {
		DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
		ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
	}

	/* Initialise the transport. This sets the node address if it
	 * was not set via the command-line. */
	if (ctdb->methods->initialise(ctdb) != 0) {
		ctdb_fatal(ctdb, "transport failed to initialise");
	}

	ctdb_set_my_pnn(ctdb);

	initialise_node_flags(ctdb);

	ret = ctdb_set_public_addresses(ctdb);
	if (ret == -1) {
		D_ERR("Unable to setup public IP addresses\n");
		exit(1);
	}

	ctdb_initialise_vnn_map(ctdb);

	/* attach to existing databases */
	if (ctdb_attach_databases(ctdb) != 0) {
		ctdb_fatal(ctdb, "Failed to attach to databases\n");
	}

	/* start frozen, then let the first election sort things out */
	if (!ctdb_blocking_freeze(ctdb)) {
		ctdb_fatal(ctdb, "Failed to get initial freeze\n");
	}

	/* now start accepting clients, only can do this once frozen */
	fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
			    ctdb_accept_client, ctdb);
	if (fde == NULL) {
		ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
	}
	tevent_fd_set_auto_close(fde);

	/* Start the transport */
	if (ctdb->methods->start(ctdb) != 0) {
		DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
		ctdb_fatal(ctdb, "transport failed to start");
	}

	/* Recovery daemon and timed events are started from the
	 * callback, only after the setup event completes
	 * successfully.
	 */
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
	ret = ctdb_event_script_callback(ctdb,
					 ctdb,
					 ctdb_setup_event_callback,
					 ctdb,
					 CTDB_EVENT_SETUP,
					 "%s",
					 "");
	if (ret != 0) {
		DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
		exit(1);
	}

	lockdown_memory(ctdb->valgrinding);

	/* go into a wait loop to allow other nodes to complete */
	tevent_loop_wait(ctdb->ev);

	DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
	exit(1);
}
1713 allocate a packet for use in daemon<->daemon communication
1715 struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
1716 TALLOC_CTX *mem_ctx,
1717 enum ctdb_operation operation,
1718 size_t length, size_t slength,
1719 const char *type)
1721 int size;
1722 struct ctdb_req_header *hdr;
1724 length = MAX(length, slength);
1725 size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
1727 if (ctdb->methods == NULL) {
1728 DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
1729 operation, (unsigned)length));
1730 return NULL;
1733 hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
1734 if (hdr == NULL) {
1735 DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
1736 operation, (unsigned)length));
1737 return NULL;
1739 talloc_set_name_const(hdr, type);
1740 memset(hdr, 0, slength);
1741 hdr->length = length;
1742 hdr->operation = operation;
1743 hdr->ctdb_magic = CTDB_MAGIC;
1744 hdr->ctdb_version = CTDB_PROTOCOL;
1745 hdr->generation = ctdb->vnn_map->generation;
1746 hdr->srcnode = ctdb->pnn;
1748 return hdr;
/*
 * State for a control being forwarded on behalf of a local client.
 * While the destination node is known, the state is linked into that
 * node's pending_controls list so outstanding controls can be failed
 * if the node disconnects (see ctdb_daemon_cancel_controls()).
 */
struct daemon_control_state {
	struct daemon_control_state *next, *prev;
	struct ctdb_client *client;	/* client awaiting the reply */
	struct ctdb_req_control_old *c;	/* original request (owned by state) */
	uint32_t reqid;			/* client's request id, echoed in reply */
	struct ctdb_node *node;		/* destination node, or NULL */
};
/*
  callback when a control reply comes in

  Builds a CTDB_REPLY_CONTROL packet carrying status, reply data and
  (optionally) an error string, and queues it to the waiting client.
 */
static void daemon_control_callback(struct ctdb_context *ctdb,
				    int32_t status, TDB_DATA data,
				    const char *errormsg,
				    void *private_data)
{
	struct daemon_control_state *state = talloc_get_type(private_data,
					     struct daemon_control_state);
	struct ctdb_client *client = state->client;
	struct ctdb_reply_control_old *r;
	size_t len;
	int ret;

	/* construct a message to send to the client containing the data */
	len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
	if (errormsg) {
		len += strlen(errormsg);
	}
	/* r is allocated as a child of state, so freeing state below
	 * releases it too */
	r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
			       struct ctdb_reply_control_old);
	CTDB_NO_MEMORY_VOID(ctdb, r);

	r->hdr.reqid = state->reqid;
	r->status = status;
	r->datalen = data.dsize;
	r->errorlen = 0;
	memcpy(&r->data[0], data.dptr, data.dsize);
	if (errormsg) {
		/* the error string is appended after the reply data,
		 * without a NUL terminator; the receiver uses errorlen */
		r->errorlen = strlen(errormsg);
		memcpy(&r->data[r->datalen], errormsg, r->errorlen);
	}

	ret = daemon_queue_send(client, &r->hdr);
	if (ret != -1) {
		/* reply queued, state is no longer needed.
		 * NOTE(review): on send failure state is kept and is
		 * presumably released only when the client goes away -
		 * confirm this is intentional. */
		talloc_free(state);
	}
}
1800 fail all pending controls to a disconnected node
1802 void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
1804 struct daemon_control_state *state;
1805 while ((state = node->pending_controls)) {
1806 DLIST_REMOVE(node->pending_controls, state);
1807 daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
1808 "node is disconnected", state);
1813 destroy a daemon_control_state
1815 static int daemon_control_destructor(struct daemon_control_state *state)
1817 if (state->node) {
1818 DLIST_REMOVE(state->node->pending_controls, state);
1820 return 0;
1824 this is called when the ctdb daemon received a ctdb request control
1825 from a local client over the unix domain socket
1827 static void daemon_request_control_from_client(struct ctdb_client *client,
1828 struct ctdb_req_control_old *c)
1830 TDB_DATA data;
1831 int res;
1832 struct daemon_control_state *state;
1833 TALLOC_CTX *tmp_ctx = talloc_new(client);
1835 if (c->hdr.destnode == CTDB_CURRENT_NODE) {
1836 c->hdr.destnode = client->ctdb->pnn;
1839 state = talloc(client, struct daemon_control_state);
1840 CTDB_NO_MEMORY_VOID(client->ctdb, state);
1842 state->client = client;
1843 state->c = talloc_steal(state, c);
1844 state->reqid = c->hdr.reqid;
1845 if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1846 state->node = client->ctdb->nodes[c->hdr.destnode];
1847 DLIST_ADD(state->node->pending_controls, state);
1848 } else {
1849 state->node = NULL;
1852 talloc_set_destructor(state, daemon_control_destructor);
1854 if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
1855 talloc_steal(tmp_ctx, state);
1858 data.dptr = &c->data[0];
1859 data.dsize = c->datalen;
1860 res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
1861 c->srvid, c->opcode, client->client_id,
1862 c->flags,
1863 data, daemon_control_callback,
1864 state);
1865 if (res != 0) {
1866 DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
1867 c->hdr.destnode));
1870 talloc_free(tmp_ctx);
1873 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
1874 struct ctdb_req_tunnel_old *c)
1876 TDB_DATA data;
1877 int ret;
1879 if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1880 DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
1881 c->hdr.destnode));
1882 return;
1885 ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
1886 if (ret != 0) {
1887 DEBUG(DEBUG_ERR,
1888 ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
1889 c->tunnel_id));
1890 return;
1893 data = (TDB_DATA) {
1894 .dsize = c->datalen,
1895 .dptr = &c->data[0],
1898 ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
1899 c->tunnel_id, c->flags, data);
1900 if (ret != 0) {
1901 DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
1902 c->hdr.destnode));
1907 register a call function
1909 int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
1910 ctdb_fn_t fn, int id)
1912 struct ctdb_registered_call *call;
1913 struct ctdb_db_context *ctdb_db;
1915 ctdb_db = find_ctdb_db(ctdb, db_id);
1916 if (ctdb_db == NULL) {
1917 return -1;
1920 call = talloc(ctdb_db, struct ctdb_registered_call);
1921 call->fn = fn;
1922 call->id = id;
1924 DLIST_ADD(ctdb_db->calls, call);
1925 return 0;
/*
  this local messaging handler is ugly, but is needed to prevent
  recursion in ctdb_send_message() when the destination node is the
  same as the source node
 */
struct ctdb_local_message {
	struct ctdb_context *ctdb;	/* daemon context */
	uint64_t srvid;			/* destination server id */
	TDB_DATA data;			/* private copy of the payload */
};
1941 static void ctdb_local_message_trigger(struct tevent_context *ev,
1942 struct tevent_timer *te,
1943 struct timeval t, void *private_data)
1945 struct ctdb_local_message *m = talloc_get_type(
1946 private_data, struct ctdb_local_message);
1948 srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
1949 talloc_free(m);
1952 static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
1954 struct ctdb_local_message *m;
1955 m = talloc(ctdb, struct ctdb_local_message);
1956 CTDB_NO_MEMORY(ctdb, m);
1958 m->ctdb = ctdb;
1959 m->srvid = srvid;
1960 m->data = data;
1961 m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
1962 if (m->data.dptr == NULL) {
1963 talloc_free(m);
1964 return -1;
1967 /* this needs to be done as an event to prevent recursion */
1968 tevent_add_timer(ctdb->ev, m, timeval_zero(),
1969 ctdb_local_message_trigger, m);
1970 return 0;
/*
  send a ctdb message

  Messages addressed to this node are short-circuited through
  ctdb_local_message(); everything else goes out as a
  CTDB_REQ_MESSAGE packet via the transport.  Returns 0 on success,
  -1 if the transport is down or allocation fails.
 */
int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
			     uint64_t srvid, TDB_DATA data)
{
	struct ctdb_req_message_old *r;
	int len;

	if (ctdb->methods == NULL) {
		DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
		return -1;
	}

	/* see if this is a message to ourselves */
	if (pnn == ctdb->pnn) {
		return ctdb_local_message(ctdb, srvid, data);
	}

	len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
	r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
				    struct ctdb_req_message_old);
	CTDB_NO_MEMORY(ctdb, r);

	r->hdr.destnode = pnn;
	r->srvid = srvid;
	r->datalen = data.dsize;
	memcpy(&r->data[0], data.dptr, data.dsize);

	/* NOTE(review): r is freed right after queueing - presumably
	 * ctdb_queue_packet() copies or takes over the packet;
	 * confirm before relying on r after this call */
	ctdb_queue_packet(ctdb, &r->hdr);

	talloc_free(r);
	return 0;
}
/*
 * A "deregistration notification" registered by a client: when the
 * client goes away, a message with the stored srvid/data is broadcast
 * to all connected nodes (see ctdb_client_notify_destructor()).
 */
struct ctdb_client_notify_list {
	struct ctdb_client_notify_list *next, *prev;
	struct ctdb_context *ctdb;	/* daemon context */
	uint64_t srvid;			/* srvid the message is sent to */
	TDB_DATA data;			/* payload, owned by this entry */
};
2018 static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
2020 int ret;
2022 DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
2024 ret = ctdb_daemon_send_message(nl->ctdb,
2025 CTDB_BROADCAST_CONNECTED,
2026 nl->srvid,
2027 nl->data);
2028 if (ret != 0) {
2029 DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
2032 return 0;
2035 int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
2037 struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
2038 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2039 struct ctdb_client_notify_list *nl;
2041 DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
2043 if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
2044 DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
2045 return -1;
2048 if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
2049 DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
2050 return -1;
2054 if (client == NULL) {
2055 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
2056 return -1;
2059 for(nl=client->notify; nl; nl=nl->next) {
2060 if (nl->srvid == notify->srvid) {
2061 break;
2064 if (nl != NULL) {
2065 DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
2066 return -1;
2069 nl = talloc(client, struct ctdb_client_notify_list);
2070 CTDB_NO_MEMORY(ctdb, nl);
2071 nl->ctdb = ctdb;
2072 nl->srvid = notify->srvid;
2073 nl->data.dsize = notify->len;
2074 nl->data.dptr = talloc_memdup(nl, notify->notify_data,
2075 nl->data.dsize);
2076 CTDB_NO_MEMORY(ctdb, nl->data.dptr);
2078 DLIST_ADD(client->notify, nl);
2079 talloc_set_destructor(nl, ctdb_client_notify_destructor);
2081 return 0;
2084 int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
2086 uint64_t srvid = *(uint64_t *)indata.dptr;
2087 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2088 struct ctdb_client_notify_list *nl;
2090 DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
2092 if (client == NULL) {
2093 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
2094 return -1;
2097 for(nl=client->notify; nl; nl=nl->next) {
2098 if (nl->srvid == srvid) {
2099 break;
2102 if (nl == NULL) {
2103 DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
2104 return -1;
2107 DLIST_REMOVE(client->notify, nl);
2108 talloc_set_destructor(nl, NULL);
2109 talloc_free(nl);
2111 return 0;
2114 struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
2116 struct ctdb_client_pid_list *client_pid;
2118 for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
2119 if (client_pid->pid == pid) {
2120 return client_pid->client;
2123 return NULL;
2127 /* This control is used by samba when probing if a process (of a samba daemon)
2128 exists on the node.
2129 Samba does this when it needs/wants to check if a subrecord in one of the
2130 databases is still valid, or if it is stale and can be removed.
2131 If the node is in unhealthy or stopped state we just kill of the samba
2132 process holding this sub-record and return to the calling samba that
2133 the process does not exist.
2134 This allows us to forcefully recall subrecords registered by samba processes
2135 on banned and stopped nodes.
2137 int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
2139 struct ctdb_client *client;
2141 client = ctdb_find_client_by_pid(ctdb, pid);
2142 if (client == NULL) {
2143 return -1;
2146 if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
2147 DEBUG(DEBUG_NOTICE,
2148 ("Killing client with pid:%d on banned/stopped node\n",
2149 (int)pid));
2150 talloc_free(client);
2151 return -1;
2154 return kill(pid, 0);
2157 int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
2158 TDB_DATA indata)
2160 struct ctdb_client_pid_list *client_pid;
2161 pid_t pid;
2162 uint64_t srvid;
2163 int ret;
2165 pid = *(pid_t *)indata.dptr;
2166 srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
2168 for (client_pid = ctdb->client_pids;
2169 client_pid != NULL;
2170 client_pid = client_pid->next) {
2171 if (client_pid->pid == pid) {
2172 ret = srvid_exists(ctdb->srv, srvid,
2173 client_pid->client);
2174 if (ret == 0) {
2175 return 0;
2180 return -1;
2183 int ctdb_control_getnodesfile(struct ctdb_context *ctdb,
2184 uint32_t opcode,
2185 TDB_DATA indata,
2186 TDB_DATA *outdata)
2188 struct ctdb_node_map *node_map = NULL;
2189 size_t len;
2190 uint8_t *buf = NULL;
2191 size_t npush = 0;
2192 int ret = -1;
2194 CHECK_CONTROL_DATA_SIZE(0);
2196 node_map = ctdb_read_nodes(ctdb, ctdb->nodes_source);
2197 if (node_map == NULL) {
2198 D_ERR("Failed to read nodes file\n");
2199 return -1;
2202 len = ctdb_node_map_len(node_map);
2203 buf = talloc_size(ctdb, len);
2204 if (buf == NULL) {
2205 goto done;
2208 ctdb_node_map_push(node_map, buf, &npush);
2209 if (len != npush) {
2210 talloc_free(buf);
2211 goto done;
2214 outdata->dptr = buf;
2215 outdata->dsize = len;
2216 ret = 0;
2217 done:
2218 talloc_free(node_map);
2219 return ret;
/*
 * Orderly daemon shutdown: stop the recovery daemon, keepalives and
 * monitoring, run the "shutdown" event, stop eventd, shut down the
 * transport and exit with the given code.  Re-entry while already
 * shutting down is ignored.  This function does not return.
 */
void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
{
	if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
		DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
		return;
	}

	DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
	ctdb_stop_recoverd(ctdb);
	ctdb_stop_keepalive(ctdb);
	ctdb_stop_monitoring(ctdb);
	/* the "shutdown" event must run before eventd itself is stopped */
	ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
	ctdb_stop_eventd(ctdb);
	if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
		ctdb->methods->shutdown(ctdb);
	}

	DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
	exit(exit_code);
}
/* When forking the main daemon and the child process needs to connect
 * back to the daemon as a client process, this function can be used
 * to change the ctdb context from daemon into client mode.  The child
 * process must be created using ctdb_fork() and not fork() -
 * ctdb_fork() does some necessary housekeeping.
 *
 * Returns 0 on success, -1 if the client connection fails.  A failed
 * tevent context allocation exits the process.
 */
int switch_from_server_to_client(struct ctdb_context *ctdb)
{
	int ret;

	/* the listening socket belongs to the parent daemon; a client
	 * must not hold it open */
	if (ctdb->daemon.sd != -1) {
		close(ctdb->daemon.sd);
		ctdb->daemon.sd = -1;
	}

	/* get a new event context */
	ctdb->ev = tevent_context_init(ctdb);
	if (ctdb->ev == NULL) {
		DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
		exit(1);
	}
	tevent_loop_allow_nesting(ctdb->ev);

	/* Connect to main CTDB daemon */
	ret = ctdb_socket_connect(ctdb);
	if (ret != 0) {
		DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
		return -1;
	}

	/* as a client we are now allowed to issue controls */
	ctdb->can_send_controls = true;

	return 0;
}