fs/dlm/recoverd.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /******************************************************************************
   3 *******************************************************************************
   4 **
   5 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
   6 **  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
   7 **
   8 **
   9 *******************************************************************************
  10 ******************************************************************************/
  11
  12 #include "dlm_internal.h"
  13 #include "lockspace.h"
  14 #include "member.h"
  15 #include "dir.h"
  16 #include "ast.h"
  17 #include "recover.h"
  18 #include "lowcomms.h"
  19 #include "lock.h"
  20 #include "requestqueue.h"
  21 #include "recoverd.h"
  22
  23 static int dlm_create_masters_list(struct dlm_ls *ls)
  24 {
  25         struct dlm_rsb *r;
  26         int error = 0;
  27
  28         write_lock_bh(&ls->ls_masters_lock);
  29         if (!list_empty(&ls->ls_masters_list)) {
  30                 log_error(ls, "root list not empty");
  31                 error = -EINVAL;
  32                 goto out;
  33         }
  34
  35         read_lock_bh(&ls->ls_rsbtbl_lock);
  36         list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
  37                 if (r->res_nodeid)
  38                         continue;
  39
  40                 list_add(&r->res_masters_list, &ls->ls_masters_list);
  41                 dlm_hold_rsb(r);
  42         }
  43         read_unlock_bh(&ls->ls_rsbtbl_lock);
  44  out:
  45         write_unlock_bh(&ls->ls_masters_lock);
  46         return error;
  47 }
  48
  49 static void dlm_release_masters_list(struct dlm_ls *ls)
  50 {
  51         struct dlm_rsb *r, *safe;
  52
  53         write_lock_bh(&ls->ls_masters_lock);
  54         list_for_each_entry_safe(r, safe, &ls->ls_masters_list, res_masters_list) {
  55                 list_del_init(&r->res_masters_list);
  56                 dlm_put_rsb(r);
  57         }
  58         write_unlock_bh(&ls->ls_masters_lock);
  59 }
  60
  61 static void dlm_create_root_list(struct dlm_ls *ls, struct list_head *root_list)
  62 {
  63         struct dlm_rsb *r;
  64
  65         read_lock_bh(&ls->ls_rsbtbl_lock);
  66         list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
  67                 list_add(&r->res_root_list, root_list);
  68                 dlm_hold_rsb(r);
  69         }
  70
  71         WARN_ON_ONCE(!list_empty(&ls->ls_slow_inactive));
  72         read_unlock_bh(&ls->ls_rsbtbl_lock);
  73 }
  74
  75 static void dlm_release_root_list(struct list_head *root_list)
  76 {
  77         struct dlm_rsb *r, *safe;
  78
  79         list_for_each_entry_safe(r, safe, root_list, res_root_list) {
  80                 list_del_init(&r->res_root_list);
  81                 dlm_put_rsb(r);
  82         }
  83 }
  84
  85 /* If the start for which we're re-enabling locking (seq) has been superseded
  86    by a newer stop (ls_recover_seq), we need to leave locking disabled.
  87
  88    We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
  89    locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
  90    enables locking and clears the requestqueue between a and b. */
  91
  92 static int enable_locking(struct dlm_ls *ls, uint64_t seq)
  93 {
  94         int error = -EINTR;
  95
  96         write_lock_bh(&ls->ls_recv_active);
  97
  98         spin_lock_bh(&ls->ls_recover_lock);
  99         if (ls->ls_recover_seq == seq) {
 100                 set_bit(LSFL_RUNNING, &ls->ls_flags);
 101                 /* Schedule next timer if recovery put something on inactive.
 102                  *
 103                  * The rsbs that was queued while recovery on toss hasn't
 104                  * started yet because LSFL_RUNNING was set everything
 105                  * else recovery hasn't started as well because ls_in_recovery
 106                  * is still hold. So we should not run into the case that
 107                  * resume_scan_timer() queues a timer that can occur in
 108                  * a no op.
 109                  */
 110                 resume_scan_timer(ls);
 111                 /* unblocks processes waiting to enter the dlm */
 112                 up_write(&ls->ls_in_recovery);
 113                 clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
 114                 error = 0;
 115         }
 116         spin_unlock_bh(&ls->ls_recover_lock);
 117
 118         write_unlock_bh(&ls->ls_recv_active);
 119         return error;
 120 }
 121
 122 static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 123 {
 124         LIST_HEAD(root_list);
 125         unsigned long start;
 126         int error, neg = 0;
 127
 128         log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
 129
 130         mutex_lock(&ls->ls_recoverd_active);
 131
 132         dlm_callback_suspend(ls);
 133
 134         dlm_clear_inactive(ls);
 135
 136         /*
 137          * This list of root rsb's will be the basis of most of the recovery
 138          * routines.
 139          */
 140
 141         dlm_create_root_list(ls, &root_list);
 142
 143         /*
 144          * Add or remove nodes from the lockspace's ls_nodes list.
 145          *
 146          * Due to the fact that we must report all membership changes to lsops
 147          * or midcomms layer, it is not permitted to abort ls_recover() until
 148          * this is done.
 149          */
 150
 151         error = dlm_recover_members(ls, rv, &neg);
 152         if (error) {
 153                 log_rinfo(ls, "dlm_recover_members error %d", error);
 154                 goto fail_root_list;
 155         }
 156
 157         dlm_recover_dir_nodeid(ls, &root_list);
 158
 159         /* Create a snapshot of all active rsbs were we are the master of.
 160          * During the barrier between dlm_recover_members_wait() and
 161          * dlm_recover_directory() other nodes can dump their necessary
 162          * directory dlm_rsb (r->res_dir_nodeid == nodeid) in rcom
 163          * communication dlm_copy_master_names() handling.
 164          *
 165          * TODO We should create a per lockspace list that contains rsbs
 166          * that we are the master of. Instead of creating this list while
 167          * recovery we keep track of those rsbs while locking handling and
 168          * recovery can use it when necessary.
 169          */
 170         error = dlm_create_masters_list(ls);
 171         if (error) {
 172                 log_rinfo(ls, "dlm_create_masters_list error %d", error);
 173                 goto fail_root_list;
 174         }
 175
 176         ls->ls_recover_locks_in = 0;
 177
 178         dlm_set_recover_status(ls, DLM_RS_NODES);
 179
 180         error = dlm_recover_members_wait(ls, rv->seq);
 181         if (error) {
 182                 log_rinfo(ls, "dlm_recover_members_wait error %d", error);
 183                 dlm_release_masters_list(ls);
 184                 goto fail_root_list;
 185         }
 186
 187         start = jiffies;
 188
 189         /*
 190          * Rebuild our own share of the directory by collecting from all other
 191          * nodes their master rsb names that hash to us.
 192          */
 193
 194         error = dlm_recover_directory(ls, rv->seq);
 195         if (error) {
 196                 log_rinfo(ls, "dlm_recover_directory error %d", error);
 197                 dlm_release_masters_list(ls);
 198                 goto fail_root_list;
 199         }
 200
 201         dlm_set_recover_status(ls, DLM_RS_DIR);
 202
 203         error = dlm_recover_directory_wait(ls, rv->seq);
 204         if (error) {
 205                 log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
 206                 dlm_release_masters_list(ls);
 207                 goto fail_root_list;
 208         }
 209
 210         dlm_release_masters_list(ls);
 211
 212         /*
 213          * We may have outstanding operations that are waiting for a reply from
 214          * a failed node.  Mark these to be resent after recovery.  Unlock and
 215          * cancel ops can just be completed.
 216          */
 217
 218         dlm_recover_waiters_pre(ls);
 219
 220         if (dlm_recovery_stopped(ls)) {
 221                 error = -EINTR;
 222                 goto fail_root_list;
 223         }
 224
 225         if (neg || dlm_no_directory(ls)) {
 226                 /*
 227                  * Clear lkb's for departed nodes.
 228                  */
 229
 230                 dlm_recover_purge(ls, &root_list);
 231
 232                 /*
 233                  * Get new master nodeid's for rsb's that were mastered on
 234                  * departed nodes.
 235                  */
 236
 237                 error = dlm_recover_masters(ls, rv->seq, &root_list);
 238                 if (error) {
 239                         log_rinfo(ls, "dlm_recover_masters error %d", error);
 240                         goto fail_root_list;
 241                 }
 242
 243                 /*
 244                  * Send our locks on remastered rsb's to the new masters.
 245                  */
 246
 247                 error = dlm_recover_locks(ls, rv->seq, &root_list);
 248                 if (error) {
 249                         log_rinfo(ls, "dlm_recover_locks error %d", error);
 250                         goto fail_root_list;
 251                 }
 252
 253                 dlm_set_recover_status(ls, DLM_RS_LOCKS);
 254
 255                 error = dlm_recover_locks_wait(ls, rv->seq);
 256                 if (error) {
 257                         log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
 258                         goto fail_root_list;
 259                 }
 260
 261                 log_rinfo(ls, "dlm_recover_locks %u in",
 262                           ls->ls_recover_locks_in);
 263
 264                 /*
 265                  * Finalize state in master rsb's now that all locks can be
 266                  * checked.  This includes conversion resolution and lvb
 267                  * settings.
 268                  */
 269
 270                 dlm_recover_rsbs(ls, &root_list);
 271         } else {
 272                 /*
 273                  * Other lockspace members may be going through the "neg" steps
 274                  * while also adding us to the lockspace, in which case they'll
 275                  * be doing the recover_locks (RS_LOCKS) barrier.
 276                  */
 277                 dlm_set_recover_status(ls, DLM_RS_LOCKS);
 278
 279                 error = dlm_recover_locks_wait(ls, rv->seq);
 280                 if (error) {
 281                         log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
 282                         goto fail_root_list;
 283                 }
 284         }
 285
 286         dlm_release_root_list(&root_list);
 287
 288         /*
 289          * Purge directory-related requests that are saved in requestqueue.
 290          * All dir requests from before recovery are invalid now due to the dir
 291          * rebuild and will be resent by the requesting nodes.
 292          */
 293
 294         dlm_purge_requestqueue(ls);
 295
 296         dlm_set_recover_status(ls, DLM_RS_DONE);
 297
 298         error = dlm_recover_done_wait(ls, rv->seq);
 299         if (error) {
 300                 log_rinfo(ls, "dlm_recover_done_wait error %d", error);
 301                 goto fail;
 302         }
 303
 304         dlm_clear_members_gone(ls);
 305
 306         dlm_callback_resume(ls);
 307
 308         error = enable_locking(ls, rv->seq);
 309         if (error) {
 310                 log_rinfo(ls, "enable_locking error %d", error);
 311                 goto fail;
 312         }
 313
 314         error = dlm_process_requestqueue(ls);
 315         if (error) {
 316                 log_rinfo(ls, "dlm_process_requestqueue error %d", error);
 317                 goto fail;
 318         }
 319
 320         error = dlm_recover_waiters_post(ls);
 321         if (error) {
 322                 log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
 323                 goto fail;
 324         }
 325
 326         dlm_recover_grant(ls);
 327
 328         log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
 329                   (unsigned long long)rv->seq, ls->ls_generation,
 330                   jiffies_to_msecs(jiffies - start));
 331         mutex_unlock(&ls->ls_recoverd_active);
 332
 333         return 0;
 334
 335  fail_root_list:
 336         dlm_release_root_list(&root_list);
 337  fail:
 338         mutex_unlock(&ls->ls_recoverd_active);
 339
 340         return error;
 341 }
 342
 343 /* The dlm_ls_start() that created the rv we take here may already have been
 344    stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
 345    flag set. */
 346
 347 static void do_ls_recovery(struct dlm_ls *ls)
 348 {
 349         struct dlm_recover *rv = NULL;
 350         int error;
 351
 352         spin_lock_bh(&ls->ls_recover_lock);
 353         rv = ls->ls_recover_args;
 354         ls->ls_recover_args = NULL;
 355         if (rv && ls->ls_recover_seq == rv->seq)
 356                 clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
 357         spin_unlock_bh(&ls->ls_recover_lock);
 358
 359         if (rv) {
 360                 error = ls_recover(ls, rv);
 361                 switch (error) {
 362                 case 0:
 363                         ls->ls_recovery_result = 0;
 364                         complete(&ls->ls_recovery_done);
 365
 366                         dlm_lsop_recover_done(ls);
 367                         break;
 368                 case -EINTR:
 369                         /* if recovery was interrupted -EINTR we wait for the next
 370                          * ls_recover() iteration until it hopefully succeeds.
 371                          */
 372                         log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
 373                                   __func__, (unsigned long long)rv->seq);
 374                         break;
 375                 default:
 376                         log_rinfo(ls, "%s %llu error %d", __func__,
 377                                   (unsigned long long)rv->seq, error);
 378
 379                         /* let new_lockspace() get aware of critical error */
 380                         ls->ls_recovery_result = error;
 381                         complete(&ls->ls_recovery_done);
 382                         break;
 383                 }
 384
 385                 kfree(rv->nodes);
 386                 kfree(rv);
 387         }
 388 }
 389
 390 static int dlm_recoverd(void *arg)
 391 {
 392         struct dlm_ls *ls;
 393
 394         ls = dlm_find_lockspace_local(arg);
 395         if (!ls) {
 396                 log_print("dlm_recoverd: no lockspace %p", arg);
 397                 return -1;
 398         }
 399
 400         down_write(&ls->ls_in_recovery);
 401         set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
 402         wake_up(&ls->ls_recover_lock_wait);
 403
 404         while (1) {
 405                 /*
 406                  * We call kthread_should_stop() after set_current_state().
 407                  * This is because it works correctly if kthread_stop() is
 408                  * called just before set_current_state().
 409                  */
 410                 set_current_state(TASK_INTERRUPTIBLE);
 411                 if (kthread_should_stop()) {
 412                         set_current_state(TASK_RUNNING);
 413                         break;
 414                 }
 415                 if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
 416                     !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
 417                         if (kthread_should_stop())
 418                                 break;
 419                         schedule();
 420                 }
 421                 set_current_state(TASK_RUNNING);
 422
 423                 if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
 424                         down_write(&ls->ls_in_recovery);
 425                         set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
 426                         wake_up(&ls->ls_recover_lock_wait);
 427                 }
 428
 429                 if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
 430                         do_ls_recovery(ls);
 431         }
 432
 433         if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
 434                 up_write(&ls->ls_in_recovery);
 435
 436         dlm_put_lockspace(ls);
 437         return 0;
 438 }
 439
 440 int dlm_recoverd_start(struct dlm_ls *ls)
 441 {
 442         struct task_struct *p;
 443         int error = 0;
 444
 445         p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
 446         if (IS_ERR(p))
 447                 error = PTR_ERR(p);
 448         else
 449                 ls->ls_recoverd_task = p;
 450         return error;
 451 }
 452
 453 void dlm_recoverd_stop(struct dlm_ls *ls)
 454 {
 455         kthread_stop(ls->ls_recoverd_task);
 456 }
 457
 458 void dlm_recoverd_suspend(struct dlm_ls *ls)
 459 {
 460         wake_up(&ls->ls_wait_general);
 461         mutex_lock(&ls->ls_recoverd_active);
 462 }
 463
 464 void dlm_recoverd_resume(struct dlm_ls *ls)
 465 {
 466         mutex_unlock(&ls->ls_recoverd_active);
 467 }
 468