lib/tdb/common/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
  31 {
  32         tdb->interrupt_sig_ptr = ptr;
  33 }
  34
  35 static int fcntl_lock(struct tdb_context *tdb,
  36                       int rw, off_t off, off_t len, bool waitflag)
  37 {
  38         struct flock fl;
  39         int cmd;
  40
  41 #ifdef USE_TDB_MUTEX_LOCKING
  42         {
  43                 int ret;
  44                 if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
  45                         return ret;
  46                 }
  47         }
  48 #endif
  49
  50         fl.l_type = rw;
  51         fl.l_whence = SEEK_SET;
  52         fl.l_start = off;
  53         fl.l_len = len;
  54         fl.l_pid = 0;
  55
  56         cmd = waitflag ? F_SETLKW : F_SETLK;
  57
  58         return fcntl(tdb->fd, cmd, &fl);
  59 }
  60
  61 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
  62 {
  63         struct flock fl;
  64 #if 0 /* Check they matched up locks and unlocks correctly. */
  65         char line[80];
  66         FILE *locks;
  67         bool found = false;
  68
  69         locks = fopen("/proc/locks", "r");
  70
  71         while (fgets(line, 80, locks)) {
  72                 char *p;
  73                 int type, start, l;
  74
  75                 /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
  76                 p = strchr(line, ':') + 1;
  77                 if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
  78                         continue;
  79                 p += strlen(" FLOCK  ADVISORY  ");
  80                 if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
  81                         type = F_RDLCK;
  82                 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
  83                         type = F_WRLCK;
  84                 else
  85                         abort();
  86                 p += 6;
  87                 if (atoi(p) != getpid())
  88                         continue;
  89                 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
  90                 start = atoi(p);
  91                 p = strchr(p, ' ') + 1;
  92                 if (strncmp(p, "EOF", 3) == 0)
  93                         l = 0;
  94                 else
  95                         l = atoi(p) - start + 1;
  96
  97                 if (off == start) {
  98                         if (len != l) {
  99                                 fprintf(stderr, "Len %u should be %u: %s",
 100                                         (int)len, l, line);
 101                                 abort();
 102                         }
 103                         if (type != rw) {
 104                                 fprintf(stderr, "Type %s wrong: %s",
 105                                         rw == F_RDLCK ? "READ" : "WRITE", line);
 106                                 abort();
 107                         }
 108                         found = true;
 109                         break;
 110                 }
 111         }
 112
 113         if (!found) {
 114                 fprintf(stderr, "Unlock on %u@%u not found!\n",
 115                         (int)off, (int)len);
 116                 abort();
 117         }
 118
 119         fclose(locks);
 120 #endif
 121
 122 #ifdef USE_TDB_MUTEX_LOCKING
 123         {
 124                 int ret;
 125                 if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
 126                         return ret;
 127                 }
 128         }
 129 #endif
 130
 131         fl.l_type = F_UNLCK;
 132         fl.l_whence = SEEK_SET;
 133         fl.l_start = off;
 134         fl.l_len = len;
 135         fl.l_pid = 0;
 136
 137         return fcntl(tdb->fd, F_SETLKW, &fl);
 138 }
 139
 140 /*
 141  * Calculate the lock offset for a list
 142  *
 143  * list -1 is the freelist, otherwise a hash chain.
 144  *
 145  * Note that we consistently (but without real reason) lock hash chains at an
 146  * offset that is 4 bytes below the real offset of the corresponding list head
 147  * in the db.
 148  *
 149  * This is the memory layout of the hashchain array:
 150  *
 151  * FREELIST_TOP + 0 = freelist
 152  * FREELIST_TOP + 4 = hashtable list 0
 153  * FREELIST_TOP + 8 = hashtable list 1
 154  * ...
 155  *
 156  * Otoh lock_offset computes:
 157  *
 158  * freelist = FREELIST_TOP - 4
 159  * list 0   = FREELIST_TOP + 0
 160  * list 1   = FREELIST_TOP + 4
 161  * ...
 162  *
 163  * Unfortunately we can't change this calculation in order to align the locking
 164  * offset with the memory layout, as that would make the locking incompatible
 165  * between different tdb versions.
 166  */
 167 static tdb_off_t lock_offset(int list)
 168 {
 169         return FREELIST_TOP + 4*list;
 170 }
 171
 172 /* a byte range locking function - return 0 on success
 173    this functions locks/unlocks "len" byte at the specified offset.
 174
 175    On error, errno is also set so that errors are passed back properly
 176    through tdb_open().
 177
 178    note that a len of zero means lock to end of file
 179 */
 180 int tdb_brlock(struct tdb_context *tdb,
 181                int rw_type, tdb_off_t offset, size_t len,
 182                enum tdb_lock_flags flags)
 183 {
 184         int ret;
 185
 186         if (tdb->flags & TDB_NOLOCK) {
 187                 return 0;
 188         }
 189
 190         if (flags & TDB_LOCK_MARK_ONLY) {
 191                 return 0;
 192         }
 193
 194         if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
 195                 tdb->ecode = TDB_ERR_RDONLY;
 196                 return -1;
 197         }
 198
 199         do {
 200                 ret = fcntl_lock(tdb, rw_type, offset, len,
 201                                  flags & TDB_LOCK_WAIT);
 202                 /* Check for a sigalarm break. */
 203                 if (ret == -1 && errno == EINTR &&
 204                                 tdb->interrupt_sig_ptr &&
 205                                 *tdb->interrupt_sig_ptr) {
 206                         break;
 207                 }
 208         } while (ret == -1 && errno == EINTR);
 209
 210         if (ret == -1) {
 211                 tdb->ecode = TDB_ERR_LOCK;
 212                 /* Generic lock error. errno set by fcntl.
 213                  * EAGAIN is an expected return from non-blocking
 214                  * locks. */
 215                 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 216                         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
 217                                  tdb->fd, offset, rw_type, flags, len));
 218                 }
 219                 return -1;
 220         }
 221         return 0;
 222 }
 223
 224 int tdb_brunlock(struct tdb_context *tdb,
 225                  int rw_type, tdb_off_t offset, size_t len)
 226 {
 227         int ret;
 228
 229         if (tdb->flags & TDB_NOLOCK) {
 230                 return 0;
 231         }
 232
 233         do {
 234                 ret = fcntl_unlock(tdb, rw_type, offset, len);
 235         } while (ret == -1 && errno == EINTR);
 236
 237         if (ret == -1) {
 238                 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
 239                          tdb->fd, offset, rw_type, len));
 240         }
 241         return ret;
 242 }
 243
 244 /*
 245  * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
 246  * conservative deadlock detection and claim a deadlock when progress can be
 247  * made. For those OSes we may loop for a while.
 248  */
 249
 250 static int tdb_brlock_retry(struct tdb_context *tdb,
 251                             int rw_type, tdb_off_t offset, size_t len,
 252                             enum tdb_lock_flags flags)
 253 {
 254         int count = 1000;
 255
 256         while (count--) {
 257                 struct timeval tv;
 258                 int ret;
 259
 260                 ret = tdb_brlock(tdb, rw_type, offset, len, flags);
 261                 if (ret == 0) {
 262                         return 0;
 263                 }
 264                 if (errno != EDEADLK) {
 265                         break;
 266                 }
 267                 /* sleep for as short a time as we can - more portable than usleep() */
 268                 tv.tv_sec = 0;
 269                 tv.tv_usec = 1;
 270                 select(0, NULL, NULL, NULL, &tv);
 271         }
 272         return -1;
 273 }
 274
 275 /*
 276   upgrade a read lock to a write lock.
 277 */
 278 int tdb_allrecord_upgrade(struct tdb_context *tdb)
 279 {
 280         int ret;
 281
 282         if (tdb->allrecord_lock.count != 1) {
 283                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 284                          "tdb_allrecord_upgrade failed: count %u too high\n",
 285                          tdb->allrecord_lock.count));
 286                 tdb->ecode = TDB_ERR_LOCK;
 287                 return -1;
 288         }
 289
 290         if (tdb->allrecord_lock.off != 1) {
 291                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 292                          "tdb_allrecord_upgrade failed: already upgraded?\n"));
 293                 tdb->ecode = TDB_ERR_LOCK;
 294                 return -1;
 295         }
 296
 297         if (tdb_have_mutexes(tdb)) {
 298                 ret = tdb_mutex_allrecord_upgrade(tdb);
 299                 if (ret == -1) {
 300                         goto fail;
 301                 }
 302                 ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
 303                                        0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 304                 if (ret == -1) {
 305                         tdb_mutex_allrecord_downgrade(tdb);
 306                 }
 307         } else {
 308                 ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
 309                                        TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 310         }
 311
 312         if (ret == 0) {
 313                 tdb->allrecord_lock.ltype = F_WRLCK;
 314                 tdb->allrecord_lock.off = 0;
 315                 return 0;
 316         }
 317 fail:
 318         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
 319         return -1;
 320 }
 321
 322 static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
 323                                            tdb_off_t offset)
 324 {
 325         int i;
 326
 327         for (i=0; i<tdb->num_lockrecs; i++) {
 328                 if (tdb->lockrecs[i].off == offset) {
 329                         return &tdb->lockrecs[i];
 330                 }
 331         }
 332         return NULL;
 333 }
 334
 335 /* lock an offset in the database. */
 336 int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
 337                   enum tdb_lock_flags flags)
 338 {
 339         struct tdb_lock_type *new_lck;
 340
 341         if (offset >= lock_offset(tdb->hash_size)) {
 342                 tdb->ecode = TDB_ERR_LOCK;
 343                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
 344                          offset, ltype));
 345                 return -1;
 346         }
 347         if (tdb->flags & TDB_NOLOCK)
 348                 return 0;
 349
 350         new_lck = find_nestlock(tdb, offset);
 351         if (new_lck) {
 352                 if ((new_lck->ltype == F_RDLCK) && (ltype == F_WRLCK)) {
 353                         if (!tdb_have_mutexes(tdb)) {
 354                                 int ret;
 355                                 /*
 356                                  * Upgrade the underlying fcntl
 357                                  * lock. Mutexes don't do readlocks,
 358                                  * so this only applies to fcntl
 359                                  * locking.
 360                                  */
 361                                 ret = tdb_brlock(tdb, ltype, offset, 1, flags);
 362                                 if (ret != 0) {
 363                                         return ret;
 364                                 }
 365                         }
 366                         new_lck->ltype = F_WRLCK;
 367                 }
 368                 /*
 369                  * Just increment the in-memory struct, posix locks
 370                  * don't stack.
 371                  */
 372                 new_lck->count++;
 373                 return 0;
 374         }
 375
 376         if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
 377                 new_lck = (struct tdb_lock_type *)realloc(
 378                         tdb->lockrecs,
 379                         sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 380                 if (new_lck == NULL) {
 381                         errno = ENOMEM;
 382                         return -1;
 383                 }
 384                 tdb->lockrecs_array_length = tdb->num_lockrecs+1;
 385                 tdb->lockrecs = new_lck;
 386         }
 387
 388         /* Since fcntl locks don't nest, we do a lock for the first one,
 389            and simply bump the count for future ones */
 390         if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
 391                 return -1;
 392         }
 393
 394         new_lck = &tdb->lockrecs[tdb->num_lockrecs];
 395
 396         new_lck->off = offset;
 397         new_lck->count = 1;
 398         new_lck->ltype = ltype;
 399         tdb->num_lockrecs++;
 400
 401         return 0;
 402 }
 403
 404 static int tdb_lock_and_recover(struct tdb_context *tdb)
 405 {
 406         int ret;
 407
 408         /* We need to match locking order in transaction commit. */
 409         if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
 410                 return -1;
 411         }
 412
 413         if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
 414                 tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 415                 return -1;
 416         }
 417
 418         ret = tdb_transaction_recover(tdb);
 419
 420         tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
 421         tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 422
 423         return ret;
 424 }
 425
 426 static bool have_data_locks(const struct tdb_context *tdb)
 427 {
 428         int i;
 429
 430         for (i = 0; i < tdb->num_lockrecs; i++) {
 431                 if (tdb->lockrecs[i].off >= lock_offset(-1))
 432                         return true;
 433         }
 434         return false;
 435 }
 436
 437 /*
 438  * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
 439  * lock is strong enough.
 440  */
 441 static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
 442                                               int ltype)
 443 {
 444         if (ltype == F_RDLCK) {
 445                 /*
 446                  * The allrecord_lock is equal (F_RDLCK) or stronger
 447                  * (F_WRLCK). Pass.
 448                  */
 449                 return 0;
 450         }
 451
 452         if (tdb->allrecord_lock.ltype == F_RDLCK) {
 453                 /*
 454                  * We ask for ltype==F_WRLCK, but the allrecord_lock
 455                  * is too weak. We can't upgrade here, so fail.
 456                  */
 457                 tdb->ecode = TDB_ERR_LOCK;
 458                 return -1;
 459         }
 460
 461         /*
 462          * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
 463          */
 464         return 0;
 465 }
 466
 467 static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
 468                          enum tdb_lock_flags waitflag)
 469 {
 470         int ret;
 471         bool check = false;
 472
 473         if (tdb->allrecord_lock.count) {
 474                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 475         }
 476
 477         /*
 478          * Check for recoveries: Someone might have kill -9'ed a process
 479          * during a commit.
 480          */
 481         check = !have_data_locks(tdb);
 482         ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
 483
 484         if (ret == 0 && check && tdb_needs_recovery(tdb)) {
 485                 tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 486
 487                 if (tdb_lock_and_recover(tdb) == -1) {
 488                         return -1;
 489                 }
 490                 return tdb_lock_list(tdb, list, ltype, waitflag);
 491         }
 492         return ret;
 493 }
 494
 495 /* lock a list in the database. list -1 is the alloc list */
 496 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
 497 {
 498         int ret;
 499
 500         ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
 501         if (ret) {
 502                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
 503                          "ltype=%d (%s)\n",  list, ltype, strerror(errno)));
 504         }
 505         return ret;
 506 }
 507
 508 /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
 509 _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
 510 _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
 511 {
 512         return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
 513 }
 514
 515
 516 int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
 517                     bool mark_lock)
 518 {
 519         int ret = -1;
 520         struct tdb_lock_type *lck;
 521
 522         if (tdb->flags & TDB_NOLOCK)
 523                 return 0;
 524
 525         /* Sanity checks */
 526         if (offset >= lock_offset(tdb->hash_size)) {
 527                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
 528                 return ret;
 529         }
 530
 531         lck = find_nestlock(tdb, offset);
 532         if ((lck == NULL) || (lck->count == 0)) {
 533                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
 534                 return -1;
 535         }
 536
 537         if (lck->count > 1) {
 538                 lck->count--;
 539                 return 0;
 540         }
 541
 542         /*
 543          * This lock has count==1 left, so we need to unlock it in the
 544          * kernel. We don't bother with decrementing the in-memory array
 545          * element, we're about to overwrite it with the last array element
 546          * anyway.
 547          */
 548
 549         if (mark_lock) {
 550                 ret = 0;
 551         } else {
 552                 ret = tdb_brunlock(tdb, ltype, offset, 1);
 553         }
 554
 555         /*
 556          * Shrink the array by overwriting the element just unlocked with the
 557          * last array element.
 558          */
 559         *lck = tdb->lockrecs[--tdb->num_lockrecs];
 560
 561         /*
 562          * We don't bother with realloc when the array shrinks, but if we have
 563          * a completely idle tdb we should get rid of the locked array.
 564          */
 565
 566         if (ret)
 567                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
 568         return ret;
 569 }
 570
 571 _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
 572 _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
 573 {
 574         /* a global lock allows us to avoid per chain locks */
 575         if (tdb->allrecord_lock.count) {
 576                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 577         }
 578
 579         return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 580 }
 581
 582 /*
 583   get the transaction lock
 584  */
 585 int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
 586                          enum tdb_lock_flags lockflags)
 587 {
 588         return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
 589 }
 590
 591 /*
 592   release the transaction lock
 593  */
 594 int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
 595 {
 596         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
 597 }
 598
 599 /* Returns 0 if all done, -1 if error, 1 if ok. */
 600 static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
 601                                enum tdb_lock_flags flags, bool upgradable)
 602 {
 603         /* There are no locks on read-only dbs */
 604         if (tdb->read_only || tdb->traverse_read) {
 605                 tdb->ecode = TDB_ERR_LOCK;
 606                 return -1;
 607         }
 608
 609         if (tdb->allrecord_lock.count &&
 610             tdb->allrecord_lock.ltype == (uint32_t)ltype) {
 611                 tdb->allrecord_lock.count++;
 612                 return 0;
 613         }
 614
 615         if (tdb->allrecord_lock.count) {
 616                 /* a global lock of a different type exists */
 617                 tdb->ecode = TDB_ERR_LOCK;
 618                 return -1;
 619         }
 620
 621         if (tdb_have_extra_locks(tdb)) {
 622                 /* can't combine global and chain locks */
 623                 tdb->ecode = TDB_ERR_LOCK;
 624                 return -1;
 625         }
 626
 627         if (upgradable && ltype != F_RDLCK) {
 628                 /* tdb error: you can't upgrade a write lock! */
 629                 tdb->ecode = TDB_ERR_LOCK;
 630                 return -1;
 631         }
 632         return 1;
 633 }
 634
 635 /* We only need to lock individual bytes, but Linux merges consecutive locks
 636  * so we lock in contiguous ranges. */
 637 static int tdb_chainlock_gradual(struct tdb_context *tdb,
 638                                  int ltype, enum tdb_lock_flags flags,
 639                                  size_t off, size_t len)
 640 {
 641         int ret;
 642         enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
 643
 644         if (len <= 4) {
 645                 /* Single record.  Just do blocking lock. */
 646                 return tdb_brlock(tdb, ltype, off, len, flags);
 647         }
 648
 649         /* First we try non-blocking. */
 650         ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
 651         if (ret == 0) {
 652                 return 0;
 653         }
 654
 655         /* Try locking first half, then second. */
 656         ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
 657         if (ret == -1)
 658                 return -1;
 659
 660         ret = tdb_chainlock_gradual(tdb, ltype, flags,
 661                                     off + len / 2, len - len / 2);
 662         if (ret == -1) {
 663                 tdb_brunlock(tdb, ltype, off, len / 2);
 664                 return -1;
 665         }
 666         return 0;
 667 }
 668
 669 /* lock/unlock entire database.  It can only be upgradable if you have some
 670  * other way of guaranteeing exclusivity (ie. transaction write lock).
 671  * We do the locking gradually to avoid being starved by smaller locks. */
 672 int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 673                        enum tdb_lock_flags flags, bool upgradable)
 674 {
 675         int ret;
 676
 677         switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
 678         case -1:
 679                 return -1;
 680         case 0:
 681                 return 0;
 682         }
 683
 684         /* We cover two kinds of locks:
 685          * 1) Normal chain locks.  Taken for almost all operations.
 686          * 2) Individual records locks.  Taken after normal or free
 687          *    chain locks.
 688          *
 689          * It is (1) which cause the starvation problem, so we're only
 690          * gradual for that. */
 691
 692         if (tdb_have_mutexes(tdb)) {
 693                 ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
 694         } else {
 695                 ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
 696                                             tdb->hash_size * 4);
 697         }
 698
 699         if (ret == -1) {
 700                 return -1;
 701         }
 702
 703         /* Grab individual record locks. */
 704         if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
 705                        flags) == -1) {
 706                 if (tdb_have_mutexes(tdb)) {
 707                         tdb_mutex_allrecord_unlock(tdb);
 708                 } else {
 709                         tdb_brunlock(tdb, ltype, FREELIST_TOP,
 710                                      tdb->hash_size * 4);
 711                 }
 712                 return -1;
 713         }
 714
 715         tdb->allrecord_lock.count = 1;
 716         /* If it's upgradable, it's actually exclusive so we can treat
 717          * it as a write lock. */
 718         tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
 719         tdb->allrecord_lock.off = upgradable;
 720
 721         if (tdb_needs_recovery(tdb)) {
 722                 bool mark = flags & TDB_LOCK_MARK_ONLY;
 723                 tdb_allrecord_unlock(tdb, ltype, mark);
 724                 if (mark) {
 725                         tdb->ecode = TDB_ERR_LOCK;
 726                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 727                                  "tdb_lockall_mark cannot do recovery\n"));
 728                         return -1;
 729                 }
 730                 if (tdb_lock_and_recover(tdb) == -1) {
 731                         return -1;
 732                 }
 733                 return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
 734         }
 735
 736         return 0;
 737 }
 738
 739
 740
 741 /* unlock entire db */
 742 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
 743 {
 744         /* There are no locks on read-only dbs */
 745         if (tdb->read_only || tdb->traverse_read) {
 746                 tdb->ecode = TDB_ERR_LOCK;
 747                 return -1;
 748         }
 749
 750         if (tdb->allrecord_lock.count == 0) {
 751                 tdb->ecode = TDB_ERR_LOCK;
 752                 return -1;
 753         }
 754
 755         /* Upgradable locks are marked as write locks. */
 756         if (tdb->allrecord_lock.ltype != (uint32_t)ltype
 757             && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
 758                 tdb->ecode = TDB_ERR_LOCK;
 759                 return -1;
 760         }
 761
 762         if (tdb->allrecord_lock.count > 1) {
 763                 tdb->allrecord_lock.count--;
 764                 return 0;
 765         }
 766
 767         if (!mark_lock) {
 768                 int ret;
 769
 770                 if (tdb_have_mutexes(tdb)) {
 771                         ret = tdb_mutex_allrecord_unlock(tdb);
 772                         if (ret == 0) {
 773                                 ret = tdb_brunlock(tdb, ltype,
 774                                                    lock_offset(tdb->hash_size),
 775                                                    0);
 776                         }
 777                 } else {
 778                         ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
 779                 }
 780
 781                 if (ret != 0) {
 782                         TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
 783                                  "(%s)\n", strerror(errno)));
 784                         return -1;
 785                 }
 786         }
 787
 788         tdb->allrecord_lock.count = 0;
 789         tdb->allrecord_lock.ltype = 0;
 790
 791         return 0;
 792 }
 793
 794 /* lock entire database with write lock */
 795 _PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
 796 {
 797         tdb_trace(tdb, "tdb_lockall");
 798         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
 799 }
 800
 801 /* lock entire database with write lock - mark only */
 802 _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
 803 {
 804         tdb_trace(tdb, "tdb_lockall_mark");
 805         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
 806 }
 807
 808 /* unlock entire database with write lock - unmark only */
 809 _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
 810 {
 811         tdb_trace(tdb, "tdb_lockall_unmark");
 812         return tdb_allrecord_unlock(tdb, F_WRLCK, true);
 813 }
 814
 815 /* lock entire database with write lock - nonblocking variant */
 816 _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
 817 {
 818         int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
 819         tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
 820         return ret;
 821 }
 822
 823 /* unlock entire database with write lock */
 824 _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
 825 {
 826         tdb_trace(tdb, "tdb_unlockall");
 827         return tdb_allrecord_unlock(tdb, F_WRLCK, false);
 828 }
 829
 830 /* lock entire database with read lock */
 831 _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
 832 {
 833         tdb_trace(tdb, "tdb_lockall_read");
 834         return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
 835 }
 836
 837 /* lock entire database with read lock - nonblock variant */
 838 _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
 839 {
 840         int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
 841         tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
 842         return ret;
 843 }
 844
 845 /* unlock entire database with read lock */
 846 _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
 847 {
 848         tdb_trace(tdb, "tdb_unlockall_read");
 849         return tdb_allrecord_unlock(tdb, F_RDLCK, false);
 850 }
 851
 852 /* lock/unlock one hash chain. This is meant to be used to reduce
 853    contention - it cannot guarantee how many records will be locked */
 854 _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
 855 {
 856         int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 857         tdb_trace_1rec(tdb, "tdb_chainlock", key);
 858         return ret;
 859 }
 860
 861 /* lock/unlock one hash chain, non-blocking. This is meant to be used
 862    to reduce contention - it cannot guarantee how many records will be
 863    locked */
 864 _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
 865 {
 866         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 867         tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
 868         return ret;
 869 }
 870
 871 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
 872 _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
 873 {
 874         int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 875                                 F_WRLCK, TDB_LOCK_MARK_ONLY);
 876         tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
 877         return ret;
 878 }
 879
 880 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
 881 _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
 882 {
 883         tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
 884         return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 885                                F_WRLCK, true);
 886 }
 887
 888 _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
 889 {
 890         tdb_trace_1rec(tdb, "tdb_chainunlock", key);
 891         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 892 }
 893
 894 _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
 895 {
 896         int ret;
 897         ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 898         tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
 899         return ret;
 900 }
 901
 902 _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
 903 {
 904         tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
 905         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 906 }
 907
 908 _PUBLIC_ int tdb_chainlock_read_nonblock(struct tdb_context *tdb, TDB_DATA key)
 909 {
 910         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 911         tdb_trace_1rec_ret(tdb, "tdb_chainlock_read_nonblock", key, ret);
 912         return ret;
 913 }
 914
 915 /* record lock stops delete underneath */
 916 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
 917 {
 918         if (tdb->allrecord_lock.count) {
 919                 return 0;
 920         }
 921         return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
 922 }
 923
 924 /*
 925   Write locks override our own fcntl readlocks, so check it here.
 926   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 927   an error to fail to get the lock here.
 928 */
 929 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
 930 {
 931         struct tdb_traverse_lock *i;
 932         if (tdb == NULL) {
 933                 return -1;
 934         }
 935         for (i = &tdb->travlocks; i; i = i->next)
 936                 if (i->off == off)
 937                         return -1;
 938         if (tdb->allrecord_lock.count) {
 939                 if (tdb->allrecord_lock.ltype == F_WRLCK) {
 940                         return 0;
 941                 }
 942                 return -1;
 943         }
 944         return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
 945 }
 946
 947 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 948 {
 949         if (tdb->allrecord_lock.count) {
 950                 return 0;
 951         }
 952         return tdb_brunlock(tdb, F_WRLCK, off, 1);
 953 }
 954
 955 /* fcntl locks don't stack: avoid unlocking someone else's */
 956 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 957 {
 958         struct tdb_traverse_lock *i;
 959         uint32_t count = 0;
 960
 961         if (tdb->allrecord_lock.count) {
 962                 return 0;
 963         }
 964
 965         if (off == 0)
 966                 return 0;
 967         for (i = &tdb->travlocks; i; i = i->next)
 968                 if (i->off == off)
 969                         count++;
 970         return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
 971 }
 972
 973 bool tdb_have_extra_locks(struct tdb_context *tdb)
 974 {
 975         unsigned int extra = tdb->num_lockrecs;
 976
 977         /* A transaction holds the lock for all records. */
 978         if (!tdb->transaction && tdb->allrecord_lock.count) {
 979                 return true;
 980         }
 981
 982         /* We always hold the active lock if CLEAR_IF_FIRST. */
 983         if (find_nestlock(tdb, ACTIVE_LOCK)) {
 984                 extra--;
 985         }
 986
 987         /* In a transaction, we expect to hold the transaction lock */
 988         if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
 989                 extra--;
 990         }
 991
 992         return extra;
 993 }
 994
 995 /* The transaction code uses this to remove all locks. */
 996 void tdb_release_transaction_locks(struct tdb_context *tdb)
 997 {
 998         int i;
 999         unsigned int active = 0;
1000
1001         if (tdb->allrecord_lock.count != 0) {
1002                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
1003                 tdb->allrecord_lock.count = 0;
1004         }
1005
1006         for (i=0;i<tdb->num_lockrecs;i++) {
1007                 struct tdb_lock_type *lck = &tdb->lockrecs[i];
1008
1009                 /* Don't release the active lock!  Copy it to first entry. */
1010                 if (lck->off == ACTIVE_LOCK) {
1011                         tdb->lockrecs[active++] = *lck;
1012                 } else {
1013                         tdb_brunlock(tdb, lck->ltype, lck->off, 1);
1014                 }
1015         }
1016         tdb->num_lockrecs = active;
1017 }
1018
1019 /* Following functions are added specifically to support CTDB. */
1020
1021 /* Don't do actual fcntl locking, just mark tdb locked */
1022 _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
1023 _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
1024 {
1025         return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
1026 }
1027
1028 /* Don't do actual fcntl unlocking, just mark tdb unlocked */
1029 _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
1030 _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
1031 {
1032         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
1033 }