docs: document SUPPORTS_BLOCK_REFCOUNTING for share:fake_fscaps
[samba4-gss.git] / lib / tdb / common / tdb.c
blob4ad01b2b360b2b5027b38a68d76a852cbc627433
 /*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Andrew Tridgell              1999-2005
   Copyright (C) Paul `Rusty' Russell         2000
   Copyright (C) Jeremy Allison               2000-2003

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
28 #include "tdb_private.h"
30 _PUBLIC_ TDB_DATA tdb_null;
33 non-blocking increment of the tdb sequence number if the tdb has been opened using
34 the TDB_SEQNUM flag
36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
38 tdb_off_t seqnum=0;
40 if (!(tdb->flags & TDB_SEQNUM)) {
41 return;
44 /* we ignore errors from this, as we have no sane way of
45 dealing with them.
47 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
48 seqnum++;
49 tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
53 increment the tdb sequence number if the tdb has been opened using
54 the TDB_SEQNUM flag
56 static void tdb_increment_seqnum(struct tdb_context *tdb)
58 if (!(tdb->flags & TDB_SEQNUM)) {
59 return;
62 if (tdb->transaction != NULL) {
63 tdb_increment_seqnum_nonblock(tdb);
64 return;
67 #if defined(HAVE___ATOMIC_ADD_FETCH) && defined(HAVE___ATOMIC_ADD_LOAD)
68 if (tdb->map_ptr != NULL) {
69 uint32_t *pseqnum = (uint32_t *)(
70 TDB_SEQNUM_OFS + (char *)tdb->map_ptr);
71 __atomic_add_fetch(pseqnum, 1, __ATOMIC_SEQ_CST);
72 return;
74 #endif
76 if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
77 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
78 return;
81 tdb_increment_seqnum_nonblock(tdb);
83 tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
86 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
88 return memcmp(data.dptr, key.dptr, data.dsize);
91 void tdb_chainwalk_init(struct tdb_chainwalk_ctx *ctx, tdb_off_t ptr)
93 *ctx = (struct tdb_chainwalk_ctx) { .slow_ptr = ptr };
96 bool tdb_chainwalk_check(struct tdb_context *tdb,
97 struct tdb_chainwalk_ctx *ctx,
98 tdb_off_t next_ptr)
100 int ret;
102 if (ctx->slow_chase) {
103 ret = tdb_ofs_read(tdb, ctx->slow_ptr, &ctx->slow_ptr);
104 if (ret == -1) {
105 return false;
108 ctx->slow_chase = !ctx->slow_chase;
110 if (next_ptr == ctx->slow_ptr) {
111 tdb->ecode = TDB_ERR_CORRUPT;
112 TDB_LOG((tdb, TDB_DEBUG_ERROR,
113 "tdb_chainwalk_check: circular chain\n"));
114 return false;
117 return true;
120 /* Returns 0 on fail. On success, return offset of record, and fills
121 in rec */
122 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
123 struct tdb_record *r)
125 tdb_off_t rec_ptr;
126 struct tdb_chainwalk_ctx chainwalk;
128 /* read in the hash top */
129 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
130 return 0;
132 tdb_chainwalk_init(&chainwalk, rec_ptr);
134 /* keep looking until we find the right record */
135 while (rec_ptr) {
136 bool ok;
138 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
139 return 0;
141 if (!TDB_DEAD(r) && hash==r->full_hash
142 && key.dsize==r->key_len
143 && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
144 r->key_len, tdb_key_compare,
145 NULL) == 0) {
146 return rec_ptr;
148 rec_ptr = r->next;
150 ok = tdb_chainwalk_check(tdb, &chainwalk, rec_ptr);
151 if (!ok) {
152 return 0;
155 tdb->ecode = TDB_ERR_NOEXIST;
156 return 0;
159 /* As tdb_find, but if you succeed, keep the lock */
160 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
161 struct tdb_record *rec)
163 uint32_t rec_ptr;
165 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
166 return 0;
167 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
168 tdb_unlock(tdb, BUCKET(hash), locktype);
169 return rec_ptr;
172 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
174 struct tdb_update_hash_state {
175 const TDB_DATA *dbufs;
176 int num_dbufs;
177 tdb_len_t dbufs_len;
180 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
182 struct tdb_update_hash_state *state = private_data;
183 unsigned char *dptr = data.dptr;
184 int i;
186 if (state->dbufs_len != data.dsize) {
187 return -1;
190 for (i=0; i<state->num_dbufs; i++) {
191 TDB_DATA dbuf = state->dbufs[i];
192 if( dbuf.dsize > 0) {
193 int ret;
194 ret = memcmp(dptr, dbuf.dptr, dbuf.dsize);
195 if (ret != 0) {
196 return -1;
198 dptr += dbuf.dsize;
202 return 0;
205 /* update an entry in place - this only works if the new data size
206 is <= the old data size and the key exists.
207 on failure return -1.
209 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key,
210 uint32_t hash,
211 const TDB_DATA *dbufs, int num_dbufs,
212 tdb_len_t dbufs_len)
214 struct tdb_record rec;
215 tdb_off_t rec_ptr, ofs;
216 int i;
218 /* find entry */
219 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
220 return -1;
222 /* it could be an exact duplicate of what is there - this is
223 * surprisingly common (eg. with a ldb re-index). */
224 if (rec.data_len == dbufs_len) {
225 struct tdb_update_hash_state state = {
226 .dbufs = dbufs, .num_dbufs = num_dbufs,
227 .dbufs_len = dbufs_len
229 int ret;
231 ret = tdb_parse_record(tdb, key, tdb_update_hash_cmp, &state);
232 if (ret == 0) {
233 return 0;
237 /* must be long enough key, data and tailer */
238 if (rec.rec_len < key.dsize + dbufs_len + sizeof(tdb_off_t)) {
239 tdb->ecode = TDB_SUCCESS; /* Not really an error */
240 return -1;
243 ofs = rec_ptr + sizeof(rec) + rec.key_len;
245 for (i=0; i<num_dbufs; i++) {
246 TDB_DATA dbuf = dbufs[i];
247 int ret;
249 ret = tdb->methods->tdb_write(tdb, ofs, dbuf.dptr, dbuf.dsize);
250 if (ret == -1) {
251 return -1;
253 ofs += dbuf.dsize;
256 if (dbufs_len != rec.data_len) {
257 /* update size */
258 rec.data_len = dbufs_len;
259 return tdb_rec_write(tdb, rec_ptr, &rec);
262 return 0;
265 /* find an entry in the database given a key */
266 /* If an entry doesn't exist tdb_err will be set to
267 * TDB_ERR_NOEXIST. If a key has no data attached
268 * then the TDB_DATA will have zero length but
269 * a non-zero pointer
271 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
273 tdb_off_t rec_ptr;
274 struct tdb_record rec;
275 TDB_DATA ret;
276 uint32_t hash;
278 /* find which hash bucket it is in */
279 hash = tdb->hash_fn(&key);
280 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
281 return tdb_null;
283 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
284 rec.data_len);
285 ret.dsize = rec.data_len;
286 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
287 return ret;
290 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
292 TDB_DATA ret = _tdb_fetch(tdb, key);
294 tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
295 return ret;
299 * Find an entry in the database and hand the record's data to a parsing
300 * function. The parsing function is executed under the chain read lock, so it
301 * should be fast and should not block on other syscalls.
303 * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
305 * For mmapped tdb's that do not have a transaction open it points the parsing
306 * function directly at the mmap area, it avoids the malloc/memcpy in this
307 * case. If a transaction is open or no mmap is available, it has to do
308 * malloc/read/parse/free.
310 * This is interesting for all readers of potentially large data structures in
311 * the tdb records, ldb indexes being one example.
313 * Return -1 if the record was not found.
316 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
317 int (*parser)(TDB_DATA key, TDB_DATA data,
318 void *private_data),
319 void *private_data)
321 tdb_off_t rec_ptr;
322 struct tdb_record rec;
323 int ret;
324 uint32_t hash;
326 /* find which hash bucket it is in */
327 hash = tdb->hash_fn(&key);
329 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
330 /* record not found */
331 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
332 tdb->ecode = TDB_ERR_NOEXIST;
333 return -1;
335 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
337 ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
338 rec.data_len, parser, private_data);
340 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
342 return ret;
345 /* check if an entry in the database exists
347 note that 1 is returned if the key is found and 0 is returned if not found
348 this doesn't match the conventions in the rest of this module, but is
349 compatible with gdbm
351 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
353 struct tdb_record rec;
355 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
356 return 0;
357 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
358 return 1;
361 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
363 uint32_t hash = tdb->hash_fn(&key);
364 int ret;
366 ret = tdb_exists_hash(tdb, key, hash);
367 tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
368 return ret;
372 * Move a dead record to the freelist. The hash chain and freelist
373 * must be locked.
375 static int tdb_del_dead(struct tdb_context *tdb,
376 uint32_t last_ptr,
377 uint32_t rec_ptr,
378 struct tdb_record *rec,
379 bool *deleted)
381 int ret;
383 ret = tdb_write_lock_record(tdb, rec_ptr);
384 if (ret == -1) {
385 /* Someone traversing here: Just leave it dead */
386 return 0;
388 ret = tdb_write_unlock_record(tdb, rec_ptr);
389 if (ret == -1) {
390 return -1;
392 ret = tdb_ofs_write(tdb, last_ptr, &rec->next);
393 if (ret == -1) {
394 return -1;
397 *deleted = true;
399 ret = tdb_free(tdb, rec_ptr, rec);
400 return ret;
404 * Walk the hash chain and leave tdb->max_dead_records around. Move
405 * the rest of dead records to the freelist.
407 int tdb_trim_dead(struct tdb_context *tdb, uint32_t hash)
409 struct tdb_chainwalk_ctx chainwalk;
410 struct tdb_record rec;
411 tdb_off_t last_ptr, rec_ptr;
412 bool locked_freelist = false;
413 int num_dead = 0;
414 int ret;
416 last_ptr = TDB_HASH_TOP(hash);
419 * Init chainwalk with the pointer to the hash top. It might
420 * be that the very first record in the chain is a dead one
421 * that we have to delete.
423 tdb_chainwalk_init(&chainwalk, last_ptr);
425 ret = tdb_ofs_read(tdb, last_ptr, &rec_ptr);
426 if (ret == -1) {
427 return -1;
430 while (rec_ptr != 0) {
431 bool deleted = false;
432 uint32_t next;
434 ret = tdb_rec_read(tdb, rec_ptr, &rec);
435 if (ret == -1) {
436 goto fail;
440 * Make a copy of rec.next: Further down we might
441 * delete and put the record on the freelist. Make
442 * sure that modifications in that code path can't
443 * break the chainwalk here.
445 next = rec.next;
447 if (rec.magic == TDB_DEAD_MAGIC) {
448 num_dead += 1;
450 if (num_dead > tdb->max_dead_records) {
452 if (!locked_freelist) {
454 * Lock the freelist only if
455 * it's really required.
457 ret = tdb_lock(tdb, -1, F_WRLCK);
458 if (ret == -1) {
459 goto fail;
461 locked_freelist = true;
464 ret = tdb_del_dead(
465 tdb,
466 last_ptr,
467 rec_ptr,
468 &rec,
469 &deleted);
471 if (ret == -1) {
472 goto fail;
478 * Don't do the chainwalk check if "rec_ptr" was
479 * deleted. We reduced the chain, and the chainwalk
480 * check might catch up early. Imagine a valid chain
481 * with just dead records: We never can bump the
482 * "slow" pointer in chainwalk_check, as there isn't
483 * anything left to jump to and compare.
485 if (!deleted) {
486 bool ok;
488 last_ptr = rec_ptr;
490 ok = tdb_chainwalk_check(tdb, &chainwalk, next);
491 if (!ok) {
492 ret = -1;
493 goto fail;
496 rec_ptr = next;
498 ret = 0;
499 fail:
500 if (locked_freelist) {
501 tdb_unlock(tdb, -1, F_WRLCK);
503 return ret;
506 /* delete an entry in the database given a key */
507 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
509 tdb_off_t rec_ptr;
510 struct tdb_record rec;
511 int ret;
513 if (tdb->read_only || tdb->traverse_read) {
514 tdb->ecode = TDB_ERR_RDONLY;
515 return -1;
518 rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
519 if (rec_ptr == 0) {
520 return -1;
524 * Mark the record dead
526 rec.magic = TDB_DEAD_MAGIC;
527 ret = tdb_rec_write(tdb, rec_ptr, &rec);
528 if (ret == -1) {
529 goto done;
532 tdb_increment_seqnum(tdb);
534 ret = tdb_trim_dead(tdb, hash);
535 done:
536 if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
537 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
538 return ret;
541 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
543 uint32_t hash = tdb->hash_fn(&key);
544 int ret;
546 ret = tdb_delete_hash(tdb, key, hash);
547 tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
548 return ret;
552 * See if we have a dead record around with enough space
554 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
555 struct tdb_record *r, tdb_len_t length,
556 tdb_off_t *p_last_ptr)
558 tdb_off_t rec_ptr, last_ptr;
559 struct tdb_chainwalk_ctx chainwalk;
560 tdb_off_t best_rec_ptr = 0;
561 tdb_off_t best_last_ptr = 0;
562 struct tdb_record best = { .rec_len = UINT32_MAX };
564 length += sizeof(tdb_off_t); /* tailer */
566 last_ptr = TDB_HASH_TOP(hash);
568 /* read in the hash top */
569 if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
570 return 0;
572 tdb_chainwalk_init(&chainwalk, rec_ptr);
574 /* keep looking until we find the right record */
575 while (rec_ptr) {
576 bool ok;
578 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
579 return 0;
581 if (TDB_DEAD(r) && (r->rec_len >= length) &&
582 (r->rec_len < best.rec_len)) {
583 best_rec_ptr = rec_ptr;
584 best_last_ptr = last_ptr;
585 best = *r;
587 last_ptr = rec_ptr;
588 rec_ptr = r->next;
590 ok = tdb_chainwalk_check(tdb, &chainwalk, rec_ptr);
591 if (!ok) {
592 return 0;
596 if (best.rec_len == UINT32_MAX) {
597 return 0;
600 *r = best;
601 *p_last_ptr = best_last_ptr;
602 return best_rec_ptr;
605 static int _tdb_storev(struct tdb_context *tdb, TDB_DATA key,
606 const TDB_DATA *dbufs, int num_dbufs,
607 int flag, uint32_t hash)
609 struct tdb_record rec;
610 tdb_off_t rec_ptr, ofs;
611 tdb_len_t rec_len, dbufs_len;
612 int i;
613 int ret = -1;
615 dbufs_len = 0;
617 for (i=0; i<num_dbufs; i++) {
618 size_t dsize = dbufs[i].dsize;
620 if ((dsize != 0) && (dbufs[i].dptr == NULL)) {
621 tdb->ecode = TDB_ERR_EINVAL;
622 goto fail;
625 dbufs_len += dsize;
626 if (dbufs_len < dsize) {
627 tdb->ecode = TDB_ERR_OOM;
628 goto fail;
632 rec_len = key.dsize + dbufs_len;
633 if ((rec_len < key.dsize) || (rec_len < dbufs_len)) {
634 tdb->ecode = TDB_ERR_OOM;
635 goto fail;
638 /* check for it existing, on insert. */
639 if (flag == TDB_INSERT) {
640 if (tdb_exists_hash(tdb, key, hash)) {
641 tdb->ecode = TDB_ERR_EXISTS;
642 goto fail;
644 } else {
645 /* first try in-place update, on modify or replace. */
646 if (tdb_update_hash(tdb, key, hash, dbufs, num_dbufs,
647 dbufs_len) == 0) {
648 goto done;
650 if (tdb->ecode == TDB_ERR_NOEXIST &&
651 flag == TDB_MODIFY) {
652 /* if the record doesn't exist and we are in TDB_MODIFY mode then
653 we should fail the store */
654 goto fail;
657 /* reset the error code potentially set by the tdb_update_hash() */
658 tdb->ecode = TDB_SUCCESS;
660 /* delete any existing record - if it doesn't exist we don't
661 care. Doing this first reduces fragmentation, and avoids
662 coalescing with `allocated' block before it's updated. */
663 if (flag != TDB_INSERT)
664 tdb_delete_hash(tdb, key, hash);
666 /* we have to allocate some space */
667 rec_ptr = tdb_allocate(tdb, hash, rec_len, &rec);
669 if (rec_ptr == 0) {
670 goto fail;
673 /* Read hash top into next ptr */
674 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
675 goto fail;
677 rec.key_len = key.dsize;
678 rec.data_len = dbufs_len;
679 rec.full_hash = hash;
680 rec.magic = TDB_MAGIC;
682 ofs = rec_ptr;
684 /* write out and point the top of the hash chain at it */
685 ret = tdb_rec_write(tdb, ofs, &rec);
686 if (ret == -1) {
687 goto fail;
689 ofs += sizeof(rec);
691 ret = tdb->methods->tdb_write(tdb, ofs, key.dptr, key.dsize);
692 if (ret == -1) {
693 goto fail;
695 ofs += key.dsize;
697 for (i=0; i<num_dbufs; i++) {
698 if (dbufs[i].dsize == 0) {
699 continue;
702 ret = tdb->methods->tdb_write(tdb, ofs, dbufs[i].dptr,
703 dbufs[i].dsize);
704 if (ret == -1) {
705 goto fail;
707 ofs += dbufs[i].dsize;
710 ret = tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr);
711 if (ret == -1) {
712 /* Need to tdb_unallocate() here */
713 goto fail;
716 done:
717 ret = 0;
718 fail:
719 if (ret == 0) {
720 tdb_increment_seqnum(tdb);
722 return ret;
725 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
726 TDB_DATA dbuf, int flag, uint32_t hash)
728 return _tdb_storev(tdb, key, &dbuf, 1, flag, hash);
731 /* store an element in the database, replacing any existing element
732 with the same key
734 return 0 on success, -1 on failure
736 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
738 uint32_t hash;
739 int ret;
741 if (tdb->read_only || tdb->traverse_read) {
742 tdb->ecode = TDB_ERR_RDONLY;
743 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
744 return -1;
747 /* find which hash bucket it is in */
748 hash = tdb->hash_fn(&key);
749 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
750 return -1;
752 ret = _tdb_store(tdb, key, dbuf, flag, hash);
753 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
754 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
755 return ret;
758 _PUBLIC_ int tdb_storev(struct tdb_context *tdb, TDB_DATA key,
759 const TDB_DATA *dbufs, int num_dbufs, int flag)
761 uint32_t hash;
762 int ret;
764 if (tdb->read_only || tdb->traverse_read) {
765 tdb->ecode = TDB_ERR_RDONLY;
766 tdb_trace_1plusn_rec_flag_ret(tdb, "tdb_storev", key,
767 dbufs, num_dbufs, flag, -1);
768 return -1;
771 /* find which hash bucket it is in */
772 hash = tdb->hash_fn(&key);
773 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
774 return -1;
776 ret = _tdb_storev(tdb, key, dbufs, num_dbufs, flag, hash);
777 tdb_trace_1plusn_rec_flag_ret(tdb, "tdb_storev", key,
778 dbufs, num_dbufs, flag, -1);
779 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
780 return ret;
783 /* Append to an entry. Create if not exist. */
784 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
786 uint32_t hash;
787 TDB_DATA dbufs[2];
788 int ret = -1;
790 /* find which hash bucket it is in */
791 hash = tdb->hash_fn(&key);
792 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
793 return -1;
795 dbufs[0] = _tdb_fetch(tdb, key);
796 dbufs[1] = new_dbuf;
798 ret = _tdb_storev(tdb, key, dbufs, 2, 0, hash);
799 tdb_trace_2rec_retrec(tdb, "tdb_append", key, dbufs[0], dbufs[1]);
801 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
802 SAFE_FREE(dbufs[0].dptr);
803 return ret;
808 return the name of the current tdb file
809 useful for external logging functions
811 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
813 return tdb->name;
817 return the underlying file descriptor being used by tdb, or -1
818 useful for external routines that want to check the device/inode
819 of the fd
821 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
823 return tdb->fd;
827 return the current logging function
828 useful for external tdb routines that wish to log tdb errors
830 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
832 return tdb->log.log_fn;
837 get the tdb sequence number. Only makes sense if the writers opened
838 with TDB_SEQNUM set. Note that this sequence number will wrap quite
839 quickly, so it should only be used for a 'has something changed'
840 test, not for code that relies on the count of the number of changes
841 made. If you want a counter then use a tdb record.
843 The aim of this sequence number is to allow for a very lightweight
844 test of a possible tdb change.
846 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
848 tdb_off_t seqnum=0;
850 if (tdb->transaction != NULL) {
851 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
852 return seqnum;
855 #if defined(HAVE___ATOMIC_ADD_FETCH) && defined(HAVE___ATOMIC_ADD_LOAD)
856 if (tdb->map_ptr != NULL) {
857 uint32_t *pseqnum = (uint32_t *)(
858 TDB_SEQNUM_OFS + (char *)tdb->map_ptr);
859 uint32_t ret;
860 __atomic_load(pseqnum, &ret,__ATOMIC_SEQ_CST);
861 return ret;
863 #endif
865 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
866 return seqnum;
869 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
871 return tdb->hash_size;
874 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
876 return tdb->map_size;
879 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
881 return tdb->flags;
884 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
886 if ((flags & TDB_ALLOW_NESTING) &&
887 (flags & TDB_DISALLOW_NESTING)) {
888 tdb->ecode = TDB_ERR_NESTING;
889 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
890 "allow_nesting and disallow_nesting are not allowed together!"));
891 return;
894 if (flags & TDB_ALLOW_NESTING) {
895 tdb->flags &= ~TDB_DISALLOW_NESTING;
897 if (flags & TDB_DISALLOW_NESTING) {
898 tdb->flags &= ~TDB_ALLOW_NESTING;
901 tdb->flags |= flags;
904 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
906 if ((flags & TDB_ALLOW_NESTING) &&
907 (flags & TDB_DISALLOW_NESTING)) {
908 tdb->ecode = TDB_ERR_NESTING;
909 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
910 "allow_nesting and disallow_nesting are not allowed together!"));
911 return;
914 if ((flags & TDB_NOLOCK) &&
915 (tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) &&
916 (tdb->mutexes == NULL)) {
917 tdb->ecode = TDB_ERR_LOCK;
918 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
919 "Can not remove NOLOCK flag on mutexed databases"));
920 return;
923 if (flags & TDB_ALLOW_NESTING) {
924 tdb->flags |= TDB_DISALLOW_NESTING;
926 if (flags & TDB_DISALLOW_NESTING) {
927 tdb->flags |= TDB_ALLOW_NESTING;
930 tdb->flags &= ~flags;
935 enable sequence number handling on an open tdb
937 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
939 tdb->flags |= TDB_SEQNUM;
944 add a region of the file to the freelist. Length is the size of the region in bytes,
945 which includes the free list header that needs to be added
947 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
949 struct tdb_record rec;
950 if (length <= sizeof(rec)) {
951 /* the region is not worth adding */
952 return 0;
954 if (length + offset > tdb->map_size) {
955 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
956 return -1;
958 memset(&rec,'\0',sizeof(rec));
959 rec.rec_len = length - sizeof(rec);
960 if (tdb_free(tdb, offset, &rec) == -1) {
961 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
962 return -1;
964 return 0;
968 wipe the entire database, deleting all records. This can be done
969 very fast by using a allrecord lock. The entire data portion of the
970 file becomes a single entry in the freelist.
972 This code carefully steps around the recovery area, leaving it alone
974 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
976 uint32_t i;
977 tdb_off_t offset = 0;
978 ssize_t data_len;
979 tdb_off_t recovery_head;
980 tdb_len_t recovery_size = 0;
982 if (tdb_lockall(tdb) != 0) {
983 return -1;
986 tdb_trace(tdb, "tdb_wipe_all");
988 /* see if the tdb has a recovery area, and remember its size
989 if so. We don't want to lose this as otherwise each
990 tdb_wipe_all() in a transaction will increase the size of
991 the tdb by the size of the recovery area */
992 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
993 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
994 goto failed;
997 if (recovery_head != 0) {
998 struct tdb_record rec;
999 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
1000 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
1001 return -1;
1003 recovery_size = rec.rec_len + sizeof(rec);
1006 /* wipe the hashes */
1007 for (i=0;i<tdb->hash_size;i++) {
1008 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
1009 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
1010 goto failed;
1014 /* wipe the freelist */
1015 if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
1016 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
1017 goto failed;
1020 /* add all the rest of the file to the freelist, possibly leaving a gap
1021 for the recovery area */
1022 if (recovery_size == 0) {
1023 /* the simple case - the whole file can be used as a freelist */
1024 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
1025 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
1026 goto failed;
1028 } else {
1029 /* we need to add two freelist entries - one on either
1030 side of the recovery area
1032 Note that we cannot shift the recovery area during
1033 this operation. Only the transaction.c code may
1034 move the recovery area or we risk subtle data
1035 corruption
1037 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
1038 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
1039 goto failed;
1041 /* and the 2nd free list entry after the recovery area - if any */
1042 data_len = tdb->map_size - (recovery_head+recovery_size);
1043 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
1044 goto failed;
1048 tdb_increment_seqnum_nonblock(tdb);
1050 if (tdb_unlockall(tdb) != 0) {
1051 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
1052 goto failed;
1055 return 0;
1057 failed:
1058 tdb_unlockall(tdb);
1059 return -1;
/* State shared between tdb_repack() and its traverse callback. */
struct traverse_state {
	bool error;			/* set when a store into dest_db failed */
	struct tdb_context *dest_db;	/* database the records are copied into */
};
1068 traverse function for repacking
1070 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
1072 struct traverse_state *state = (struct traverse_state *)private_data;
1073 if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
1074 state->error = true;
1075 return -1;
1077 return 0;
1081 repack a tdb
1083 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
1085 struct tdb_context *tmp_db;
1086 struct traverse_state state;
1088 tdb_trace(tdb, "tdb_repack");
1090 if (tdb_transaction_start(tdb) != 0) {
1091 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
1092 return -1;
1095 tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
1096 if (tmp_db == NULL) {
1097 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
1098 tdb_transaction_cancel(tdb);
1099 return -1;
1102 state.error = false;
1103 state.dest_db = tmp_db;
1105 if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
1106 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
1107 tdb_transaction_cancel(tdb);
1108 tdb_close(tmp_db);
1109 return -1;
1112 if (state.error) {
1113 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
1114 tdb_transaction_cancel(tdb);
1115 tdb_close(tmp_db);
1116 return -1;
1119 if (tdb_wipe_all(tdb) != 0) {
1120 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
1121 tdb_transaction_cancel(tdb);
1122 tdb_close(tmp_db);
1123 return -1;
1126 state.error = false;
1127 state.dest_db = tdb;
1129 if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
1130 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
1131 tdb_transaction_cancel(tdb);
1132 tdb_close(tmp_db);
1133 return -1;
1136 if (state.error) {
1137 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
1138 tdb_transaction_cancel(tdb);
1139 tdb_close(tmp_db);
1140 return -1;
1143 tdb_close(tmp_db);
1145 if (tdb_transaction_commit(tdb) != 0) {
1146 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
1147 return -1;
1150 return 0;
/*
 * Write all "count" bytes of "buf" to "fd", looping over short writes.
 * Even on files, we can get partial writes due to signals.
 *
 * Returns true once everything has been written (trivially for
 * count == 0), false as soon as write() reports an error.
 */
bool tdb_write_all(int fd, const void *buf, size_t count)
{
	const char *cursor = buf;
	size_t remaining = count;

	while (remaining != 0) {
		ssize_t written = write(fd, cursor, remaining);

		if (written < 0) {
			return false;
		}
		cursor += written;
		remaining -= written;
	}
	return true;
}
1167 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
1169 tdb_off_t ret = a + b;
1171 if ((ret < a) || (ret < b)) {
1172 return false;
1174 *pret = ret;
1175 return true;
1178 #ifdef TDB_TRACE
1179 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1181 if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
1182 close(tdb->tracefd);
1183 tdb->tracefd = -1;
1187 static void tdb_trace_start(struct tdb_context *tdb)
1189 tdb_off_t seqnum=0;
1190 char msg[sizeof(tdb_off_t) * 4 + 1];
1192 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1193 snprintf(msg, sizeof(msg), "%u ", seqnum);
1194 tdb_trace_write(tdb, msg);
/* Finish a trace line with a newline. */
static void tdb_trace_end(struct tdb_context *tdb)
{
	static const char newline[] = "\n";

	tdb_trace_write(tdb, newline);
}
/* Finish a trace line with " = <ret>" and a newline. */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
	char suffix[sizeof(ret) * 4 + 4];

	snprintf(suffix, sizeof(suffix), " = %i\n", ret);
	tdb_trace_write(tdb, suffix);
}
1209 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1211 char msg[21 + rec.dsize*2], *p;
1212 unsigned int i;
1214 /* We differentiate zero-length records from non-existent ones. */
1215 if (rec.dptr == NULL) {
1216 tdb_trace_write(tdb, " NULL");
1217 return;
1220 /* snprintf here is purely cargo-cult programming. */
1221 p = msg;
1222 p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1224 for (i = 0; i < rec.dsize; i++) {
1225 snprintf(p, 3, "%02x", rec.dptr[i]);
1226 p += 2;
1229 tdb_trace_write(tdb, msg);
/* Trace line layout: "<seqnum> <op>\n". */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
	tdb_trace_start(tdb);		/* "<seqnum> " */
	tdb_trace_write(tdb, op);	/* operation name */
	tdb_trace_end(tdb);		/* newline */
}
1239 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1241 char msg[sizeof(tdb_off_t) * 4 + 1];
1243 snprintf(msg, sizeof(msg), "%u ", seqnum);
1244 tdb_trace_write(tdb, msg);
1245 tdb_trace_write(tdb, op);
1246 tdb_trace_end(tdb);
/*
 * Trace line layout:
 * "<seqnum> <op> <hash_size> 0x<tdb_flags> 0x<open_flags>\n".
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
	char details[128];

	snprintf(details, sizeof(details),
		 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);

	tdb_trace_start(tdb);
	tdb_trace_write(tdb, details);
	tdb_trace_end(tdb);
}
/* Trace line layout: "<seqnum> <op> = <ret>\n". */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
	tdb_trace_start(tdb);		/* "<seqnum> " */
	tdb_trace_write(tdb, op);	/* operation name */
	tdb_trace_end_ret(tdb, ret);	/* " = <ret>\n" */
}
1268 void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
1270 tdb_trace_start(tdb);
1271 tdb_trace_write(tdb, op);
1272 tdb_trace_write(tdb, " =");
1273 tdb_trace_record(tdb, ret);
1274 tdb_trace_end(tdb);
1277 void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
1278 TDB_DATA rec)
1280 tdb_trace_start(tdb);
1281 tdb_trace_write(tdb, op);
1282 tdb_trace_record(tdb, rec);
1283 tdb_trace_end(tdb);
1286 void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
1287 TDB_DATA rec, int ret)
1289 tdb_trace_start(tdb);
1290 tdb_trace_write(tdb, op);
1291 tdb_trace_record(tdb, rec);
1292 tdb_trace_end_ret(tdb, ret);
1295 void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
1296 TDB_DATA rec, TDB_DATA ret)
1298 tdb_trace_start(tdb);
1299 tdb_trace_write(tdb, op);
1300 tdb_trace_record(tdb, rec);
1301 tdb_trace_write(tdb, " =");
1302 tdb_trace_record(tdb, ret);
1303 tdb_trace_end(tdb);
1306 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1307 TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1308 int ret)
1310 char msg[1 + sizeof(ret) * 4];
1312 snprintf(msg, sizeof(msg), " %#x", flag);
1313 tdb_trace_start(tdb);
1314 tdb_trace_write(tdb, op);
1315 tdb_trace_record(tdb, rec1);
1316 tdb_trace_record(tdb, rec2);
1317 tdb_trace_write(tdb, msg);
1318 tdb_trace_end_ret(tdb, ret);
1321 void tdb_trace_1plusn_rec_flag_ret(struct tdb_context *tdb, const char *op,
1322 TDB_DATA rec,
1323 const TDB_DATA *recs, int num_recs,
1324 unsigned flag, int ret)
1326 char msg[1 + sizeof(ret) * 4];
1327 int i;
1329 snprintf(msg, sizeof(msg), " %#x", flag);
1330 tdb_trace_start(tdb);
1331 tdb_trace_write(tdb, op);
1332 tdb_trace_record(tdb, rec);
1333 for (i=0; i<num_recs; i++) {
1334 tdb_trace_record(tdb, recs[i]);
1336 tdb_trace_write(tdb, msg);
1337 tdb_trace_end_ret(tdb, ret);
1340 void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
1341 TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
1343 tdb_trace_start(tdb);
1344 tdb_trace_write(tdb, op);
1345 tdb_trace_record(tdb, rec1);
1346 tdb_trace_record(tdb, rec2);
1347 tdb_trace_write(tdb, " =");
1348 tdb_trace_record(tdb, ret);
1349 tdb_trace_end(tdb);
1351 #endif