/*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Andrew Tridgell              1999-2005
   Copyright (C) Paul `Rusty' Russell              2000
   Copyright (C) Jeremy Allison               2000-2003

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "tdb_private.h"
#include <errno.h>
30 _PUBLIC_ TDB_DATA tdb_null
;
33 non-blocking increment of the tdb sequence number if the tdb has been opened using
36 _PUBLIC_
void tdb_increment_seqnum_nonblock(struct tdb_context
*tdb
)
40 if (!(tdb
->flags
& TDB_SEQNUM
)) {
44 /* we ignore errors from this, as we have no sane way of
47 tdb_ofs_read(tdb
, TDB_SEQNUM_OFS
, &seqnum
);
49 tdb_ofs_write(tdb
, TDB_SEQNUM_OFS
, &seqnum
);
53 increment the tdb sequence number if the tdb has been opened using
56 static void tdb_increment_seqnum(struct tdb_context
*tdb
)
58 if (!(tdb
->flags
& TDB_SEQNUM
)) {
62 if (tdb
->transaction
!= NULL
) {
63 tdb_increment_seqnum_nonblock(tdb
);
67 #if defined(HAVE___ATOMIC_ADD_FETCH) && defined(HAVE___ATOMIC_ADD_LOAD)
68 if (tdb
->map_ptr
!= NULL
) {
69 uint32_t *pseqnum
= (uint32_t *)(
70 TDB_SEQNUM_OFS
+ (char *)tdb
->map_ptr
);
71 __atomic_add_fetch(pseqnum
, 1, __ATOMIC_SEQ_CST
);
76 if (tdb_nest_lock(tdb
, TDB_SEQNUM_OFS
, F_WRLCK
,
77 TDB_LOCK_WAIT
|TDB_LOCK_PROBE
) != 0) {
81 tdb_increment_seqnum_nonblock(tdb
);
83 tdb_nest_unlock(tdb
, TDB_SEQNUM_OFS
, F_WRLCK
, false);
86 static int tdb_key_compare(TDB_DATA key
, TDB_DATA data
, void *private_data
)
88 return memcmp(data
.dptr
, key
.dptr
, data
.dsize
);
91 void tdb_chainwalk_init(struct tdb_chainwalk_ctx
*ctx
, tdb_off_t ptr
)
93 *ctx
= (struct tdb_chainwalk_ctx
) { .slow_ptr
= ptr
};
96 bool tdb_chainwalk_check(struct tdb_context
*tdb
,
97 struct tdb_chainwalk_ctx
*ctx
,
102 if (ctx
->slow_chase
) {
103 ret
= tdb_ofs_read(tdb
, ctx
->slow_ptr
, &ctx
->slow_ptr
);
108 ctx
->slow_chase
= !ctx
->slow_chase
;
110 if (next_ptr
== ctx
->slow_ptr
) {
111 tdb
->ecode
= TDB_ERR_CORRUPT
;
112 TDB_LOG((tdb
, TDB_DEBUG_ERROR
,
113 "tdb_chainwalk_check: circular chain\n"));
120 /* Returns 0 on fail. On success, return offset of record, and fills
122 static tdb_off_t
tdb_find(struct tdb_context
*tdb
, TDB_DATA key
, uint32_t hash
,
123 struct tdb_record
*r
)
126 struct tdb_chainwalk_ctx chainwalk
;
128 /* read in the hash top */
129 if (tdb_ofs_read(tdb
, TDB_HASH_TOP(hash
), &rec_ptr
) == -1)
132 tdb_chainwalk_init(&chainwalk
, rec_ptr
);
134 /* keep looking until we find the right record */
138 if (tdb_rec_read(tdb
, rec_ptr
, r
) == -1)
141 if (!TDB_DEAD(r
) && hash
==r
->full_hash
142 && key
.dsize
==r
->key_len
143 && tdb_parse_data(tdb
, key
, rec_ptr
+ sizeof(*r
),
144 r
->key_len
, tdb_key_compare
,
150 ok
= tdb_chainwalk_check(tdb
, &chainwalk
, rec_ptr
);
155 tdb
->ecode
= TDB_ERR_NOEXIST
;
159 /* As tdb_find, but if you succeed, keep the lock */
160 tdb_off_t
tdb_find_lock_hash(struct tdb_context
*tdb
, TDB_DATA key
, uint32_t hash
, int locktype
,
161 struct tdb_record
*rec
)
165 if (tdb_lock(tdb
, BUCKET(hash
), locktype
) == -1)
167 if (!(rec_ptr
= tdb_find(tdb
, key
, hash
, rec
)))
168 tdb_unlock(tdb
, BUCKET(hash
), locktype
);
172 static TDB_DATA
_tdb_fetch(struct tdb_context
*tdb
, TDB_DATA key
);
174 struct tdb_update_hash_state
{
175 const TDB_DATA
*dbufs
;
180 static int tdb_update_hash_cmp(TDB_DATA key
, TDB_DATA data
, void *private_data
)
182 struct tdb_update_hash_state
*state
= private_data
;
183 unsigned char *dptr
= data
.dptr
;
186 if (state
->dbufs_len
!= data
.dsize
) {
190 for (i
=0; i
<state
->num_dbufs
; i
++) {
191 TDB_DATA dbuf
= state
->dbufs
[i
];
192 if( dbuf
.dsize
> 0) {
194 ret
= memcmp(dptr
, dbuf
.dptr
, dbuf
.dsize
);
205 /* update an entry in place - this only works if the new data size
206 is <= the old data size and the key exists.
207 on failure return -1.
209 static int tdb_update_hash(struct tdb_context
*tdb
, TDB_DATA key
,
211 const TDB_DATA
*dbufs
, int num_dbufs
,
214 struct tdb_record rec
;
215 tdb_off_t rec_ptr
, ofs
;
219 if (!(rec_ptr
= tdb_find(tdb
, key
, hash
, &rec
)))
222 /* it could be an exact duplicate of what is there - this is
223 * surprisingly common (eg. with a ldb re-index). */
224 if (rec
.data_len
== dbufs_len
) {
225 struct tdb_update_hash_state state
= {
226 .dbufs
= dbufs
, .num_dbufs
= num_dbufs
,
227 .dbufs_len
= dbufs_len
231 ret
= tdb_parse_record(tdb
, key
, tdb_update_hash_cmp
, &state
);
237 /* must be long enough key, data and tailer */
238 if (rec
.rec_len
< key
.dsize
+ dbufs_len
+ sizeof(tdb_off_t
)) {
239 tdb
->ecode
= TDB_SUCCESS
; /* Not really an error */
243 ofs
= rec_ptr
+ sizeof(rec
) + rec
.key_len
;
245 for (i
=0; i
<num_dbufs
; i
++) {
246 TDB_DATA dbuf
= dbufs
[i
];
249 ret
= tdb
->methods
->tdb_write(tdb
, ofs
, dbuf
.dptr
, dbuf
.dsize
);
256 if (dbufs_len
!= rec
.data_len
) {
258 rec
.data_len
= dbufs_len
;
259 return tdb_rec_write(tdb
, rec_ptr
, &rec
);
265 /* find an entry in the database given a key */
266 /* If an entry doesn't exist tdb_err will be set to
267 * TDB_ERR_NOEXIST. If a key has no data attached
268 * then the TDB_DATA will have zero length but
271 static TDB_DATA
_tdb_fetch(struct tdb_context
*tdb
, TDB_DATA key
)
274 struct tdb_record rec
;
278 /* find which hash bucket it is in */
279 hash
= tdb
->hash_fn(&key
);
280 if (!(rec_ptr
= tdb_find_lock_hash(tdb
,key
,hash
,F_RDLCK
,&rec
)))
283 ret
.dptr
= tdb_alloc_read(tdb
, rec_ptr
+ sizeof(rec
) + rec
.key_len
,
285 ret
.dsize
= rec
.data_len
;
286 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_RDLCK
);
290 _PUBLIC_ TDB_DATA
tdb_fetch(struct tdb_context
*tdb
, TDB_DATA key
)
292 TDB_DATA ret
= _tdb_fetch(tdb
, key
);
294 tdb_trace_1rec_retrec(tdb
, "tdb_fetch", key
, ret
);
299 * Find an entry in the database and hand the record's data to a parsing
300 * function. The parsing function is executed under the chain read lock, so it
301 * should be fast and should not block on other syscalls.
303 * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
305 * For mmapped tdb's that do not have a transaction open it points the parsing
306 * function directly at the mmap area, it avoids the malloc/memcpy in this
307 * case. If a transaction is open or no mmap is available, it has to do
308 * malloc/read/parse/free.
310 * This is interesting for all readers of potentially large data structures in
311 * the tdb records, ldb indexes being one example.
313 * Return -1 if the record was not found.
316 _PUBLIC_
int tdb_parse_record(struct tdb_context
*tdb
, TDB_DATA key
,
317 int (*parser
)(TDB_DATA key
, TDB_DATA data
,
322 struct tdb_record rec
;
326 /* find which hash bucket it is in */
327 hash
= tdb
->hash_fn(&key
);
329 if (!(rec_ptr
= tdb_find_lock_hash(tdb
,key
,hash
,F_RDLCK
,&rec
))) {
330 /* record not found */
331 tdb_trace_1rec_ret(tdb
, "tdb_parse_record", key
, -1);
332 tdb
->ecode
= TDB_ERR_NOEXIST
;
335 tdb_trace_1rec_ret(tdb
, "tdb_parse_record", key
, 0);
337 ret
= tdb_parse_data(tdb
, key
, rec_ptr
+ sizeof(rec
) + rec
.key_len
,
338 rec
.data_len
, parser
, private_data
);
340 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_RDLCK
);
345 /* check if an entry in the database exists
347 note that 1 is returned if the key is found and 0 is returned if not found
348 this doesn't match the conventions in the rest of this module, but is
351 static int tdb_exists_hash(struct tdb_context
*tdb
, TDB_DATA key
, uint32_t hash
)
353 struct tdb_record rec
;
355 if (tdb_find_lock_hash(tdb
, key
, hash
, F_RDLCK
, &rec
) == 0)
357 tdb_unlock(tdb
, BUCKET(rec
.full_hash
), F_RDLCK
);
361 _PUBLIC_
int tdb_exists(struct tdb_context
*tdb
, TDB_DATA key
)
363 uint32_t hash
= tdb
->hash_fn(&key
);
366 ret
= tdb_exists_hash(tdb
, key
, hash
);
367 tdb_trace_1rec_ret(tdb
, "tdb_exists", key
, ret
);
372 * Move a dead record to the freelist. The hash chain and freelist
375 static int tdb_del_dead(struct tdb_context
*tdb
,
378 struct tdb_record
*rec
,
383 ret
= tdb_write_lock_record(tdb
, rec_ptr
);
385 /* Someone traversing here: Just leave it dead */
388 ret
= tdb_write_unlock_record(tdb
, rec_ptr
);
392 ret
= tdb_ofs_write(tdb
, last_ptr
, &rec
->next
);
399 ret
= tdb_free(tdb
, rec_ptr
, rec
);
404 * Walk the hash chain and leave tdb->max_dead_records around. Move
405 * the rest of dead records to the freelist.
407 int tdb_trim_dead(struct tdb_context
*tdb
, uint32_t hash
)
409 struct tdb_chainwalk_ctx chainwalk
;
410 struct tdb_record rec
;
411 tdb_off_t last_ptr
, rec_ptr
;
412 bool locked_freelist
= false;
416 last_ptr
= TDB_HASH_TOP(hash
);
419 * Init chainwalk with the pointer to the hash top. It might
420 * be that the very first record in the chain is a dead one
421 * that we have to delete.
423 tdb_chainwalk_init(&chainwalk
, last_ptr
);
425 ret
= tdb_ofs_read(tdb
, last_ptr
, &rec_ptr
);
430 while (rec_ptr
!= 0) {
431 bool deleted
= false;
434 ret
= tdb_rec_read(tdb
, rec_ptr
, &rec
);
440 * Make a copy of rec.next: Further down we might
441 * delete and put the record on the freelist. Make
442 * sure that modifications in that code path can't
443 * break the chainwalk here.
447 if (rec
.magic
== TDB_DEAD_MAGIC
) {
450 if (num_dead
> tdb
->max_dead_records
) {
452 if (!locked_freelist
) {
454 * Lock the freelist only if
455 * it's really required.
457 ret
= tdb_lock(tdb
, -1, F_WRLCK
);
461 locked_freelist
= true;
478 * Don't do the chainwalk check if "rec_ptr" was
479 * deleted. We reduced the chain, and the chainwalk
480 * check might catch up early. Imagine a valid chain
481 * with just dead records: We never can bump the
482 * "slow" pointer in chainwalk_check, as there isn't
483 * anything left to jump to and compare.
490 ok
= tdb_chainwalk_check(tdb
, &chainwalk
, next
);
500 if (locked_freelist
) {
501 tdb_unlock(tdb
, -1, F_WRLCK
);
506 /* delete an entry in the database given a key */
507 static int tdb_delete_hash(struct tdb_context
*tdb
, TDB_DATA key
, uint32_t hash
)
510 struct tdb_record rec
;
513 if (tdb
->read_only
|| tdb
->traverse_read
) {
514 tdb
->ecode
= TDB_ERR_RDONLY
;
518 rec_ptr
= tdb_find_lock_hash(tdb
, key
, hash
, F_WRLCK
, &rec
);
524 * Mark the record dead
526 rec
.magic
= TDB_DEAD_MAGIC
;
527 ret
= tdb_rec_write(tdb
, rec_ptr
, &rec
);
532 tdb_increment_seqnum(tdb
);
534 ret
= tdb_trim_dead(tdb
, hash
);
536 if (tdb_unlock(tdb
, BUCKET(hash
), F_WRLCK
) != 0)
537 TDB_LOG((tdb
, TDB_DEBUG_WARNING
, "tdb_delete: WARNING tdb_unlock failed!\n"));
541 _PUBLIC_
int tdb_delete(struct tdb_context
*tdb
, TDB_DATA key
)
543 uint32_t hash
= tdb
->hash_fn(&key
);
546 ret
= tdb_delete_hash(tdb
, key
, hash
);
547 tdb_trace_1rec_ret(tdb
, "tdb_delete", key
, ret
);
552 * See if we have a dead record around with enough space
554 tdb_off_t
tdb_find_dead(struct tdb_context
*tdb
, uint32_t hash
,
555 struct tdb_record
*r
, tdb_len_t length
,
556 tdb_off_t
*p_last_ptr
)
558 tdb_off_t rec_ptr
, last_ptr
;
559 struct tdb_chainwalk_ctx chainwalk
;
560 tdb_off_t best_rec_ptr
= 0;
561 tdb_off_t best_last_ptr
= 0;
562 struct tdb_record best
= { .rec_len
= UINT32_MAX
};
564 length
+= sizeof(tdb_off_t
); /* tailer */
566 last_ptr
= TDB_HASH_TOP(hash
);
568 /* read in the hash top */
569 if (tdb_ofs_read(tdb
, last_ptr
, &rec_ptr
) == -1)
572 tdb_chainwalk_init(&chainwalk
, rec_ptr
);
574 /* keep looking until we find the right record */
578 if (tdb_rec_read(tdb
, rec_ptr
, r
) == -1)
581 if (TDB_DEAD(r
) && (r
->rec_len
>= length
) &&
582 (r
->rec_len
< best
.rec_len
)) {
583 best_rec_ptr
= rec_ptr
;
584 best_last_ptr
= last_ptr
;
590 ok
= tdb_chainwalk_check(tdb
, &chainwalk
, rec_ptr
);
596 if (best
.rec_len
== UINT32_MAX
) {
601 *p_last_ptr
= best_last_ptr
;
605 static int _tdb_storev(struct tdb_context
*tdb
, TDB_DATA key
,
606 const TDB_DATA
*dbufs
, int num_dbufs
,
607 int flag
, uint32_t hash
)
609 struct tdb_record rec
;
610 tdb_off_t rec_ptr
, ofs
;
611 tdb_len_t rec_len
, dbufs_len
;
617 for (i
=0; i
<num_dbufs
; i
++) {
618 size_t dsize
= dbufs
[i
].dsize
;
620 if ((dsize
!= 0) && (dbufs
[i
].dptr
== NULL
)) {
621 tdb
->ecode
= TDB_ERR_EINVAL
;
626 if (dbufs_len
< dsize
) {
627 tdb
->ecode
= TDB_ERR_OOM
;
632 rec_len
= key
.dsize
+ dbufs_len
;
633 if ((rec_len
< key
.dsize
) || (rec_len
< dbufs_len
)) {
634 tdb
->ecode
= TDB_ERR_OOM
;
638 /* check for it existing, on insert. */
639 if (flag
== TDB_INSERT
) {
640 if (tdb_exists_hash(tdb
, key
, hash
)) {
641 tdb
->ecode
= TDB_ERR_EXISTS
;
645 /* first try in-place update, on modify or replace. */
646 if (tdb_update_hash(tdb
, key
, hash
, dbufs
, num_dbufs
,
650 if (tdb
->ecode
== TDB_ERR_NOEXIST
&&
651 flag
== TDB_MODIFY
) {
652 /* if the record doesn't exist and we are in TDB_MODIFY mode then
653 we should fail the store */
657 /* reset the error code potentially set by the tdb_update_hash() */
658 tdb
->ecode
= TDB_SUCCESS
;
660 /* delete any existing record - if it doesn't exist we don't
661 care. Doing this first reduces fragmentation, and avoids
662 coalescing with `allocated' block before it's updated. */
663 if (flag
!= TDB_INSERT
)
664 tdb_delete_hash(tdb
, key
, hash
);
666 /* we have to allocate some space */
667 rec_ptr
= tdb_allocate(tdb
, hash
, rec_len
, &rec
);
673 /* Read hash top into next ptr */
674 if (tdb_ofs_read(tdb
, TDB_HASH_TOP(hash
), &rec
.next
) == -1)
677 rec
.key_len
= key
.dsize
;
678 rec
.data_len
= dbufs_len
;
679 rec
.full_hash
= hash
;
680 rec
.magic
= TDB_MAGIC
;
684 /* write out and point the top of the hash chain at it */
685 ret
= tdb_rec_write(tdb
, ofs
, &rec
);
691 ret
= tdb
->methods
->tdb_write(tdb
, ofs
, key
.dptr
, key
.dsize
);
697 for (i
=0; i
<num_dbufs
; i
++) {
698 if (dbufs
[i
].dsize
== 0) {
702 ret
= tdb
->methods
->tdb_write(tdb
, ofs
, dbufs
[i
].dptr
,
707 ofs
+= dbufs
[i
].dsize
;
710 ret
= tdb_ofs_write(tdb
, TDB_HASH_TOP(hash
), &rec_ptr
);
712 /* Need to tdb_unallocate() here */
720 tdb_increment_seqnum(tdb
);
725 static int _tdb_store(struct tdb_context
*tdb
, TDB_DATA key
,
726 TDB_DATA dbuf
, int flag
, uint32_t hash
)
728 return _tdb_storev(tdb
, key
, &dbuf
, 1, flag
, hash
);
731 /* store an element in the database, replacing any existing element
734 return 0 on success, -1 on failure
736 _PUBLIC_
int tdb_store(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA dbuf
, int flag
)
741 if (tdb
->read_only
|| tdb
->traverse_read
) {
742 tdb
->ecode
= TDB_ERR_RDONLY
;
743 tdb_trace_2rec_flag_ret(tdb
, "tdb_store", key
, dbuf
, flag
, -1);
747 /* find which hash bucket it is in */
748 hash
= tdb
->hash_fn(&key
);
749 if (tdb_lock(tdb
, BUCKET(hash
), F_WRLCK
) == -1)
752 ret
= _tdb_store(tdb
, key
, dbuf
, flag
, hash
);
753 tdb_trace_2rec_flag_ret(tdb
, "tdb_store", key
, dbuf
, flag
, ret
);
754 tdb_unlock(tdb
, BUCKET(hash
), F_WRLCK
);
758 _PUBLIC_
int tdb_storev(struct tdb_context
*tdb
, TDB_DATA key
,
759 const TDB_DATA
*dbufs
, int num_dbufs
, int flag
)
764 if (tdb
->read_only
|| tdb
->traverse_read
) {
765 tdb
->ecode
= TDB_ERR_RDONLY
;
766 tdb_trace_1plusn_rec_flag_ret(tdb
, "tdb_storev", key
,
767 dbufs
, num_dbufs
, flag
, -1);
771 /* find which hash bucket it is in */
772 hash
= tdb
->hash_fn(&key
);
773 if (tdb_lock(tdb
, BUCKET(hash
), F_WRLCK
) == -1)
776 ret
= _tdb_storev(tdb
, key
, dbufs
, num_dbufs
, flag
, hash
);
777 tdb_trace_1plusn_rec_flag_ret(tdb
, "tdb_storev", key
,
778 dbufs
, num_dbufs
, flag
, -1);
779 tdb_unlock(tdb
, BUCKET(hash
), F_WRLCK
);
783 /* Append to an entry. Create if not exist. */
784 _PUBLIC_
int tdb_append(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA new_dbuf
)
790 /* find which hash bucket it is in */
791 hash
= tdb
->hash_fn(&key
);
792 if (tdb_lock(tdb
, BUCKET(hash
), F_WRLCK
) == -1)
795 dbufs
[0] = _tdb_fetch(tdb
, key
);
798 ret
= _tdb_storev(tdb
, key
, dbufs
, 2, 0, hash
);
799 tdb_trace_2rec_retrec(tdb
, "tdb_append", key
, dbufs
[0], dbufs
[1]);
801 tdb_unlock(tdb
, BUCKET(hash
), F_WRLCK
);
802 SAFE_FREE(dbufs
[0].dptr
);
808 return the name of the current tdb file
809 useful for external logging functions
811 _PUBLIC_
const char *tdb_name(struct tdb_context
*tdb
)
817 return the underlying file descriptor being used by tdb, or -1
818 useful for external routines that want to check the device/inode
821 _PUBLIC_
int tdb_fd(struct tdb_context
*tdb
)
827 return the current logging function
828 useful for external tdb routines that wish to log tdb errors
830 _PUBLIC_ tdb_log_func
tdb_log_fn(struct tdb_context
*tdb
)
832 return tdb
->log
.log_fn
;
837 get the tdb sequence number. Only makes sense if the writers opened
838 with TDB_SEQNUM set. Note that this sequence number will wrap quite
839 quickly, so it should only be used for a 'has something changed'
840 test, not for code that relies on the count of the number of changes
841 made. If you want a counter then use a tdb record.
843 The aim of this sequence number is to allow for a very lightweight
844 test of a possible tdb change.
846 _PUBLIC_
int tdb_get_seqnum(struct tdb_context
*tdb
)
850 if (tdb
->transaction
!= NULL
) {
851 tdb_ofs_read(tdb
, TDB_SEQNUM_OFS
, &seqnum
);
855 #if defined(HAVE___ATOMIC_ADD_FETCH) && defined(HAVE___ATOMIC_ADD_LOAD)
856 if (tdb
->map_ptr
!= NULL
) {
857 uint32_t *pseqnum
= (uint32_t *)(
858 TDB_SEQNUM_OFS
+ (char *)tdb
->map_ptr
);
860 __atomic_load(pseqnum
, &ret
,__ATOMIC_SEQ_CST
);
865 tdb_ofs_read(tdb
, TDB_SEQNUM_OFS
, &seqnum
);
869 _PUBLIC_
int tdb_hash_size(struct tdb_context
*tdb
)
871 return tdb
->hash_size
;
874 _PUBLIC_
size_t tdb_map_size(struct tdb_context
*tdb
)
876 return tdb
->map_size
;
879 _PUBLIC_
int tdb_get_flags(struct tdb_context
*tdb
)
884 _PUBLIC_
void tdb_add_flags(struct tdb_context
*tdb
, unsigned flags
)
886 if ((flags
& TDB_ALLOW_NESTING
) &&
887 (flags
& TDB_DISALLOW_NESTING
)) {
888 tdb
->ecode
= TDB_ERR_NESTING
;
889 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, "tdb_add_flags: "
890 "allow_nesting and disallow_nesting are not allowed together!"));
894 if (flags
& TDB_ALLOW_NESTING
) {
895 tdb
->flags
&= ~TDB_DISALLOW_NESTING
;
897 if (flags
& TDB_DISALLOW_NESTING
) {
898 tdb
->flags
&= ~TDB_ALLOW_NESTING
;
904 _PUBLIC_
void tdb_remove_flags(struct tdb_context
*tdb
, unsigned flags
)
906 if ((flags
& TDB_ALLOW_NESTING
) &&
907 (flags
& TDB_DISALLOW_NESTING
)) {
908 tdb
->ecode
= TDB_ERR_NESTING
;
909 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, "tdb_remove_flags: "
910 "allow_nesting and disallow_nesting are not allowed together!"));
914 if ((flags
& TDB_NOLOCK
) &&
915 (tdb
->feature_flags
& TDB_FEATURE_FLAG_MUTEX
) &&
916 (tdb
->mutexes
== NULL
)) {
917 tdb
->ecode
= TDB_ERR_LOCK
;
918 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, "tdb_remove_flags: "
919 "Can not remove NOLOCK flag on mutexed databases"));
923 if (flags
& TDB_ALLOW_NESTING
) {
924 tdb
->flags
|= TDB_DISALLOW_NESTING
;
926 if (flags
& TDB_DISALLOW_NESTING
) {
927 tdb
->flags
|= TDB_ALLOW_NESTING
;
930 tdb
->flags
&= ~flags
;
935 enable sequence number handling on an open tdb
937 _PUBLIC_
void tdb_enable_seqnum(struct tdb_context
*tdb
)
939 tdb
->flags
|= TDB_SEQNUM
;
944 add a region of the file to the freelist. Length is the size of the region in bytes,
945 which includes the free list header that needs to be added
947 static int tdb_free_region(struct tdb_context
*tdb
, tdb_off_t offset
, ssize_t length
)
949 struct tdb_record rec
;
950 if (length
<= sizeof(rec
)) {
951 /* the region is not worth adding */
954 if (length
+ offset
> tdb
->map_size
) {
955 TDB_LOG((tdb
, TDB_DEBUG_FATAL
,"tdb_free_region: adding region beyond end of file\n"));
958 memset(&rec
,'\0',sizeof(rec
));
959 rec
.rec_len
= length
- sizeof(rec
);
960 if (tdb_free(tdb
, offset
, &rec
) == -1) {
961 TDB_LOG((tdb
, TDB_DEBUG_FATAL
,"tdb_free_region: failed to add free record\n"));
968 wipe the entire database, deleting all records. This can be done
969 very fast by using a allrecord lock. The entire data portion of the
970 file becomes a single entry in the freelist.
972 This code carefully steps around the recovery area, leaving it alone
974 _PUBLIC_
int tdb_wipe_all(struct tdb_context
*tdb
)
977 tdb_off_t offset
= 0;
979 tdb_off_t recovery_head
;
980 tdb_len_t recovery_size
= 0;
982 if (tdb_lockall(tdb
) != 0) {
986 tdb_trace(tdb
, "tdb_wipe_all");
988 /* see if the tdb has a recovery area, and remember its size
989 if so. We don't want to lose this as otherwise each
990 tdb_wipe_all() in a transaction will increase the size of
991 the tdb by the size of the recovery area */
992 if (tdb_ofs_read(tdb
, TDB_RECOVERY_HEAD
, &recovery_head
) == -1) {
993 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, "tdb_wipe_all: failed to read recovery head\n"));
997 if (recovery_head
!= 0) {
998 struct tdb_record rec
;
999 if (tdb
->methods
->tdb_read(tdb
, recovery_head
, &rec
, sizeof(rec
), DOCONV()) == -1) {
1000 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, "tdb_wipe_all: failed to read recovery record\n"));
1003 recovery_size
= rec
.rec_len
+ sizeof(rec
);
1006 /* wipe the hashes */
1007 for (i
=0;i
<tdb
->hash_size
;i
++) {
1008 if (tdb_ofs_write(tdb
, TDB_HASH_TOP(i
), &offset
) == -1) {
1009 TDB_LOG((tdb
, TDB_DEBUG_FATAL
,"tdb_wipe_all: failed to write hash %d\n", i
));
1014 /* wipe the freelist */
1015 if (tdb_ofs_write(tdb
, FREELIST_TOP
, &offset
) == -1) {
1016 TDB_LOG((tdb
, TDB_DEBUG_FATAL
,"tdb_wipe_all: failed to write freelist\n"));
1020 /* add all the rest of the file to the freelist, possibly leaving a gap
1021 for the recovery area */
1022 if (recovery_size
== 0) {
1023 /* the simple case - the whole file can be used as a freelist */
1024 data_len
= (tdb
->map_size
- TDB_DATA_START(tdb
->hash_size
));
1025 if (tdb_free_region(tdb
, TDB_DATA_START(tdb
->hash_size
), data_len
) != 0) {
1029 /* we need to add two freelist entries - one on either
1030 side of the recovery area
1032 Note that we cannot shift the recovery area during
1033 this operation. Only the transaction.c code may
1034 move the recovery area or we risk subtle data
1037 data_len
= (recovery_head
- TDB_DATA_START(tdb
->hash_size
));
1038 if (tdb_free_region(tdb
, TDB_DATA_START(tdb
->hash_size
), data_len
) != 0) {
1041 /* and the 2nd free list entry after the recovery area - if any */
1042 data_len
= tdb
->map_size
- (recovery_head
+recovery_size
);
1043 if (tdb_free_region(tdb
, recovery_head
+recovery_size
, data_len
) != 0) {
1048 tdb_increment_seqnum_nonblock(tdb
);
1050 if (tdb_unlockall(tdb
) != 0) {
1051 TDB_LOG((tdb
, TDB_DEBUG_FATAL
,"tdb_wipe_all: failed to unlock\n"));
/* State shared between tdb_repack() and its traverse callback. */
struct traverse_state {
	bool error;			/* set by the callback when a store fails */
	struct tdb_context *dest_db;	/* database the records are copied into */
};
1068 traverse function for repacking
1070 static int repack_traverse(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *private_data
)
1072 struct traverse_state
*state
= (struct traverse_state
*)private_data
;
1073 if (tdb_store(state
->dest_db
, key
, data
, TDB_INSERT
) != 0) {
1074 state
->error
= true;
1083 _PUBLIC_
int tdb_repack(struct tdb_context
*tdb
)
1085 struct tdb_context
*tmp_db
;
1086 struct traverse_state state
;
1088 tdb_trace(tdb
, "tdb_repack");
1090 if (tdb_transaction_start(tdb
) != 0) {
1091 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to start transaction\n"));
1095 tmp_db
= tdb_open("tmpdb", tdb_hash_size(tdb
), TDB_INTERNAL
, O_RDWR
|O_CREAT
, 0);
1096 if (tmp_db
== NULL
) {
1097 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to create tmp_db\n"));
1098 tdb_transaction_cancel(tdb
);
1102 state
.error
= false;
1103 state
.dest_db
= tmp_db
;
1105 if (tdb_traverse_read(tdb
, repack_traverse
, &state
) == -1) {
1106 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to traverse copying out\n"));
1107 tdb_transaction_cancel(tdb
);
1113 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Error during traversal\n"));
1114 tdb_transaction_cancel(tdb
);
1119 if (tdb_wipe_all(tdb
) != 0) {
1120 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to wipe database\n"));
1121 tdb_transaction_cancel(tdb
);
1126 state
.error
= false;
1127 state
.dest_db
= tdb
;
1129 if (tdb_traverse_read(tmp_db
, repack_traverse
, &state
) == -1) {
1130 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to traverse copying back\n"));
1131 tdb_transaction_cancel(tdb
);
1137 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Error during second traversal\n"));
1138 tdb_transaction_cancel(tdb
);
1145 if (tdb_transaction_commit(tdb
) != 0) {
1146 TDB_LOG((tdb
, TDB_DEBUG_FATAL
, __location__
" Failed to commit\n"));
/*
 * Write exactly "count" bytes from "buf" to "fd".
 *
 * Even on files, we can get partial writes due to signals, so write() is
 * called again for the remainder until everything is out. A write() that
 * was interrupted by a signal before it transferred anything (EINTR) is
 * retried as well instead of being reported as a failure.
 *
 * Returns true once all bytes have been written (trivially for
 * count == 0), false if write() fails with any other error.
 */
bool tdb_write_all(int fd, const void *buf, size_t count)
{
	const char *pos = buf;

	while (count > 0) {
		ssize_t written = write(fd, pos, count);

		if (written < 0) {
			if (errno == EINTR) {
				continue;
			}
			return false;
		}
		pos += written;
		count -= (size_t)written;
	}
	return true;
}
1167 bool tdb_add_off_t(tdb_off_t a
, tdb_off_t b
, tdb_off_t
*pret
)
1169 tdb_off_t ret
= a
+ b
;
1171 if ((ret
< a
) || (ret
< b
)) {
1179 static void tdb_trace_write(struct tdb_context
*tdb
, const char *str
)
1181 if (!tdb_write_all(tdb
->tracefd
, str
, strlen(str
))) {
1182 close(tdb
->tracefd
);
1187 static void tdb_trace_start(struct tdb_context
*tdb
)
1190 char msg
[sizeof(tdb_off_t
) * 4 + 1];
1192 tdb_ofs_read(tdb
, TDB_SEQNUM_OFS
, &seqnum
);
1193 snprintf(msg
, sizeof(msg
), "%u ", seqnum
);
1194 tdb_trace_write(tdb
, msg
);
/* Finish a trace line by writing the terminating newline. */
static void tdb_trace_end(struct tdb_context *tdb)
{
	static const char newline[] = "\n";

	tdb_trace_write(tdb, newline);
}
/* Finish a trace line with " = <ret>" and the terminating newline. */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
	char suffix[sizeof(ret) * 4 + 4];

	snprintf(suffix, sizeof(suffix), " = %i\n", ret);
	tdb_trace_write(tdb, suffix);
}
1209 static void tdb_trace_record(struct tdb_context
*tdb
, TDB_DATA rec
)
1211 char msg
[21 + rec
.dsize
*2], *p
;
1214 /* We differentiate zero-length records from non-existent ones. */
1215 if (rec
.dptr
== NULL
) {
1216 tdb_trace_write(tdb
, " NULL");
1220 /* snprintf here is purely cargo-cult programming. */
1222 p
+= snprintf(p
, sizeof(msg
), " %zu:", rec
.dsize
);
1224 for (i
= 0; i
< rec
.dsize
; i
++) {
1225 snprintf(p
, 3, "%02x", rec
.dptr
[i
]);
1229 tdb_trace_write(tdb
, msg
);
/* Trace an operation that has neither arguments nor a result: "<seqnum> <op>". */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
	tdb_trace_start(tdb);		/* sequence number prefix */
	tdb_trace_write(tdb, op);	/* operation name */
	tdb_trace_end(tdb);		/* newline */
}
1239 void tdb_trace_seqnum(struct tdb_context
*tdb
, uint32_t seqnum
, const char *op
)
1241 char msg
[sizeof(tdb_off_t
) * 4 + 1];
1243 snprintf(msg
, sizeof(msg
), "%u ", seqnum
);
1244 tdb_trace_write(tdb
, msg
);
1245 tdb_trace_write(tdb
, op
);
/*
 * Trace an open call: "<seqnum> <op> <hash_size> 0x<tdb_flags>
 * 0x<open_flags>". The text is formatted into a fixed-size buffer, so an
 * overlong "op" would be truncated by snprintf.
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
	char line[128];

	snprintf(line, sizeof(line),
		 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);

	tdb_trace_start(tdb);
	tdb_trace_write(tdb, line);
	tdb_trace_end(tdb);
}
/* Trace an operation without arguments and its integer result: "<seqnum> <op> = <ret>". */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
	tdb_trace_start(tdb);		/* sequence number prefix */
	tdb_trace_write(tdb, op);	/* operation name */
	tdb_trace_end_ret(tdb, ret);	/* " = ret" and newline */
}
1268 void tdb_trace_retrec(struct tdb_context
*tdb
, const char *op
, TDB_DATA ret
)
1270 tdb_trace_start(tdb
);
1271 tdb_trace_write(tdb
, op
);
1272 tdb_trace_write(tdb
, " =");
1273 tdb_trace_record(tdb
, ret
);
1277 void tdb_trace_1rec(struct tdb_context
*tdb
, const char *op
,
1280 tdb_trace_start(tdb
);
1281 tdb_trace_write(tdb
, op
);
1282 tdb_trace_record(tdb
, rec
);
1286 void tdb_trace_1rec_ret(struct tdb_context
*tdb
, const char *op
,
1287 TDB_DATA rec
, int ret
)
1289 tdb_trace_start(tdb
);
1290 tdb_trace_write(tdb
, op
);
1291 tdb_trace_record(tdb
, rec
);
1292 tdb_trace_end_ret(tdb
, ret
);
1295 void tdb_trace_1rec_retrec(struct tdb_context
*tdb
, const char *op
,
1296 TDB_DATA rec
, TDB_DATA ret
)
1298 tdb_trace_start(tdb
);
1299 tdb_trace_write(tdb
, op
);
1300 tdb_trace_record(tdb
, rec
);
1301 tdb_trace_write(tdb
, " =");
1302 tdb_trace_record(tdb
, ret
);
1306 void tdb_trace_2rec_flag_ret(struct tdb_context
*tdb
, const char *op
,
1307 TDB_DATA rec1
, TDB_DATA rec2
, unsigned flag
,
1310 char msg
[1 + sizeof(ret
) * 4];
1312 snprintf(msg
, sizeof(msg
), " %#x", flag
);
1313 tdb_trace_start(tdb
);
1314 tdb_trace_write(tdb
, op
);
1315 tdb_trace_record(tdb
, rec1
);
1316 tdb_trace_record(tdb
, rec2
);
1317 tdb_trace_write(tdb
, msg
);
1318 tdb_trace_end_ret(tdb
, ret
);
1321 void tdb_trace_1plusn_rec_flag_ret(struct tdb_context
*tdb
, const char *op
,
1323 const TDB_DATA
*recs
, int num_recs
,
1324 unsigned flag
, int ret
)
1326 char msg
[1 + sizeof(ret
) * 4];
1329 snprintf(msg
, sizeof(msg
), " %#x", flag
);
1330 tdb_trace_start(tdb
);
1331 tdb_trace_write(tdb
, op
);
1332 tdb_trace_record(tdb
, rec
);
1333 for (i
=0; i
<num_recs
; i
++) {
1334 tdb_trace_record(tdb
, recs
[i
]);
1336 tdb_trace_write(tdb
, msg
);
1337 tdb_trace_end_ret(tdb
, ret
);
1340 void tdb_trace_2rec_retrec(struct tdb_context
*tdb
, const char *op
,
1341 TDB_DATA rec1
, TDB_DATA rec2
, TDB_DATA ret
)
1343 tdb_trace_start(tdb
);
1344 tdb_trace_write(tdb
, op
);
1345 tdb_trace_record(tdb
, rec1
);
1346 tdb_trace_record(tdb
, rec2
);
1347 tdb_trace_write(tdb
, " =");
1348 tdb_trace_record(tdb
, ret
);