2 * Copyright (c) 2003 Dustin Sallings <dustin@spy.net>
3 * Copyright (C) The #Seppo contributors. All rights reserved.
6 * https://github.com/dustin/snippets/blob/master/ocaml/lib/cdb.ml
10 * CDB Implementation. http://cr.yp.to/cdb/cdb.txt
13 (* The cdb hash function is ``h = ((h << 5) + h) ^ c'', with a starting
18 table_count
: int array
;
19 (* Hash index pointers *)
20 mutable pointers
: (int32
* int32
) list
;
24 let hash_init : int64
= 5381L
25 let ff64 : int64
= 0xffL
26 let ffffffff64 : int64
= 0xFFffFFffL
27 let ff32 : int32
= Int32.of_int
0xff
29 let hash (s
: bytes
) : int32
=
30 let h = ref hash_init in
34 Int64.logand
ffffffff64
36 (Int64.add
(Int64.shift_left
!h 5) !h)
37 (Int64.of_int
(int_of_char c
))))
41 let wri4b_le oc
(byt
: int -> int) =
42 let wri idx
= byt idx
|> output_byte oc
in
48 let write_le cdc
(i32
: int) =
49 wri4b_le cdc
.out
(fun byt
-> (i32
lsr (byt
* 8)) land 0xff)
51 let write_le32 cdc
(i32
: int32
) =
52 wri4b_le cdc
.out
(fun byt
->
53 Int32.to_int
(Int32.logand
(Int32.shift_right_logical i32
(byt
* 8)) ff32))
55 let cdb_creator_of_out_channel out_channel
: cdb_creator
=
57 { table_count
= Array.make
256 0; pointers
= []; out
= out_channel
}
59 (* Skip over the header *)
60 seek_out
cdb.out
2048;
63 let open_out (fn
: string) : cdb_creator
=
64 fn
|> open_out_bin
|> cdb_creator_of_out_channel
66 let hash_to_table h = Int32.to_int
(Int32.logand
h ff32)
68 let hash_to_bucket h len
=
69 Int32.rem
(Int32.shift_right_logical
h 8) (Int32.of_int len
) |> Int32.to_int
71 let pos_out_32 x
= x
|> LargeFile.pos_out
|> Int64.to_int32
74 (* Add the hash to the list *)
76 cdc
.pointers
<- (h, pos_out_32 cdc
.out
) :: cdc
.pointers
;
77 let table = hash_to_table h in
78 cdc
.table_count
.(table) <- cdc
.table_count
.(table) + 1;
79 (* Add the data to the file *)
80 write_le cdc
(Bytes.length k
);
81 write_le cdc
(Bytes.length v
);
82 output_bytes cdc
.out k
;
83 output_bytes cdc
.out v
85 (** Process a hash table *)
86 let process_table cdc table_start slot_table slot_pointers i tc
=
87 (* Length of the table *)
89 (* Store the table position *)
90 slot_table
:= (pos_out_32 cdc
.out
, Int32.of_int
len) :: !slot_table
;
91 (* Build the hash table *)
92 let ht = Array.make
len None
in
93 let cur_p = ref table_start
.(i
) in
95 try Hashtbl.find slot_pointers n
with Not_found
-> (Int32.zero
, Int32.zero
)
97 for _
= 0 to pred tc
do
98 let hp = lookupSlot !cur_p in
101 (* Find an available hash bucket *)
102 let rec find_slot where
=
103 match ht.(where
) with
105 | Some _
-> if where
+ 1 = len then find_slot 0 else find_slot (where
+ 1)
107 let where = find_slot (hash_to_bucket (fst
hp) len) in
108 ht.(where) <- Some
hp
110 (* Write this hash table *)
114 match hpp
with None
-> (Int32.zero
, Int32.zero
) | Some x
-> x
120 let close_cdb_out cdc
=
121 let cur_entry = ref 0 in
122 let table_start = Array.make
256 0 in
123 (* Find all the hash starts *)
126 cur_entry := !cur_entry + x
;
127 table_start.(i
) <- !cur_entry)
129 (* Build out the slot pointers hash *)
130 let slot_pointers = Hashtbl.create
(List.length cdc
.pointers
) in
131 (* Fill in the slot pointers *)
135 let table = hash_to_table h in
136 table_start.(table) <- pred
table_start.(table);
137 Hashtbl.replace
slot_pointers table_start.(table) hp)
139 (* Write the shit out *)
140 let slot_table = ref [] in
141 (* Write out the hash tables *)
143 (process_table cdc
table_start slot_table slot_pointers)
145 (* write out the pointer sets *)
149 write_le32 cdc
(fst x
);
150 write_le32 cdc
(snd x
))
151 (List.rev
!slot_table);
154 (** {1 Iterating a cdb file} *)
156 (* read a little-endian integer *)
158 let a = input_byte f
in
159 let b = input_byte f
in
160 let c = input_byte f
in
161 let d = input_byte f
in
162 a lor (b lsl 8) lor (c lsl 16) lor (d lsl 24)
164 (* Int32 version of read_le *)
166 let a = input_byte f
in
167 let b = input_byte f
in
168 let c = input_byte f
in
169 let d = input_byte f
in
171 (Int32.of_int
(a lor (b lsl 8) lor (c lsl 16)))
172 (Int32.shift_left
(Int32.of_int
d) 24)
174 let iter (f
: bytes
* bytes
-> bool) (fn
: string) : unit =
175 let fin = open_in_bin fn
in
177 (* Figure out where the end of all data is *)
178 let eod = read_le32 fin in
179 (* Seek to the record section *)
182 (* (pos_in fin) < eod *)
183 if Int32.compare
(Int64.to_int32
(LargeFile.pos_in
fin)) eod < 0 then (
184 let klen = read_le fin in
185 let dlen = read_le fin in
186 let key = Bytes.create
klen in
187 let data = Bytes.create
dlen in
188 really_input
fin key 0 klen;
189 really_input
fin data 0 dlen;
190 if f
(key, data) then loop ())
200 (* Position * length *)
201 tables
: (int32
* int) array
;
204 let open_cdb_in (fn
: string) : cdb_file
=
205 let fin = open_in_bin fn
in
206 let tables = Array.make
256 (Int32.zero
, 0) in
207 (* Set the positions and lengths *)
210 let pos = read_le32 fin in
211 let len = read_le fin in
212 tables.(i
) <- (pos, len))
216 let close_cdb_in cdf
= close_in cdf
.f
218 let find_all (cdf
: cdb_file
) (key : bytes
) : bytes
Stream.t
=
219 let kh = key |> hash in
220 (* Find out where the hash table is *)
221 let hpos, hlen
= cdf
.tables.(hash_to_table kh) and fd
= cdf
.f
in
223 if x
>= hlen
then None
225 (* Calculate the slot containing these entries *)
226 let lslot = (hash_to_bucket kh hlen
+ x
) mod hlen
in
227 let spos = Int32.add (Int32.of_int
(lslot * 8)) hpos in
228 LargeFile.seek_in fd
(Int64.of_int32
spos);
229 let h = read_le32 fd
in
230 let pos = read_le32 fd
in
231 (* validate that we a real bucket *)
232 if h = kh && Int32.compare
pos Int32.zero
> 0 then (
233 LargeFile.seek_in fd
(Int64.of_int32
pos);
234 let klen = read_le fd
in
235 if klen = Bytes.length
key then (
236 let dlen = read_le fd
in
237 let rkey = Bytes.create
klen in
238 really_input fd
rkey 0 klen;
240 let rdata = Bytes.create
dlen in
241 really_input fd
rdata 0 dlen;
249 let find_first cdf
key =
250 try Some
(key |> find_all cdf
|> Stream.next
) with Stream.Failure
-> None