readme
[Seppo.git] / lib / ds_cdb.ml
bloba9873a3155f23177ca78b2b28e4c61df95e0cdf4
1 (*
2 * Copyright (c) 2003 Dustin Sallings <dustin@spy.net>
3 * Copyright (C) The #Seppo contributors. All rights reserved.
4 *
5 * based on
6 * https://github.com/dustin/snippets/blob/master/ocaml/lib/cdb.ml
7 *)
9 (**
10 * CDB Implementation. http://cr.yp.to/cdb/cdb.txt
13 (* The cdb hash function is ``h = ((h << 5) + h) ^ c'', with a starting
14 hash of 5381.
17 type cdb_creator = {
18 table_count : int array;
19 (* Hash index pointers *)
20 mutable pointers : (int32 * int32) list;
21 out : out_channel;
24 let hash_init : int64 = 5381L
25 let ff64 : int64 = 0xffL
26 let ffffffff64 : int64 = 0xFFffFFffL
27 let ff32 : int32 = Int32.of_int 0xff
29 let hash (s : bytes) : int32 =
30 let h = ref hash_init in
31 Bytes.iter
32 (fun c ->
33 h :=
34 Int64.logand ffffffff64
35 (Int64.logxor
36 (Int64.add (Int64.shift_left !h 5) !h)
37 (Int64.of_int (int_of_char c))))
39 Int64.to_int32 !h
41 let wri4b_le oc (byt : int -> int) =
42 let wri idx = byt idx |> output_byte oc in
43 wri 0;
44 wri 1;
45 wri 2;
46 wri 3
48 let write_le cdc (i32 : int) =
49 wri4b_le cdc.out (fun byt -> (i32 lsr (byt * 8)) land 0xff)
51 let write_le32 cdc (i32 : int32) =
52 wri4b_le cdc.out (fun byt ->
53 Int32.to_int (Int32.logand (Int32.shift_right_logical i32 (byt * 8)) ff32))
55 let cdb_creator_of_out_channel out_channel : cdb_creator =
56 let cdb =
57 { table_count = Array.make 256 0; pointers = []; out = out_channel }
59 (* Skip over the header *)
60 seek_out cdb.out 2048;
61 cdb
63 let open_out (fn : string) : cdb_creator =
64 fn |> open_out_bin |> cdb_creator_of_out_channel
66 let hash_to_table h = Int32.to_int (Int32.logand h ff32)
68 let hash_to_bucket h len =
69 Int32.rem (Int32.shift_right_logical h 8) (Int32.of_int len) |> Int32.to_int
71 let pos_out_32 x = x |> LargeFile.pos_out |> Int64.to_int32
73 let add cdc k v =
74 (* Add the hash to the list *)
75 let h = hash k in
76 cdc.pointers <- (h, pos_out_32 cdc.out) :: cdc.pointers;
77 let table = hash_to_table h in
78 cdc.table_count.(table) <- cdc.table_count.(table) + 1;
79 (* Add the data to the file *)
80 write_le cdc (Bytes.length k);
81 write_le cdc (Bytes.length v);
82 output_bytes cdc.out k;
83 output_bytes cdc.out v
85 (** Process a hash table *)
86 let process_table cdc table_start slot_table slot_pointers i tc =
87 (* Length of the table *)
88 let len = tc * 2 in
89 (* Store the table position *)
90 slot_table := (pos_out_32 cdc.out, Int32.of_int len) :: !slot_table;
91 (* Build the hash table *)
92 let ht = Array.make len None in
93 let cur_p = ref table_start.(i) in
94 let lookupSlot n =
95 try Hashtbl.find slot_pointers n with Not_found -> (Int32.zero, Int32.zero)
97 for _ = 0 to pred tc do
98 let hp = lookupSlot !cur_p in
99 cur_p := !cur_p + 1;
101 (* Find an available hash bucket *)
102 let rec find_slot where =
103 match ht.(where) with
104 | None -> where
105 | Some _ -> if where + 1 = len then find_slot 0 else find_slot (where + 1)
107 let where = find_slot (hash_to_bucket (fst hp) len) in
108 ht.(where) <- Some hp
109 done;
110 (* Write this hash table *)
111 Array.iter
112 (fun hpp ->
113 let h, t =
114 match hpp with None -> (Int32.zero, Int32.zero) | Some x -> x
116 write_le32 cdc h;
117 write_le32 cdc t)
120 let close_cdb_out cdc =
121 let cur_entry = ref 0 in
122 let table_start = Array.make 256 0 in
123 (* Find all the hash starts *)
124 Array.iteri
125 (fun i x ->
126 cur_entry := !cur_entry + x;
127 table_start.(i) <- !cur_entry)
128 cdc.table_count;
129 (* Build out the slot pointers hash *)
130 let slot_pointers = Hashtbl.create (List.length cdc.pointers) in
131 (* Fill in the slot pointers *)
132 List.iter
133 (fun hp ->
134 let h = fst hp in
135 let table = hash_to_table h in
136 table_start.(table) <- pred table_start.(table);
137 Hashtbl.replace slot_pointers table_start.(table) hp)
138 cdc.pointers;
139 (* Write the shit out *)
140 let slot_table = ref [] in
141 (* Write out the hash tables *)
142 Array.iteri
143 (process_table cdc table_start slot_table slot_pointers)
144 cdc.table_count;
145 (* write out the pointer sets *)
146 seek_out cdc.out 0;
147 List.iter
148 (fun x ->
149 write_le32 cdc (fst x);
150 write_le32 cdc (snd x))
151 (List.rev !slot_table);
152 close_out cdc.out
154 (** {1 Iterating a cdb file} *)
156 (* read a little-endian integer *)
157 let read_le f =
158 let a = input_byte f in
159 let b = input_byte f in
160 let c = input_byte f in
161 let d = input_byte f in
162 a lor (b lsl 8) lor (c lsl 16) lor (d lsl 24)
164 (* Int32 version of read_le *)
165 let read_le32 f =
166 let a = input_byte f in
167 let b = input_byte f in
168 let c = input_byte f in
169 let d = input_byte f in
170 Int32.logor
171 (Int32.of_int (a lor (b lsl 8) lor (c lsl 16)))
172 (Int32.shift_left (Int32.of_int d) 24)
174 let iter (f : bytes * bytes -> bool) (fn : string) : unit =
175 let fin = open_in_bin fn in
177 (* Figure out where the end of all data is *)
178 let eod = read_le32 fin in
179 (* Seek to the record section *)
180 seek_in fin 2048;
181 let rec loop () =
182 (* (pos_in fin) < eod *)
183 if Int32.compare (Int64.to_int32 (LargeFile.pos_in fin)) eod < 0 then (
184 let klen = read_le fin in
185 let dlen = read_le fin in
186 let key = Bytes.create klen in
187 let data = Bytes.create dlen in
188 really_input fin key 0 klen;
189 really_input fin data 0 dlen;
190 if f (key, data) then loop ())
192 loop ();
193 close_in fin
194 with x ->
195 close_in fin;
196 raise x
198 type cdb_file = {
199 f : in_channel;
200 (* Position * length *)
201 tables : (int32 * int) array;
204 let open_cdb_in (fn : string) : cdb_file =
205 let fin = open_in_bin fn in
206 let tables = Array.make 256 (Int32.zero, 0) in
207 (* Set the positions and lengths *)
208 Array.iteri
209 (fun i _ ->
210 let pos = read_le32 fin in
211 let len = read_le fin in
212 tables.(i) <- (pos, len))
213 tables;
214 { f = fin; tables }
216 let close_cdb_in cdf = close_in cdf.f
218 let find_all (cdf : cdb_file) (key : bytes) : bytes Stream.t =
219 let kh = key |> hash in
220 (* Find out where the hash table is *)
221 let hpos, hlen = cdf.tables.(hash_to_table kh) and fd = cdf.f in
222 let rec loop x =
223 if x >= hlen then None
224 else
225 (* Calculate the slot containing these entries *)
226 let lslot = (hash_to_bucket kh hlen + x) mod hlen in
227 let spos = Int32.add (Int32.of_int (lslot * 8)) hpos in
228 LargeFile.seek_in fd (Int64.of_int32 spos);
229 let h = read_le32 fd in
230 let pos = read_le32 fd in
231 (* validate that we a real bucket *)
232 if h = kh && Int32.compare pos Int32.zero > 0 then (
233 LargeFile.seek_in fd (Int64.of_int32 pos);
234 let klen = read_le fd in
235 if klen = Bytes.length key then (
236 let dlen = read_le fd in
237 let rkey = Bytes.create klen in
238 really_input fd rkey 0 klen;
239 if rkey = key then (
240 let rdata = Bytes.create dlen in
241 really_input fd rdata 0 dlen;
242 Some rdata)
243 else loop (x + 1))
244 else loop (x + 1))
245 else loop (x + 1)
247 Stream.from loop
249 let find_first cdf key =
250 try Some (key |> find_all cdf |> Stream.next) with Stream.Failure -> None