Implemented new tokenization algorithm.
[mailvisa2.git] / src / wordlist.ml
blob8cae53d55352066789e1aa27018e716c6624a11a
1 (* Functions for operating on word lists *)
3 open Str
5 open Common
7 type wordlist = {
8 messages : int;
9 words : (string, float) Hashtbl.t;
12 (** Read headers from an input channel. *)
13 let read_headers channel =
14 let rec aux channel headers =
15 let line = input_line channel in
16 if (String.length line) = 0 then headers
17 else begin
18 (match (bounded_split (regexp_string ": ") line 2) with
19 | [key; value] -> Hashtbl.add headers key value
20 | _ -> failwith ("Invalid line in input: " ^ line));
21 aux channel headers
22 end
24 aux channel (Hashtbl.create 4)
26 (** Read words and scores from an input channel. *)
27 let read_words_version_1 channel =
28 let rec aux channel words =
29 let eof =
30 (try
31 let line = input_line channel in
32 (match (bounded_split (regexp_string "\t") line 2) with
33 | [word; score] -> (try
34 Hashtbl.replace words word (float_of_string score)
35 with Failure _ ->
36 prerr_endline ("Cannot convert " ^ score ^ " to float"))
37 | _ -> failwith ("Invalid line in input: " ^ line));
38 false
39 with End_of_file -> true) in
40 if eof then words else (aux channel words)
42 aux channel (Hashtbl.create 0x1000)
44 (** Load a version 1 wordlist from a channel,
45 after headers have been read.
47 let load_version_1_wordlist_from_channel channel headers =
48 { messages = (match hash_get_maybe headers "Messages" with
49 | None -> 0
50 | Some n -> (int_of_string n));
51 words = read_words_version_1 channel }
53 (** Load a wordlist from an input channel. *)
54 let load_wordlist_from_channel channel =
55 let headers = read_headers channel in
56 (* Check the word list file format version. *)
57 (match hash_get_maybe headers "Version" with
58 | None -> (* assume version 1 *)
59 load_version_1_wordlist_from_channel channel headers
60 | Some "1" -> load_version_1_wordlist_from_channel channel headers
61 | Some version -> failwith ("Cannot handle word lists with version " ^ version))
63 (** Output a line to an output channel. *)
64 let output_line channel line =
65 output_string channel line;
66 output_char channel '\n'
68 (** Convert a float to a string in Mailvisa syntax. *)
69 let float_to_string x =
70 let str = (string_of_float x) in
71 if (String.get str ((String.length str) - 1)) = '.' then str ^ "0" else str
73 (** Save a version 1 wordlist to an output channel. *)
74 let save_version_1_wordlist_to_channel wordlist channel =
75 (* Write headers. *)
76 output_line channel "Content-Type: text/x-mailvisa-wordlist";
77 output_line channel "Version: 1";
78 output_string channel "Messages: ";
79 output_line channel (string_of_int wordlist.messages);
80 output_line channel "";
81 (* Write words. *)
82 Hashtbl.iter (fun word score ->
83 output_string channel word;
84 output_char channel '\t';
85 output_line channel (float_to_string score))
86 wordlist.words
88 (** Save a wordlist to an output channel. *)
89 let save_wordlist_to_channel wordlist channel =
90 save_version_1_wordlist_to_channel wordlist channel
92 (** Get the incidence of a word *)
93 let get_incidence wordlist word =
94 match hash_get_maybe wordlist.words word with
95 | None -> 0.0
96 | Some n -> n /. (float_of_int wordlist.messages)
98 (** Get the score of a word, using the given value as a default. *)
99 let get_score_with_default wordlist word default =
100 match hash_get_maybe wordlist.words word with
101 | None -> default
102 | Some value -> value
104 (** Get the score of a word, returning a default value if the score is unknown. *)
105 let get_score wordlist word = get_score_with_default wordlist word 0.4
107 (** Load a wordlist from a file. *)
108 let load_wordlist filename =
109 load_wordlist_from_channel (open_in filename)
111 (** Save a wordlist to a file. *)
112 let save_wordlist wordlist filename =
113 save_wordlist_to_channel wordlist (open_out filename)
115 (** Set the score for a word in the wordlist (the word is added if it is not present yet). *)
116 let set_score wordlist word score =
117 Hashtbl.replace wordlist.words word score
119 let create_wordlist messages words =
120 { words = words;
121 messages = messages }
122 let wordlist_messages wordlist = wordlist.messages
123 let wordlist_words wordlist = wordlist.words