1 (* Functions for operating on word lists *)
9 words
: (string, float) Hashtbl.t
;
12 (** Read headers from an input channel. *)
13 let read_headers channel
=
14 let rec aux channel headers
=
15 let line = input_line channel
in
16 if (String.length
line) = 0 then headers
18 (match (bounded_split
(regexp_string
": ") line 2) with
19 | [key
; value] -> Hashtbl.add headers key
value
20 | _
-> failwith
("Invalid line in input: " ^
line));
24 aux channel
(Hashtbl.create
4)
26 (** Read words and scores from an input channel. *)
27 let read_words_version_1 channel
=
28 let rec aux channel words
=
31 let line = input_line channel
in
32 (match (bounded_split
(regexp_string
"\t") line 2) with
33 | [word; score
] -> (try
34 Hashtbl.replace words
word (float_of_string score
)
36 prerr_endline
("Cannot convert " ^ score ^
" to float"))
37 | _
-> failwith
("Invalid line in input: " ^
line));
39 with End_of_file
-> true) in
40 if eof then words
else (aux channel words
)
42 aux channel
(Hashtbl.create
0x1000)
44 (** Load a version 1 wordlist from a channel,
45 after headers have been read.
47 let load_version_1_wordlist_from_channel channel headers
=
48 { messages
= (match hash_get_maybe headers
"Messages" with
50 | Some n
-> (int_of_string n
));
51 words
= read_words_version_1 channel
}
53 (** Load a wordlist from an input channel. *)
54 let load_wordlist_from_channel channel
=
55 let headers = read_headers channel
in
56 (* Check the word list file format version. *)
57 (match hash_get_maybe
headers "Version" with
58 | None
-> (* assume version 1 *)
59 load_version_1_wordlist_from_channel channel
headers
60 | Some
"1" -> load_version_1_wordlist_from_channel channel
headers
61 | Some version
-> failwith
("Cannot handle word lists with version " ^ version
))
63 (** Output a line to an output channel. *)
64 let output_line channel
line =
65 output_string channel
line;
66 output_char channel '
\n'
68 (** Convert a float to a string in Mailvisa syntax. *)
69 let float_to_string x
=
70 let str = (string_of_float x
) in
71 if (String.get
str ((String.length
str) - 1)) = '
.'
then str ^
"0" else str
73 (** Save a version 1 wordlist to an output channel. *)
74 let save_version_1_wordlist_to_channel wordlist channel
=
76 output_line channel
"Content-Type: text/x-mailvisa-wordlist";
77 output_line channel
"Version: 1";
78 output_string channel
"Messages: ";
79 output_line channel
(string_of_int wordlist
.messages
);
80 output_line channel
"";
82 Hashtbl.iter
(fun word score
->
83 output_string channel
word;
84 output_char channel '
\t'
;
85 output_line channel
(float_to_string score
))
88 (** Save a wordlist to an output channel. *)
89 let save_wordlist_to_channel wordlist channel
=
90 save_version_1_wordlist_to_channel wordlist channel
92 (** Get the incidence of a word *)
93 let get_incidence wordlist
word =
94 match hash_get_maybe wordlist
.words
word with
96 | Some n
-> n
/. (float_of_int wordlist
.messages
)
98 (** Get the score of a word, using the given value as a default. *)
99 let get_score_with_default wordlist
word default
=
100 match hash_get_maybe wordlist
.words
word with
102 | Some
value -> value
104 (** Get the score of a word, returning a default value if the score is unknown. *)
105 let get_score wordlist
word = get_score_with_default wordlist
word 0.4
107 (** Load a wordlist from a file. *)
108 let load_wordlist filename
=
109 load_wordlist_from_channel (open_in filename
)
111 (** Save a wordlist to a file. *)
112 let save_wordlist wordlist filename
=
113 save_wordlist_to_channel wordlist
(open_out filename
)
115 (** Set the score for a word in the wordlist (the word is added if it is not present yet). *)
116 let set_score wordlist
word score
=
117 Hashtbl.replace wordlist
.words
word score
119 let create_wordlist messages words
=
121 messages
= messages
}
122 let wordlist_messages wordlist
= wordlist
.messages
123 let wordlist_words wordlist
= wordlist
.words