1 (* Calculate words' spam probability based on frequency in
(** Default configuration directory: [$HOME/settings/mailvisa2].
    Evaluated once, when the module is initialised.
    @raise Not_found if the [HOME] environment variable is not set. *)
let default_confdir = Sys.getenv "HOME" ^ "/settings/mailvisa2"

(** Default name of the file the scores are written to (option [-f]). *)
let default_scorefile = "scores"

(** Default name of the file good words are loaded from (option [-g]). *)
let default_goodfile = "good"

(** Default name of the file bad words are loaded from (option [-b]). *)
let default_badfile = "bad"

(** Default factor applied to the number of good occurrences (option [-m]). *)
let default_good_multiplier = 1.0
(** One-line usage synopsis for the command. *)
let usage = "USAGE: mailvisa-calculate-scores [options]"

(** Description of the supported command line options, one per line.
    Line breaks are written as explicit [\n] escapes followed by a line
    continuation, so the source layout does not leak into the text. *)
let help =
  "Valid options are:\n\
   -c <path> Look for files in <path> (default: $HOME/settings/mailvisa2)\n\
   -g <file> Load good words from <file> (default: \"good\")\n\
   -b <file> Load bad words from <file> (default: \"bad\")\n\
   -f <file> Write scores to <file> (default: \"scores\")\n\
   -m <num> Multiply number of good occurrences by <num> (default: 1.0)"
26 (** Parse command options from args into hash table.
    Recognised options are stored in the options table under the keys
    badfile, confdir, scorefile, goodfile and good_multiplier.
    NOTE(review): parts of this definition (the initialisation of i, the
    match header, the end of the loop and the result) are not visible in
    this excerpt. *)
27 let parse_options args
=
(* Table of option name to string value; 8 is only the initial size hint. *)
28 let options = Hashtbl.create
8 in
(* add_flag stores the fixed string true under name and does not touch i,
   so no further argument is consumed. *)
30 let add_flag name
= Hashtbl.add
options name
"true" in
(* add_option first calls increment on i and then stores args.(!i) under
   name, i.e. it presumably consumes the argument following the option.
   NOTE(review): no bounds check is visible; if the option is the last
   argument, args.(!i) would raise Invalid_argument - confirm. *)
31 let add_option name
= increment
i; Hashtbl.add
options name args
.(!i) in
(* Walk the argument array while the index i stays within bounds. *)
32 while !i < (Array.length args
) do
38 | "-b" -> add_option "badfile"
39 | "-c" -> add_option "confdir"
(* NOTE(review): the help text describes -f as taking a <file> argument and
   scorefile is later read as a file name, yet this arm uses add_flag, which
   stores the string true and leaves the following argument unconsumed.
   This should most likely be add_option, like the -b, -c, -g and -m arms. *)
40 | "-f" -> add_flag "scorefile"
41 | "-g" -> add_option "goodfile"
42 | "-m" -> add_option "good_multiplier"
(* Report an unrecognised option on standard error. *)
44 output_string
Pervasives.stderr
("Invalid option: " ^
option ^
"\n");
(* Options table built once from the process command line. *)
let options = parse_options Sys.argv

(* Configuration directory: looked up under the "confdir" key, with
   default_confdir passed as the fallback (presumably used when the option
   is absent - confirm against get_option_with_default). *)
let confdir = get_option_with_default options "confdir" default_confdir
(** [absolute_path path] resolves a file name against the configuration
    directory.

    If [path] contains a slash anywhere it is returned unchanged (so a
    relative path such as [a/b] is {e not} made absolute); otherwise the
    result is [confdir ^ "/" ^ path]. *)
let absolute_path path =
  if String.contains path '/' then path
  else confdir ^ "/" ^ path
(* Set constants from command line options.
   Each file name is looked up in the options table with its default and
   then passed through absolute_path. *)
let badfile =
  absolute_path (hash_get options "badfile" ~default:default_badfile)

let scorefile =
  absolute_path (hash_get options "scorefile" ~default:default_scorefile)

let goodfile =
  absolute_path (hash_get options "goodfile" ~default:default_goodfile)

(* Factor applied to good-word incidences when scoring; read through
   hash_get_float with default_good_multiplier as the fallback. *)
let good_multiplier =
  hash_get_float options "good_multiplier" ~default:default_good_multiplier
68 (** Calculate the score of a word.
    The raw score is b /. (b +. g), where b is the incidence of the word in
    bad and g is its incidence in good scaled by good_multiplier.
    NOTE(review): the branches that post-process the raw score are only
    partly visible in this excerpt. *)
69 let calculate_word_score word good bad
=
(* b: incidence of word in the bad list, as reported by get_incidence. *)
70 let b = get_incidence bad
word in
(* g: incidence of word in the good list, weighted by good_multiplier. *)
71 let g = good_multiplier *. (get_incidence good
word) in
(* Share of the bad incidence in the combined incidence.
   NOTE(review): if b and g are both 0.0 this float division yields nan;
   confirm get_incidence cannot return 0 for both lists. *)
72 let score = b /. (b +. g) in
(* Upper threshold check on the raw score; the branch body is not visible. *)
75 else if score > 0.9 then
(* Load both word lists, reporting progress on standard error.
   NOTE(review): the header of the enclosing definition is not visible in
   this excerpt. *)
(* Announce the good list, without a newline so the count follows on the
   same line. *)
81 prerr_string
("Loading good words from " ^
goodfile ^
"...");
83 let good = load_wordlist
goodfile in
(* Print the number of bindings in the word table of the good list. *)
84 prerr_endline
((string_of_int
(Hashtbl.length
(wordlist_words
good))) ^
" words loaded");
(* Same sequence for the bad list. *)
85 prerr_string
("Loading bad words from " ^
badfile ^
"...");
87 let bad = load_wordlist
badfile in
88 prerr_endline
((string_of_int
(Hashtbl.length
(wordlist_words
bad))) ^
" words loaded");
(* Compute a score for every sufficiently frequent word. *)
89 prerr_string
"Calculating probabilities...";
(* Table from word to its calculated score; 10000 is the initial size hint. *)
91 let score = Hashtbl.create
10000 in
(* First pass: add a score for each word whose count exceeds 4.0 and that
   has no entry in score yet.
   NOTE(review): the table argument of this Hashtbl.iter is not visible in
   this excerpt; presumably (wordlist_words bad) - confirm. *)
93 Hashtbl.iter
(fun word count
->
94 if (not
(Hashtbl.mem
score word)) && count
> 4.0 then
95 Hashtbl.add
score word (calculate_word_score word good bad))
(* Second pass over the words of the good list with the same rule; the
   Hashtbl.mem guard keeps words that already have a score from being
   added a second time. *)
98 Hashtbl.iter
(fun word count
->
99 if (not
(Hashtbl.mem
score word)) && count
> 4.0 then
100 Hashtbl.add
score word (calculate_word_score word good bad))
101 (wordlist_words
good);
102 prerr_endline
"done";
(* Call create_wordlist with the sum of the message counts of the bad and
   good lists and the score table, then hand the result to save_wordlist
   together with scorefile. *)
104 let scores = create_wordlist
((wordlist_messages
bad) + (wordlist_messages
good)) score in
105 save_wordlist
scores scorefile