Added mailvisa dispatch program.
[mailvisa2.git] / src / mailvisa-calculate-scores.ml
blobfc8dc3fd07154da87fa24161aabc47e475149490
1 (* Calculate words' spam probability based on frequency in
2 spam and ham messages.
3 *)
5 open Common
6 open Wordlist
8 (* Defaults *)
(** Directory searched for word lists and the score file when [-c] is not
    given. Evaluated at program start; [Sys.getenv] raises [Not_found] if
    the HOME environment variable is not set. *)
let default_confdir = String.concat "" [ Sys.getenv "HOME"; "/settings/mailvisa2" ]

(** File names used when the corresponding command-line option is absent. *)
let default_scorefile = "scores"

let default_goodfile = "good"

let default_badfile = "bad"

(** Factor applied to good-word incidences when [-m] is not given. *)
let default_good_multiplier = 1.0
(** One-line usage summary, printed in response to [-h]. *)
let usage = "USAGE: mailvisa-calculate-scores [options]"

(** Description of every command-line option, printed after [usage] in
    response to [-h]. The [-h] option itself is listed so that the text
    covers every option the program accepts. Each line ends in an explicit
    newline escape followed by a line continuation, so the indentation of
    this source file never leaks into the printed text. *)
let help =
  "Valid options are:\n\
   \n\
   -h Show this help text and exit\n\
   -c <path> Look for files in <path> (default: $HOME/settings/mailvisa2)\n\
   -g <file> Load good words from <file> (default: \"good\")\n\
   -b <file> Load bad words from <file> (default: \"bad\")\n\
   -f <file> Write scores to <file> (default: \"scores\")\n\
   -m <num> Multiply number of good occurrences by <num> (default: 1.0)"
(** [parse_options args] parses command-line options from [args] into a
    hash table that maps option names to their string values.

    [args] is laid out like [Sys.argv]: element 0 is skipped and parsing
    starts at index 1. The options [-b], [-c], [-f], [-g] and [-m] each
    consume the following element as their value and store it under
    ["badfile"], ["confdir"], ["scorefile"], ["goodfile"] and
    ["good_multiplier"] respectively.

    [-h] prints [usage] and [help] on standard output and exits with
    status 0. An unrecognized option, or an option that is missing its
    value, prints a message on standard error and exits with status 0x80. *)
let parse_options args =
  let options = Hashtbl.create 8 in
  let argc = Array.length args in
  (* Report a command-line error and terminate. *)
  let fail message =
    prerr_endline message;
    exit 0x80
  in
  (* Store the element following index [i] under [name] and return the
     index of that consumed element. Checking the bound here turns a
     trailing option without a value into a proper error message instead
     of an out-of-bounds array access. *)
  let add_option name i =
    if i + 1 >= argc then fail ("Missing argument for option: " ^ args.(i))
    else begin
      Hashtbl.add options name args.(i + 1);
      i + 1
    end
  in
  let rec loop i =
    if i < argc then begin
      let last_consumed =
        match args.(i) with
        | "-h" ->
          print_endline usage;
          print_endline help;
          exit 0
        | "-b" -> add_option "badfile" i
        | "-c" -> add_option "confdir" i
        (* -f takes a file name, like the other options; storing it as a
           value-less flag would set the score file to "true". *)
        | "-f" -> add_option "scorefile" i
        | "-g" -> add_option "goodfile" i
        | "-m" -> add_option "good_multiplier" i
        | option -> fail ("Invalid option: " ^ option)
      in
      loop (last_consumed + 1)
    end
  in
  loop 1;
  options
50 (* Initialization *)
(** Options given on the command line, parsed once at program start. *)
let options =
  let argv = Sys.argv in
  parse_options argv

(** Configuration directory: the value stored under ["confdir"] in
    [options], or [default_confdir] when it is absent. *)
let confdir =
  let fallback = default_confdir in
  get_option_with_default options "confdir" fallback
(** [absolute_path path] returns [path] unchanged when it contains a slash;
    otherwise it returns [path] prefixed with [confdir] and a ["/"]
    separator. *)
let absolute_path path =
  match String.contains path '/' with
  | true -> path
  | false -> String.concat "/" [ confdir; path ]
(* Settings derived from the command-line options. Each file name falls
   back to its default and is then passed through [absolute_path]. *)

let badfile =
  let name = hash_get options "badfile" ~default:default_badfile in
  absolute_path name

let scorefile =
  let name = hash_get options "scorefile" ~default:default_scorefile in
  absolute_path name

let goodfile =
  let name = hash_get options "goodfile" ~default:default_goodfile in
  absolute_path name

(* Factor by which good-word incidences are multiplied when scoring. *)
let good_multiplier =
  hash_get_float options "good_multiplier" ~default:default_good_multiplier
66 (* Functions *)
(** [calculate_word_score word good bad] returns the score of [word].

    The score is [b /. (b +. g)], where [b] is the incidence of [word] in
    [bad] and [g] is its incidence in [good] multiplied by
    [good_multiplier]. The result is clamped to the range [0.1, 0.9].

    When the division yields NaN (for example [b] and [g] are both zero,
    which happens with a multiplier of 0 for a word seen only in the good
    list) there is no evidence either way and 0.5 is returned, so that a
    NaN never ends up in the score table.
    NOTE(review): 0.5 is chosen as a neutral value; confirm it suits the
    consumer of the score file. *)
let calculate_word_score word good bad =
  let b = get_incidence bad word in
  let g = good_multiplier *. get_incidence good word in
  let score = b /. (b +. g) in
  match classify_float score with
  | FP_nan -> 0.5
  | FP_normal | FP_subnormal | FP_zero | FP_infinite ->
    if score < 0.1 then 0.1
    else if score > 0.9 then 0.9
    else score
(* Program entry point: load both word lists, score every word whose count
   exceeds 4.0 in either list, and save the scores to [scorefile].
   Progress messages go to standard error. *)
let _ =
  (* Load the word list stored at [path], reporting progress; [kind] is the
     word that appears in the progress message. *)
  let load_with_report kind path =
    prerr_string ("Loading " ^ kind ^ " words from " ^ path ^ "...");
    flush stderr;
    let wordlist = load_wordlist path in
    let loaded = Hashtbl.length (wordlist_words wordlist) in
    prerr_endline (string_of_int loaded ^ " words loaded");
    wordlist
  in
  let good = load_with_report "good" goodfile in
  let bad = load_with_report "bad" badfile in
  prerr_string "Calculating probabilities...";
  flush stderr;
  let score = Hashtbl.create 10000 in
  (* Add a score for every word of [source] whose count exceeds 4.0 and
     that has not been scored yet. *)
  let score_frequent_words source =
    Hashtbl.iter
      (fun word count ->
        if count > 4.0 && not (Hashtbl.mem score word) then
          Hashtbl.add score word (calculate_word_score word good bad))
      (wordlist_words source)
  in
  (* Bad words are processed first, then good words, as two separate passes. *)
  score_frequent_words bad;
  score_frequent_words good;
  prerr_endline "done";
  let total_messages = wordlist_messages bad + wordlist_messages good in
  save_wordlist (create_wordlist total_messages score) scorefile