Added various log messages to mailvisad.
[mailvisa2.git] / src / tokenize.ml
blob0de0cfb4c521326cff0b61df5d79ce3169d97311
(** Keep only the non-empty strings of [tokens], preserving their order. *)
let filter_tokens tokens =
  let is_non_empty token = String.length token > 0 in
  List.filter is_non_empty tokens
(** Tokenize [str] according to the rules used by Mailvisa 1: every
    character other than an ASCII letter, a digit or an underscore acts
    as a delimiter, and empty pieces are discarded. *)
let tokenize_version_1 str =
  let delimiter = Str.regexp "[^0-9A-Za-z_]" in
  str |> Str.split delimiter |> filter_tokens
(** Split a string into tokens. A token is a sequence of one or more
    token characters, where a token character is any character that is
    not a control character (\x00 through \x1f) or a punctuation
    character (space, and the characters in the string
    "!\"'().:;<>?[]`{}"). Empty pieces between adjacent delimiters are
    discarded. *)
let tokenize_version_2 str =
  (* The set starts with ']' so that Str treats it as a literal member;
     the range \x00-\x20 covers the control characters and the space. *)
  let delimiter = Str.regexp "[][\x00-\x20!\"'().:;<>?`{}]" in
  filter_tokens (Str.split delimiter str)
(** Split [str] into tokens using the default tokenizer, which is
    currently [tokenize_version_2]. *)
let tokenize str = tokenize_version_2 str