(** [filter_tokens tokens] returns the elements of [tokens] whose length is
    at least one character, i.e. it drops every empty string. *)
let filter_tokens tokens =
  let is_nonempty token = String.length token > 0 in
  List.filter is_nonempty tokens
(** [tokenize_version_1 str] splits [str] into tokens according to the rules
    used by Mailvisa 1: every character other than an ASCII letter, an ASCII
    digit or an underscore acts as a delimiter. The pieces between delimiters
    are passed through [filter_tokens] before being returned.

    The delimiter regexp is compiled once, when this definition is
    evaluated, instead of being recompiled on every call. *)
let tokenize_version_1 =
  let delimiter = Str.regexp "[^0-9A-Za-z_]" in
  fun str -> filter_tokens (Str.split delimiter str)
(** [tokenize_version_2 str] splits [str] into tokens. A token is a sequence
    of one or more token characters, where a token character is any
    character that is neither a control character (bytes 0x00 through 0x1f)
    nor a punctuation character. The punctuation characters are the space,
    the exclamation mark, the double quote, the apostrophe, parentheses, the
    period, the colon, the semicolon, angle brackets, the question mark, the
    backtick, curly braces and square brackets. The pieces between
    delimiters are passed through [filter_tokens] before being returned.

    The delimiter regexp is compiled once, when this definition is
    evaluated, instead of being recompiled on every call. *)
let tokenize_version_2 =
  let delimiter = Str.regexp "[][\x00-\x20!\"'().:;<>?`{}]" in
  fun str -> filter_tokens (Str.split delimiter str)
(** [tokenize str] splits [str] into tokens by delegating to
    [tokenize_version_2]. *)
let tokenize str = tokenize_version_2 str