From c19d639d6ec37eef5f71d412e88b63c4cce3c06d Mon Sep 17 00:00:00 2001 From: inglorion Date: Sat, 27 Dec 2008 10:59:48 +0100 Subject: [PATCH] Implemented new tokenization algorithm. --- src/tokenize.ml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/tokenize.ml b/src/tokenize.ml index 7790382..0de0cfb 100644 --- a/src/tokenize.ml +++ b/src/tokenize.ml @@ -1,7 +1,19 @@ +(** Filter tokens by length: keep only tokens of length >= 1, dropping empty strings. *) +let filter_tokens tokens = + List.filter (fun token -> String.length token >= 1) tokens + (** Split a string into tokens according to the rules used by Mailvisa 1 *) let tokenize_version_1 str = - let words = Str.split (Str.regexp "[^0-9A-Za-z_]") str in - List.filter (fun word -> String.length word >= 1) words + filter_tokens (Str.split (Str.regexp "[^0-9A-Za-z_]") str) + +(** Split a string into tokens. This function considers a token to be + a sequence of one or more token characters, where token characters + are any character that isn't a control character (\x00 through \x1f) + or a punctuation character (space, and the characters in the string + "!\"'().:;<>?[]`{}"). +*) +let tokenize_version_2 str = + filter_tokens (Str.split (Str.regexp "[][\x00-\x20!\"'().:;<>?`{}]") str) (** Split a string into tokens *) -let tokenize = tokenize_version_1 +let tokenize = tokenize_version_2 -- 2.11.4.GIT