From c19d639d6ec37eef5f71d412e88b63c4cce3c06d Mon Sep 17 00:00:00 2001
From: inglorion <mailvisa@inglorion.net>
Date: Sat, 27 Dec 2008 10:59:48 +0100
Subject: [PATCH] Implemented new tokenization algorithm.

---
 src/tokenize.ml | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/tokenize.ml b/src/tokenize.ml
index 7790382..0de0cfb 100644
--- a/src/tokenize.ml
+++ b/src/tokenize.ml
@@ -1,7 +1,19 @@
+(** Remove empty tokens, keeping only those of length 1 or more. *)
+let filter_tokens tokens =
+    List.filter (fun token -> String.length token >= 1) tokens
+
 (** Split a string into tokens according to the rules used by Mailvisa 1 *)
 let tokenize_version_1 str =
-  let words = Str.split (Str.regexp "[^0-9A-Za-z_]") str in
-    List.filter (fun word -> String.length word >= 1) words
+  filter_tokens (Str.split (Str.regexp "[^0-9A-Za-z_]") str)
+
+(** Split a string into tokens. This function considers a token to be
+    a sequence of one or more token characters, where token characters
+    are any character that isn't a control character (\x00 through \x1f)
+    or a punctuation character (space, and the characters in the string
+    "!\"'().:;<>?[]`{}").
+*)
+let tokenize_version_2 str =
+  filter_tokens (Str.split (Str.regexp "[][\x00-\x20!\"'().:;<>?`{}]") str)
 
 (** Split a string into tokens *)
-let tokenize = tokenize_version_1
+let tokenize = tokenize_version_2
-- 
2.11.4.GIT