Minor reorganization of html filter code.
[beagle.git] / bludgeon / Token.cs
blobdcffff4bf881379ee45a25627e75cbbe2a998737
2 using System;
3 using System.Text;
5 namespace Bludgeon {
public class Token {

	// There are no vowels here to avoid stop words, stemming, etc.
	// We also exclude l, since it looks too much like 1
	private const string token_chars = "bcdfghjkmnpqrstvwxz0123456789";

	// A mixture of ASCII characters and latin characters outside
	// the normal ISO-8859-1 range to test UTF-8 support.  These
	// *should* still stem properly.
	private const string unicode_token_chars = "bĉđfğħjķłmńpqrŝſŧvwxžƀ0123456789";

	/// <summary>
	/// Number of entries in each token table; valid ids are 0 .. Count - 1.
	/// </summary>
	public const int Count = 512;

	/// <summary>
	/// Returns the two-character ASCII token stored for the given id.
	/// </summary>
	/// <param name="id">Token id, 0 &lt;= id &lt; Count.</param>
	/// <returns>The entry of the ASCII token table at index id.</returns>
	/// <exception cref="ArgumentOutOfRangeException">
	/// id is negative or not less than Count.  (This type derives from
	/// ArgumentException, so handlers catching ArgumentException still work.)
	/// </exception>
	public static string IdToString (int id)
	{
		if (id < 0 || id >= Count)
			throw new ArgumentOutOfRangeException ("id", id, "Token id must be between 0 and Count - 1.");

		return token_table [id];
	}

	/// <summary>
	/// Maps an ASCII token back to its id; the inverse of IdToString.
	/// </summary>
	/// <param name="str">Candidate token; may be null.</param>
	/// <returns>
	/// The id whose ASCII token equals str, or -1 when str is null, is not
	/// exactly two characters long, contains a character that is not in
	/// token_chars, or decodes to an id outside 0 .. Count - 1.
	/// </returns>
	public static int StringToId (string str)
	{
		// The table is laid out in token_chars order (letters first, then
		// digits), which is not the ascending order Array.BinarySearch
		// requires, so a binary search can miss tokens that are present.
		// Decode the id directly instead: token i is built from the
		// characters at positions i / Length and i % Length.
		if (str == null || str.Length != 2)
			return -1;

		int high = token_chars.IndexOf (str [0]);
		int low = token_chars.IndexOf (str [1]);
		if (high < 0 || low < 0)
			return -1;

		int id = high * token_chars.Length + low;
		return id < Count ? id : -1;
	}

	// Single shared generator used by both GetRandom variants.
	private static readonly Random random = new Random ();

	/// <summary>
	/// Returns a randomly chosen entry of the ASCII token table.
	/// </summary>
	public static string GetRandom ()
	{
		return token_table [random.Next (Count)];
	}

	/// <summary>
	/// Returns a randomly chosen entry of the table whose second
	/// character is drawn from unicode_token_chars.
	/// </summary>
	public static string GetRandomWithUnicode ()
	{
		return unicode_token_table [random.Next (Count)];
	}

	///////////////////////////////////////////////////////////////////////

	// Both tables hold Count two-character strings and are filled exactly
	// once by the static constructor below.
	private static readonly string [] token_table;
	private static readonly string [] unicode_token_table;

	// Builds both token tables.  Entry i always starts with
	// token_chars [i / token_chars.Length]; the second character is
	// token_chars [i % token_chars.Length] for the ASCII table and
	// unicode_token_chars [i % unicode_token_chars.Length] for the
	// unicode table.
	static Token ()
	{
		token_table = new string [Count];
		unicode_token_table = new string [Count];

		for (int i = 0; i < Count; ++i) {
			char first = token_chars [i / token_chars.Length];

			token_table [i] = new string (new char [] {
				first,
				token_chars [i % token_chars.Length]
			});

			// NOTE(review): the quotient uses token_chars.Length while the
			// remainder uses unicode_token_chars.Length.  Kept as-is so the
			// generated tokens do not change -- confirm this mix is intended.
			unicode_token_table [i] = new string (new char [] {
				first,
				unicode_token_chars [i % unicode_token_chars.Length]
			});
		}
	}
}