Compute lucene-style scores for our hits.
[beagle.git] / bludgeon / Token.cs
blob 07a8e11a33c93e525d39e9d9f788dce825a4a228
2 using System;
3 using System.Text;
5 namespace Bludgeon {
/// <summary>
/// A fixed vocabulary of <see cref="Count"/> synthetic tokens.  Every token
/// is identified by an integer id, which is its index in an ordinally
/// sorted table of the generated strings.
/// </summary>
public class Token {

	/// <summary>Number of tokens in the vocabulary; valid ids are 0 .. Count-1.</summary>
	public const int Count = 512;

	// Length of every generated token, including the two trailing 'z's.
	private const int TokenLength = 8;

	// Sorting and searching must use the same comparer.  Ordinal comparison
	// is used so that the id <-> token mapping does not depend on the
	// culture of the thread that happens to initialize this class.
	private static readonly StringComparer comparer = StringComparer.Ordinal;

	// NOTE(review): System.Random instances are not documented as
	// thread-safe; confirm that GetRandom is only called from one thread.
	private static readonly Random random = new Random ();

	// All tokens, sorted with `comparer`.  The index of a token is its id.
	private static readonly string [] token_table;

	/// <summary>Returns the token string that has the given id.</summary>
	/// <param name="id">A token id in the range 0 .. Count-1.</param>
	/// <exception cref="ArgumentOutOfRangeException">
	/// <paramref name="id"/> is negative or not less than <see cref="Count"/>.
	/// (This type derives from ArgumentException, which is what earlier
	/// versions threw.)
	/// </exception>
	public static string IdToString (int id)
	{
		if (id < 0 || id >= Count)
			throw new ArgumentOutOfRangeException ("id", id,
				"Token id must be >= 0 and < Token.Count.");
		return token_table [id];
	}

	/// <summary>Looks up the id of a token string.</summary>
	/// <param name="str">The string to look up; may be null.</param>
	/// <returns>The id of the token, or -1 if the string is not a token.</returns>
	public static int StringToId (string str)
	{
		if (str == null)
			return -1;

		// BinarySearch returns a negative number when there is no match.
		int i = Array.BinarySearch (token_table, str, comparer);
		return i < 0 ? -1 : i;
	}

	/// <summary>Returns a token picked by the shared random number generator.</summary>
	public static string GetRandom ()
	{
		return token_table [random.Next (Count)];
	}

	///////////////////////////////////////////////////////////////////////

	// Builds the sorted token table once, when the class is first used.
	static Token ()
	{
		string [] table = new string [Count];

		for (int i = 0; i < Count; ++i)
			table [i] = TokenFromSeed (i);
		Array.Sort (table, comparer);

		// Paranoia is healthy.  After sorting, duplicates would be adjacent.
		for (int i = 1; i < Count; ++i)
			if (table [i-1] == table [i])
				throw new Exception ("Duplicate tokens!");

		token_table = table;
	}

	// This is a silly algorithm, but it is an easy way to
	// reproducibly generate a bunch of strings that all stem to
	// distinct values.
	private static string TokenFromSeed (int seed)
	{
		const uint first_char = 'a'; // lower case 'a' (97)
		const uint p = 23;
		const uint q = 7;

		// A fresh buffer per call; no state is shared between calls.
		char [] buffer = new char [TokenLength];

		// The state update below relies on 32-bit wraparound, so make
		// that explicit in case the assembly is compiled with /checked.
		unchecked {
			uint state = (uint) seed;

			for (int i = 0; i < buffer.Length; ++i) {

				// Put 'z' in the last two characters,
				// to avoid unhappy accidents of stemming.
				if (i >= buffer.Length - 2) {
					buffer [i] = 'z';
					continue;
				}

				// state % p is in 0..22, so this yields 'a'..'w'.
				buffer [i] = Convert.ToChar (first_char + (state % p));
				state *= state + q;
			}
		}

		return new string (buffer);
	}
}