1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // A difference cover for a period length p is a set D of integers
18 // that satisfies: { (i-j) mod p | i,j in D } == [0,p)
19 // In other words, the differences of the integers (mod p)
20 // cover the whole period.
22 // Example: D=={0,1,3,7,8} is a difference cover for period length 16
24 // 1-0 == 1 0-7 == -7 == 9 (mod 16)
25 // 3-1 == 2 1-7 == -6 == 10 (mod 16)
26 // 3-0 == 3 3-8 == -5 == 11 (mod 16)
27 // 7-3 == 4 3-7 == -4 == 12 (mod 16)
28 // 8-3 == 5 0-3 == -3 == 13 (mod 16)
29 // 7-1 == 6 1-3 == -2 == 14 (mod 16)
30 // 7-0 == 7 0-1 == -1 == 15 (mod 16)
32 // This file defines two classes to support the use of difference
33 // covers in the suffix array construction and Burrows-Wheeler transform
34 // algorithms described in the following articles (available at
35 // http://www.cs.helsinki.fi/juha.karkkainen/):
37 // J. Karkkainen, S. Burkhardt:
38 // Fast lightweight suffix array construction and checking.
39 // In Proc. 14th Symposium on Combinatorial Pattern Matching (CPM '03).
40 // LNCS 2676, Springer, 2003, pp. 55-69.
42 // J Karkkainen, P. Sanders, S. Burkhardt:
43 // Linear work suffix array construction.
44 // J. ACM, 53 (6), pp. 918-936, 2006.
47 // Fast BWT in Small Space by Blockwise Suffix Sorting.
48 // Theoretical Computer Science, 2007 (to appear).
50 // The class DifferenceCover, in particuler, is quite general and
51 // may be useful for other applications of difference covers.
54 #ifndef DCSBWT_DIFFERENCE_COVER_H__
55 #define DCSBWT_DIFFERENCE_COVER_H__
62 //-------------------------
63 class DifferenceCover
{
64 // The class DifferenceCover computes and stores a difference cover D
65 // for a period length p, where p is any power of two representable by
66 // unsigned int. The size of D for small values of p is:
67 // p 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16386
68 // |D| 1 2 3 4 5 7 9 13 20 28 40 58 82 112 160
69 // For larger p, the size of D is guaranteed to be smaller than
70 // sqrt(1.5*p)+5.5. (The size of the supporting data structures is
71 // more than 8*p bytes, though, so be careful with large values of p).
73 // Note: unsigned int is used in most computations, because we want
74 // to do arithmetic modulo a power of two (see C++ Standard 3.9.1/4).
75 // It is possible to use another integral type, even a user-defined type
76 // that behaves like an integer, for providing the input for and storing
77 // the output of the member functions as long as the following requirements
79 // - The type can represent all values in the range [0,period]
80 // (note: including period) and conversion between the type and
81 // unsigned int works both ways for these values.
82 // - For any value, a conversion from the type to unsigned int is
83 // performed mod 2^w, where w is the number of bits in unsigned int
84 // (see C++ Standard 4.7/2). For example, -1 is converted to 2^w-1.
85 // (This concerns only ModuloPeriod.)
86 // - The type supports division by a power of two using operator>>
87 // (for any value used as an input to DivideByPeriod).
88 // All built-in integral types satisfy the requirements.
91 // constructor require: period is a power of two
92 explicit DifferenceCover(unsigned int period
);
93 // no default constructor
94 // implicit destructor
95 // implicit copy constructor
96 // implicit copy assignment
98 // get basic dimensions
99 unsigned int period() const { return 1 << logperiod_
; }
100 unsigned int size() const { return cover_
.size(); }
102 // Division by period: i div p and i mod p
103 // The results satisfy: (i div p) * p + (i mod p) == i
104 // See comments above on the type of i (for both div and mod).
105 // The type Integer must support division by a power of two using
106 // operator>> for the value i. For built-in types, the behaviour
107 // for negative values is implementation defined (5.8/3).
108 // Thus negative inputs are forbidden.
109 template <typename Integer
>
110 Integer
DivideByPeriod (Integer i
) const {
112 return i
>> logperiod_
;
114 // ModuloPeriod accepts any value as an argument, including negative values
115 // and values larger than MAX_UINT, provided the conversion is done
116 // mod 2^w (see above). The return value is always in the range [0,period).
117 // For example, ModuloPeriod(-1) returns period-1.
118 unsigned int ModuloPeriod (unsigned int i
) const { return i
& mask_
; }
120 // returns true if i is in D
122 inline bool Contains(unsigned int i
) const;
125 // Rank(i) = size of (D intersect [0,i)) require: i<p
126 // Select(j) = jth element of D (starting with 0th) require: j<|D|
127 // Invariant: Rank(Select(j))==j
128 // Select(Rank(i))>=i when Rank(i)<|D|
129 inline unsigned int Rank(unsigned int i
) const;
130 inline unsigned int Select(unsigned int j
) const;
132 // returns k in D such that (k+diff) mod p is in D too
134 inline unsigned int Coverer(unsigned int diff
) const;
136 // STL random access iterators for D
137 typedef std::vector
<unsigned int>::const_iterator iterator
;
138 iterator
begin() const { return cover_
.begin(); }
139 iterator
end() const { return cover_
.end(); }
141 // Find the size of a difference cover without constructing one.
142 static unsigned int CoverSizeForPeriod(unsigned int period
);
144 // Helps keep track of space requirements.
145 static size_t SizeInBytesForPeriod(unsigned int period
);
148 const unsigned int logperiod_
;
149 const unsigned int mask_
; // period-1 == 00..011..1
151 std::vector
<unsigned int> cover_
;
152 std::vector
<unsigned int> ranks_
;
153 std::vector
<unsigned int> coverers_
;
155 // Most of the construction is done in these functions.
158 void ComputeCoverers();
161 bool IsCorrect() const;
164 template <typename Integer
>
165 class DifferenceCoverSample
{
166 // The class DifferenceCoverSample represents the set
167 // Dn = { 0<=i<n | i mod p is in D }
168 // where D is a difference cover for a period p.
169 // The key property of Dn is the support for the following operation:
170 // - Shift(i,j) returns k in [0,p) such that i+k and j+k
171 // are both in Dn (for any i,j in [0,n-k))
172 // Other operations provided by the class include:
173 // - Fill(OutputIterator) writes Dn to the output range
174 // - Rank(i) (for i in Dn) is the position of i in the Fill output
175 // - PeriodInterval() is the distance from i to i+p in the Fill output
176 // (must be the same for all i, see note on ordering below)
178 // Note: The ordering of Dn produced by Fill must satisfy the
179 // condition that Rank(i+p)-Rank(i) has the same (positive)
180 // value for all i in Dn with i<n-p. The function PeriodInterval
181 // returns this value. Two orders satisfying this are
182 // (d0<d1<d2<... are the elements of D):
183 // 1: d0, d1, d2, ..., d0+p, d1+p, d2+p, ..., d0+2*p, ...
184 // This increasing order is simple to implement
185 // and may have an advantage in the speed of Rank.
186 // 2: d0, d0+p, d0+2*p, ..., d1, d1+p, d1+2*p, ..., d2, ...
187 // This order corresponds to the one in the articles mentioned above
188 // and may make sorting the sample suffixes simpler and faster.
189 // The current implementation uses the first (increasing) order.
191 // The template argument Integer of can be any integral type, even
192 // a user defined type that behaves like an integer as long as
193 // the following requirements are satisfied:
194 // - conversion to unsigned int should be performed mod 2^w
195 // where w is the number of bits in unsigned int (see 4.7/2)
196 // - support the following operations when all values and results
197 // are in the range [0,n]:
198 // + conversion from unsigned int
199 // + division and multiplication by a power of two using
200 // operator>> and operator<<
201 // + addition (+), subtraction (-), multiplication (*), pre-increment (++)
202 // + order comparison (<)
203 // All built-in integral types satisfy the requirements
206 typedef Integer integer_type
;
209 // require: period is a power of two
210 // require: range >= period
211 DifferenceCoverSample(unsigned int period
, Integer range
);
212 // no default constructor
213 // implicit destructor
214 // implicit copy constructor
215 // implicit copy assignment
217 // get basic dimensions
218 inline Integer
period() const { return period_
; } // the p
219 inline Integer
range() const { return range_
; } // the n
220 inline Integer
size() const { return size_
; } // size of Dn
221 inline Integer
period_size() const { return dc_
.size(); } // size of D
223 // Contains(i) returns true if i is in Dn
224 inline bool Contains(Integer i
) const;
227 // The type of arguments is unsigned int since only the values
228 // i mod p and j mod p are used in the computation.
229 // Note: i and j can be any values. No check is made to ensure
230 // that i, j, i+shift, or j+shift is in the range.
231 inline Integer
Shift(unsigned int i
, unsigned int j
) const;
233 // Fill an output range with the elements of Dn.
234 // The output range must have room for size() elements.
235 // The value type of the output range must be able to store
236 // integers in the range [0,range()).
237 template <typename OutputIterator
>
238 OutputIterator
Fill(OutputIterator it
) const;
241 // require: i is in Dn
242 inline Integer
Rank(Integer i
) const;
244 // PeriodInterval (see above)
245 inline Integer
PeriodInterval() const;
249 const DifferenceCover dc_
;
250 const Integer period_
;
251 const Integer range_
;
257 } // namespace dcsbwt
259 #endif // DCSBWT_DIFFERENCE_COVER_H__