tools/bwt/dcs-bwt/src/prefix_doubling.h

   1 // Copyright 2007 Google Inc.
   2
   3 // This program is free software; you can redistribute it and/or
   4 // modify it under the terms of the GNU General Public License
   5 // as published by the Free Software Foundation; either version 2
   6 // of the License, or (at your option) any later version.
   7
   8 // This program is distributed in the hope that it will be useful,
   9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 // GNU General Public License for more details.
  12
  13 // You should have received a copy of the GNU General Public License
  14 // along with this program; if not, write to the Free Software
  15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  16
  17 // Prefix doubling is a technique for sorting the suffixes
  18 // of a string of length n in O(n log n) worst-case time.
  19 // This file contains an implementation of a prefix doubling algorithm
  20 // based on the Larsson-Sadakane algorithm described in
  21 //
  22 //   J. Larsson, K. Sadakane: Faster Suffix Sorting.
  23 //   Theoretical Computer Science, in press 2007.
  24 //   http://eprints.csce.kyushu-u.ac.jp/33/
  25 //
  26 // The simple way to use the algorithm is the function
  27 // SortSuffixesByPrefixDoubling().
  28 // More flexibility and more chances for optimizations are offered
  29 // by the class PrefixDoubler.
  30
  31 #ifndef DCSBWT_PREFIX_DOUBLING_H__
  32 #define DCSBWT_PREFIX_DOUBLING_H__
  33
  34 #include <functional>  // for unary_function
  35 #include <cassert>
  36
  37 namespace dcsbwt {
  38
  39 // The function SortSuffixesByPrefixDoubling takes three arrays as
  40 // an argument. The first array contains the input string, and the
  41 // other two contain the output on exit:
  42 // - The rank array contains a rank (an integer in the range [0..n])
  43 //   for each suffix such that the order of the ranks is the same as
  44 //   the lexicographic order of the suffixes.
  45 // - The suffix array contains the suffixes in the lexicographic order
  46 //   with each suffix represented by its starting position.
  47 // For example:
  48 //   input string: BANANA
  49 //     rank array: 4362510
  50 //   suffix array: 6531042
  51 //
  52 // The arrays are, in fact, STL random access iterator ranges.
  53 // Here is a precise description of the input and the output:
  54 // - The range [str,str_end) contains the input string; it will not be
  55 //   modified. The length of the range, i.e., the length of the string,
  56 //   (call it n) must be at most (2^32)-3.
  57 //   The type of the elements should be either char or unsigned char.
  58 //   (Other types are possible as long as they can be converted to
  59 //   unsigned char; this is checked in debug mode.) The characters
  60 //   are converted to unsigned char for purposes of comparison.
  61 //   (The standard string comparison operators and strcmp-like functions
  62 //   behave the same way.)
  63 // - The range [rank_array,rank_array_end) contains the ranks on output;
  64 //   the contents on input are not used. The size of the range should
  65 //   be n+1 (one larger than the size of [str,str_end)). The element
  66 //   type should be an integral type capable of storing all values in
  67 //   the range [0..n] and supporting conversion to and from uint32.
  68 //   After execution, rank_array[i] contains the number of the suffixes
  69 //   of the string (including the empty suffix and the full string)
  70 //   that are smaller than the suffix [str+i,str_end).
  71 //   An alternative definition is that the range contains a permutation
  72 //   of the integers [0..n] satisfying for all i,j in [0..n]:
  73 //     rank_array[i] < rank_array[j] if and only if
  74 //     std::lexicographical_compare(str+i, str_end, str+j, str_end,
  75 //                                  std::less<unsigned char>())
  76 //     returns true.
  77 // - The range [suffix_array,suffix_array_end) contains the suffix array
  78 //   on output; the contents on input are not used. The size of the range
  79 //   should be n+1 (the same as [rank_array,rank_array_end)). The element
  80 //   type should be an integral type capable of storing all values in
  81 //   the range [0..n+2] (the values n+1 and n+2 are needed by internal
  82 //   computation) and supporting conversion to and from uint32.
  83 //   After execution, the range contains a permutation of
  84 //   the integers [0..n], where a value i represents the suffix
  85 //   [str+i,str_end) and the order is the same as the lexicographical
  86 //   order of the suffixes. It satisfies, for all i in [0..n]:
  87 //   rank_array[suffix_array[i]] == suffix_array[rank_array[i]] == i.
  88 //   (If you wonder why both rank and suffix array are produced when
  89 //   one can so easily be constructed from the other, it is because both
  90 //   arrays are needed during the computation, so one can as well
  91 //   use them both for output.)
  92
  93 template <typename StringIterator,
  94           typename RankIterator,
  95           typename SuffixIterator>
  96 void SortSuffixesByPrefixDoubling(
  97     StringIterator str, StringIterator str_end,
  98     RankIterator rank_array, RankIterator rank_array_end,
  99     SuffixIterator suffix_array, SuffixIterator suffix_array_end);
 100
 101 // SortSuffixesByPrefixDoubling is not sufficiently flexible for some
 102 // special situations (in particular, those arising in the suffix sorting
 103 // algorithms based on difference covers). The class PrefixDoubler provides
 104 // the core algorithm in its bare form together with some supporting
 105 // components that can be used for building special applications.
 106 // To understand PrefixDoubler you need to know more about the algorithm.
 107 //
 108 // An h-order of the suffixes is an order based on the first h characters
 109 // (call it the h-prefix) of each suffix. If the suffix is shorter than h,
 110 // the h-prefix is the suffix itself.  An h-order is not a total order
 111 // in general (but a weak order, see below) as multiple suffixes can have
 112 // the same h-prefix. For example, under 0-order all suffixes are equal.
 113 // However, for h >= n, the h-order is always the lexicographical total
 114 // order of the suffixes.
 115 //
 116 // The algorithm is based on a doubling procedure that turns an h-order
 117 // into a 2h-order. The outline of the algorithm is:
 118 // 1. Obtain an initial h-order for some h > 0.
 119 // 2. Apply the doubling procedure repeatedly until fully sorted.
 120 // The second step is the core algorithm implemented by the member function
 121 // PrefixDoubler::SortSuffixes(). It takes as an input the value of h (called
 122 // order_depth in the code) and the two arrays, rank and suffix array,
 123 // which must be consistent with the initial h-order on input.
 124 // On output, the rank array is as with SortSuffixesByPrefixDoubling()
 125 // but the suffix array is not (see below). The suffix array can be
 126 // computed from the rank array using PrefixDoubler::ComputeFinalSuffixArray().
 127 // Note that the original string is not an input to the main algorithm;
 128 // it is only used for obtaining the initial h-order. There are functions
 129 // that help in computing the initial h-order, but users can also write
 130 // their own, which is the main source for the added flexibility.
 131 //
 132 // The rank array is the primary representation of the h-order at all stages
 133 // of the algorithm. It must satisfy the following conditions:
 134 // 1. If rank_array[i] < rank_array[j], then the suffix starting at
 135 //    position i is lexicographically smaller than the suffix starting at
 136 //    position j.
 137 // 2. If rank_array[i] == rank_array[j], then suffixes i and j have
 138 //    the same h-prefix. (The reverse implication is not required;
 139 //    see the paragraph on weak orders below.)
 140 // 3. rank_array[i] is equal to the number of other suffixes whose rank is
 141 //    smaller or equal to rank_array[i]. For example, rank_array[n]=0
 142 //    for all h > 0 (n is the empty suffix).
 143 // When all suffixes have a different rank (e.g. for all h >= n), this
 144 // definition coincides with the earlier definition of the output
 145 // of SortSuffixesByPrefixDoubling().
 146 //
 147 // Example:  input string: BANANA
 148 //     1-order rank array: 4363630  <-- initial 1-order
 149 //     2-order rank array: 4363610  <-- after first doubling
 150 //     4-order rank array: 4362510  <-- after second doubling
 151 //                                      this is the total order
 152 //
 153 // The suffix array in its basic form contains the suffixes (i.e.,
 154 // a permutation of [0..n]) in the order defined by the rank array,
 155 // with equal suffixes in arbitrary order.
 156 //
 157 // Example:  input string: BANANA
 158 //     1-order rank array: 4363630
 159 //   1-order suffix array: 6135024  <-- both of these would be valid
 160 //                         6531042  <-- inputs to SortSuffixes()
 161 //
 162 // An advanced form allows overwriting some entries with special markers
 163 // as explained later. For basic use it is enough to know that:
 164 // - On input to the main algorithm SortSuffixes(), the suffix array
 165 //   can but does not have to contain the markers.
 166 // - On output, the suffix array contains markers to a large extent,
 167 //   and the funtion ComputeFinalSuffixArray() should be used for
 168 //   obtaining the final suffix array if needed.
 169 //
 170 // The class PrefixDoubler stores no information about the state of the
 171 // algorithm except the length of the string; each function takes all
 172 // necessary information as an argument. This design gives the user full
 173 // control over the storage and representation of the arrays, which can
 174 // be useful since these arrays can be really large. The reasons
 175 // why this class exists at all (instead of separate functions) are:
 176 // - collecting the related functions together
 177 // - encapsulation of the special markers explained below
 178 // - parametrization of the integral type
 179 // The last point refers to the template paraneter type Integer, which is
 180 // used in all internal integral computations. It should be able to represent
 181 // all values in the range [0..n+2]. The value type of the rank and
 182 // suffix arrays should support conversion to and from the type Integer
 183 // for all values in the range [0..n+2].
 184 //
 185 // The rest of the documentation is not essential for basic use but
 186 // may be useful for understanding the code and for some optimizations.
 187 //
 188 // A weak order over a set is an equivalence class partition of the set
 189 // with a total order on the equivalence classes. Every h-order is
 190 // a weak order over the set of suffixes. Weak orders are exactly
 191 // those that fit the STL requirement of strict weak ordering (see 25.3
 192 // in the C++ Standard). Two relations on weak orders are of interest to us:
 193 // - Two weak orders W1 and W2 are *consistent* if there are no two
 194 //   elements x and y for which x<y according to W1 and y<x according
 195 //   to W2. All h-orders are consistent with each other.
 196 // - A weak order W1 is a *refinement* of a weak order W2 if W1 and W2
 197 //   are consistent and the equivalence class partition of W1 is a
 198 //   refinement of the equivalence class partition of W2.
 199 //   A g-order is a refinement of an h-order if g > h.
 200 // The rank array is able to represent any weak order on the suffixes
 201 // A valid order for a specific value of h is any order that:
 202 // - is consistent with the lexicographical total order and
 203 // - is a refinement of the h-order.
 204 // These are the conditions 1 and 2 in the earlier rank array definition.
 205 // Any valid order for h > 0 is a legal input to the algorithm.
 206 // A more refined order makes the algorithm faster in general.
 207 // Note though that an additional refinement is much more effective if
 208 // it allows increasing the value of h. Additional refinements have to
 209 // be reflected in the suffix array, too.
 210 //
 211 // Example:  input string: BANANA
 212 //     1-order rank array: 4363630  <-- no additional refinement
 213 //                         4363610  <-- additional refinement
 214 //   1-order suffix array: 6531042  <-- valid for both cases
 215 //                         6135024  <-- not valid for refined case
 216 //
 217 // A suffix that has been separated from other suffixes (i.e.,has
 218 // a unique rank), is called finished: its rank and its position in the
 219 // suffix array do not change anymore. The suffix array entry for
 220 // a finished suffix is not needed anymore (the rank is enough)
 221 // and may be overwritten by special a special marker,
 222 // whose purpose is to quickly jump over the finished parts.
 223 // Two kinds of markings are used:
 224 // - The value FinishedSuffixMarker() marks a single finished suffix.
 225 // - The value FinishedGroupMarker() marks the beginning of a group of
 226 //   more than one consecutive finished suffixes. The entry following
 227 //   the marker contains the next position after the end of the group.
 228 //   The remaining entries in the group are never accessed.
 229 // Any combination of unmarked finished suffixes, single marked
 230 // suffixes and marked groups are legal including consecutive marked
 231 // suffixes and groups. Unmarked finished suffixes get marked and
 232 // consecutive marked suffixes and groups get combined when encountered.
 233 // Adding markers already during the computation of the initial
 234 // h-order may speed up the algorithm. (Besides allowing jumping over
 235 // groups, the markers avoid cache misses due to rank array accesses.)
 236
 237 template <typename Integer>
 238 class PrefixDoubler {
 239  public:
 240   // Only the length of the string is set on initialization
 241   explicit PrefixDoubler(Integer string_length)
 242       : string_length_(string_length) {}
 243   // no default constructor
 244   // implicit destructor
 245   // implicit copy constructor
 246   // implicit copy assignment
 247
 248   // The main repeated doubling algorithm
 249   template <typename RankIterator, typename SuffixIterator>
 250   void SortSuffixes(
 251       RankIterator rank_array, RankIterator rank_array_end,
 252       SuffixIterator suffix_array, SuffixIterator suffix_array_end,
 253       Integer order_depth) const;
 254
 255   // Compute an initial 1-order from a string.
 256   // This works for any string whose characters can be compared.
 257   // The ordering is based on std::less<value_type> for the
 258   // value_type of the StringIterator.
 259   // WARNING: If the value type is char and the string contains values
 260   // outside [0..128), the order may be inconsistent with the standard
 261   // string comparisons and strcmp-like functions, which convert to
 262   // unsigned char for comparisons. Use ComputeOneOrderForCharString()
 263   // instead.
 264   // TODO: add support for user-defined character comparison operator
 265   template <typename StringIterator, typename RankIterator,
 266             typename SuffixIterator>
 267   void ComputeOneOrder(
 268       StringIterator str, StringIterator str_end,
 269       RankIterator rank_array, RankIterator rank_array_end,
 270       SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
 271
 272   // Compute an initial 1-order from a string of (unsigned) char.
 273   // This works for strings whose characters are convertible to unsigned char.
 274   // In debug mode, there is a check that the conversion loses no information.
 275   // The ordering is based on std::less<unsigned char>.
 276   // This is probably faster than ComputeOneOrder(), and produces an order
 277   // consistent with the standard string comparisons and strcmp-like functions.
 278   template <typename StringIterator, typename RankIterator,
 279             typename SuffixIterator>
 280   void ComputeOneOrderForCharString(
 281       StringIterator str, StringIterator str_end,
 282       RankIterator rank_array, RankIterator rank_array_end,
 283       SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
 284
 285   // Compute the initial suffix_array including markers
 286   // from the initial rank array.
 287   template <typename RankIterator, typename SuffixIterator>
 288   void ComputeInitialSuffixArray(
 289       RankIterator rank_array, RankIterator rank_array_end,
 290       SuffixIterator suffix_array, SuffixIterator suffix_array_end,
 291       Integer order_depth) const;
 292
 293   // Compute the final suffix array (without markers) from the final
 294   // rank array.
 295   // This should only be called with the fully sorted rank array
 296   // (checked in debug mode).
 297   template <typename RankIterator, typename SuffixIterator>
 298   void ComputeFinalSuffixArray(
 299       RankIterator rank_array, RankIterator rank_array_end,
 300       SuffixIterator suffix_array, SuffixIterator suffix_array_end) const;
 301
 302   // Special markers for finished suffixes
 303   inline Integer FinishedSuffixMarker() const { return string_length_ + 1; }
 304   inline Integer FinishedGroupMarker() const { return string_length_ + 2; }
 305   inline bool IsFinishedSuffix(Integer i) const {
 306     return FinishedSuffixMarker() == i;
 307   }
 308   inline bool IsFinishedGroup(Integer i) const  {
 309     return FinishedGroupMarker() == i;
 310   }
 311   inline bool IsFinishedSuffixOrGroup(Integer i) const  {
 312     assert(i <= string_length_ + 2);
 313     return i > string_length_;
 314   }
 315
 316   // Do one doubling step.
 317   // Returns the new order_depth.
 318   template <typename RankIterator, typename SuffixIterator>
 319   Integer Double(
 320       RankIterator rank_array, RankIterator rank_array_end,
 321       SuffixIterator suffix_array, SuffixIterator suffix_array_end,
 322       Integer order_depth) const;
 323
 324  private:
 325   Integer string_length_;
 326
 327   // Functor that retrieves the rank at i+order_depth
 328   template <typename RankIterator>
 329   class RankAtDepth : public std::unary_function<Integer, Integer> {
 330    public:
 331     RankAtDepth(RankIterator rank_array, Integer order_depth)
 332         : rank_plus_depth_(rank_array + order_depth) {}
 333     Integer operator() (Integer i) const {
 334       return rank_plus_depth_[i];
 335     }
 336    private:
 337     RankIterator rank_plus_depth_;
 338   };
 339
 340   // Do one doubling step for a group of suffixes.
 341   template <typename RankIterator, typename SuffixIterator>
 342   void RefineSubrange(
 343       RankIterator rank_array, SuffixIterator suffix_array,
 344       SuffixIterator begin, SuffixIterator end,
 345       Integer common_prefix_length) const;
 346
 347   // Do one doubling step for a group of suffixes.
 348   template <typename RankIterator, typename SuffixIterator>
 349   void RefineSmallSubrange(
 350       RankIterator rank_array, SuffixIterator suffix_array,
 351       SuffixIterator begin, SuffixIterator end,
 352       Integer common_prefix_length) const;
 353 };
 354
 355 }  // namespace dcsbwt
 356
 357 #endif  // DCSBWT_PREFIX_DOUBLING_H__