3 Original source from qsufsort.c
5 Copyright 1999, N. Jesper Larsson, all rights reserved.
7 This file contains an implementation of the algorithm presented in "Faster
8 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9 Sadakane (sada@is.s.u-tokyo.ac.jp).
11 This software may be used freely for any purpose. However, when distributed,
12 the original source must be clearly stated, and, when the source code is
13 distributed, the copyright notice must be retained and any alterations in
14 the code must be clearly marked. No warranty is given regarding the quality
17 Modified by Wong Chi-Kwong, 2004
19 Changes summary: - Used long variable and function names
20 - Removed global variables
21 - Replace pointer references with array references
22 - Used insertion sort in place of selection sort and increased insertion sort threshold
23 - Reconstructing suffix array from inverse becomes an option
24 - Add handling where end-of-text symbol is not necessary < all characters
25 - Removed codes for supporting alphabet size > number of characters
27 No warrenty is given regarding the quality of the modifications.
35 #include "MiscUtilities.h"
38 static void QSufSortSortSplit(int* __restrict V
, int* __restrict I
, const int lowestPos
,
39 const int highestPos
, const int numSortedChar
);
40 static int QSufSortChoosePivot(int* __restrict V
, int* __restrict I
, const int lowestPos
,
41 const int highestPos
, const int numSortedChar
);
42 static void QSufSortInsertSortSplit(int* __restrict V
, int* __restrict I
, const int lowestPos
,
43 const int highestPos
, const int numSortedChar
);
44 static void QSufSortBucketSort(int* __restrict V
, int* __restrict I
, const int numChar
, const int alphabetSize
);
45 static int QSufSortTransform(int* __restrict V
, int* __restrict I
, const int numChar
, const int largestInputSymbol
,
46 const int smallestInputSymbol
, const int maxNewAlphabetSize
, int *numSymbolAggregated
);
49 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
50 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
51 contents of x[n] is disregarded, the n-th symbol being regarded as
52 end-of-string smaller than all other symbols.*/
53 void QSufSortSuffixSort(int* __restrict V
, int* __restrict I
, const int numChar
, const int largestInputSymbol
,
54 const int smallestInputSymbol
, const int skipTransform
) {
57 int s
, negatedSortedGroupLength
;
58 int numSymbolAggregated
;
59 int maxNumInputSymbol
;
63 maxNumInputSymbol
= largestInputSymbol
- smallestInputSymbol
+ 1;
66 /* bucketing possible*/
67 newAlphabetSize
= QSufSortTransform(V
, I
, numChar
, largestInputSymbol
, smallestInputSymbol
,
68 numChar
, &numSymbolAggregated
);
69 QSufSortBucketSort(V
, I
, numChar
, newAlphabetSize
);
72 numSortedPos
= numSymbolAggregated
;
75 while ((int)(I
[0]) >= -(int)numChar
) {
77 negatedSortedGroupLength
= 0;
81 i
-= s
; /* skip over sorted group.*/
82 negatedSortedGroupLength
+= s
;
84 if (negatedSortedGroupLength
) {
85 I
[i
+negatedSortedGroupLength
] = negatedSortedGroupLength
; /* combine preceding sorted groups */
86 negatedSortedGroupLength
= 0;
89 QSufSortSortSplit(V
, I
, i
, j
- 1, numSortedPos
);
92 } while (i
<= numChar
);
93 if (negatedSortedGroupLength
) {
94 /* array ends with a sorted group.*/
95 I
[i
+negatedSortedGroupLength
] = negatedSortedGroupLength
; /* combine sorted groups at end of I.*/
97 numSortedPos
*= 2; /* double sorted-depth.*/
102 void QSufSortGenerateSaFromInverse(const int* V
, int* __restrict I
, const int numChar
) {
105 for (i
=0; i
<=numChar
; i
++) {
111 /* Sorting routine called for each unsorted group. Sorts the array of integers
112 (suffix numbers) of length n starting at p. The algorithm is a ternary-split
113 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
114 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
115 function is based on Program 7.*/
116 static void QSufSortSortSplit(int* __restrict V
, int* __restrict I
, const int lowestPos
,
117 const int highestPos
, const int numSortedChar
) {
126 if (lowestPos
> highestPos
) {
127 fprintf(stderr
, "QSufSortSortSplit(): lowestPos > highestPos!\n");
132 numItem
= highestPos
- lowestPos
+ 1;
134 if (numItem
<= INSERT_SORT_NUM_ITEM
) {
135 QSufSortInsertSortSplit(V
, I
, lowestPos
, highestPos
, numSortedChar
);
139 v
= QSufSortChoosePivot(V
, I
, lowestPos
, highestPos
, numSortedChar
);
145 while (c
>= b
&& (f
= KEY(V
, I
, b
, numSortedChar
)) <= v
) {
147 swap(I
[a
], I
[b
], tmp
);
152 while (c
>= b
&& (f
= KEY(V
, I
, c
, numSortedChar
)) >= v
) {
154 swap(I
[c
], I
[d
], tmp
);
162 swap(I
[b
], I
[c
], tmp
);
170 for (l
= lowestPos
, m
= b
- s
; m
< b
; l
++, m
++) {
171 swap(I
[l
], I
[m
], tmp
);
177 for (l
= b
, m
= highestPos
- s
+ 1; m
<= highestPos
; l
++, m
++) {
178 swap(I
[l
], I
[m
], tmp
);
184 QSufSortSortSplit(V
, I
, lowestPos
, lowestPos
+ s
- 1, numSortedChar
);
187 // Update group number for equal portion
196 for (c
=a
; c
<=b
; c
++) {
202 QSufSortSortSplit(V
, I
, highestPos
- t
+ 1, highestPos
, numSortedChar
);
207 /* Algorithm by Bentley & McIlroy.*/
208 static int QSufSortChoosePivot(int* __restrict V
, int* __restrict I
, const int lowestPos
,
209 const int highestPos
, const int numSortedChar
) {
212 int keyl
, keym
, keyn
;
213 int key1
, key2
, key3
;
218 if (lowestPos
> highestPos
) {
219 fprintf(stderr
, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
224 numItem
= highestPos
- lowestPos
+ 1;
227 if (numItem
<= INSERT_SORT_NUM_ITEM
) {
228 fprintf(stderr
, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
233 m
= lowestPos
+ numItem
/ 2;
236 key1
= KEY(V
, I
, lowestPos
, numSortedChar
);
237 key2
= KEY(V
, I
, lowestPos
+s
, numSortedChar
);
238 key3
= KEY(V
, I
, lowestPos
+2*s
, numSortedChar
);
239 keyl
= med3(key1
, key2
, key3
);
240 key1
= KEY(V
, I
, m
-s
, numSortedChar
);
241 key2
= KEY(V
, I
, m
, numSortedChar
);
242 key3
= KEY(V
, I
, m
+s
, numSortedChar
);
243 keym
= med3(key1
, key2
, key3
);
244 key1
= KEY(V
, I
, highestPos
-2*s
, numSortedChar
);
245 key2
= KEY(V
, I
, highestPos
-s
, numSortedChar
);
246 key3
= KEY(V
, I
, highestPos
, numSortedChar
);
247 keyn
= med3(key1
, key2
, key3
);
249 return med3(keyl
, keym
, keyn
);
254 /* Quadratic sorting method to use for small subarrays. */
255 static void QSufSortInsertSortSplit(int* __restrict V
, int* __restrict I
, const int lowestPos
,
256 const int highestPos
, const int numSortedChar
) {
261 int key
[INSERT_SORT_NUM_ITEM
], pos
[INSERT_SORT_NUM_ITEM
];
262 int negativeSortedLength
;
266 if (lowestPos
> highestPos
) {
267 fprintf(stderr
, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
272 numItem
= highestPos
- lowestPos
+ 1;
275 if (numItem
> INSERT_SORT_NUM_ITEM
) {
276 fprintf(stderr
, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
281 for (i
=0; i
<numItem
; i
++) {
283 if (I
[lowestPos
+ i
] < 0) {
284 fprintf(stderr
, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
288 pos
[i
] = I
[lowestPos
+ i
];
289 key
[i
] = V
[pos
[i
] + numSortedChar
];
292 for (i
=1; i
<numItem
; i
++) {
295 for (j
=i
; j
>0 && key
[j
-1] > tmpKey
; j
--) {
303 negativeSortedLength
= -1;
306 groupNum
= highestPos
;
308 I
[i
+lowestPos
] = pos
[i
];
309 V
[I
[i
+lowestPos
]] = groupNum
;
310 if (key
[i
-1] == key
[i
]) {
311 negativeSortedLength
= 0;
313 if (negativeSortedLength
< 0) {
314 I
[i
+lowestPos
] = negativeSortedLength
;
316 groupNum
= i
+ lowestPos
- 1;
317 negativeSortedLength
--;
322 I
[lowestPos
] = pos
[0];
323 V
[I
[lowestPos
]] = groupNum
;
324 if (negativeSortedLength
< 0) {
325 I
[lowestPos
] = negativeSortedLength
;
330 /* Bucketsort for first iteration.
332 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
333 at least once. x[n] is 0. (This is the corresponding output of transform.) k
334 must be at most n+1. p is array of size n+1 whose contents are disregarded.
336 Output: x is V and p is I after the initial sorting stage of the refined
337 suffix sorting algorithm.*/
339 static void QSufSortBucketSort(int* __restrict V
, int* __restrict I
, const int numChar
, const int alphabetSize
) {
346 // mark linked list empty
347 for (i
=0; i
<alphabetSize
; i
++) {
351 // insert to linked list
352 for (i
=0; i
<=numChar
; i
++) {
358 currentIndex
= numChar
;
359 for (i
=alphabetSize
; i
>0; i
--) {
362 groupNum
= currentIndex
;
375 I
[currentIndex
] = -1;
382 /* Transforms the alphabet of x by attempting to aggregate several symbols into
383 one, while preserving the suffix order of x. The alphabet may also be
384 compacted, so that x on output comprises all integers of the new alphabet
385 with no skipped numbers.
387 Input: x is an array of size n+1 whose first n elements are positive
388 integers in the range l...k-1. p is array of size n+1, used for temporary
389 storage. q controls aggregation and compaction by defining the maximum intue
390 for any symbol during transformation: q must be at least k-l; if q<=n,
391 compaction is guaranteed; if k-l>n, compaction is never done; if q is
392 INT_MAX, the maximum number of symbols are aggregated into one.
394 Output: Returns an integer j in the range 1...q representing the size of the
395 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
396 set to the number of old symbols grouped into one. Only x[n] is 0.*/
397 static int QSufSortTransform(int* __restrict V
, int* __restrict I
, const int numChar
, const int largestInputSymbol
,
398 const int smallestInputSymbol
, const int maxNewAlphabetSize
, int *numSymbolAggregated
) {
401 int a
; // numSymbolAggregated
403 int minSymbolInChunk
= 0, maxSymbolInChunk
= 0;
405 int maxNumInputSymbol
, maxNumBit
, maxSymbol
;
407 maxNumInputSymbol
= largestInputSymbol
- smallestInputSymbol
+ 1;
409 maxNumBit
= BITS_IN_WORD
- leadingZero(maxNumInputSymbol
);
410 maxSymbol
= INT_MAX
>> maxNumBit
;
412 c
= maxNumInputSymbol
;
413 for (a
= 0; a
< numChar
&& maxSymbolInChunk
<= maxSymbol
&& c
<= maxNewAlphabetSize
; a
++) {
414 minSymbolInChunk
= (minSymbolInChunk
<< maxNumBit
) | (V
[a
] - smallestInputSymbol
+ 1);
415 maxSymbolInChunk
= c
;
416 c
= (maxSymbolInChunk
<< maxNumBit
) | maxNumInputSymbol
;
419 mask
= (1 << (a
-1) * maxNumBit
) - 1; /* mask masks off top old symbol from chunk.*/
420 V
[numChar
] = smallestInputSymbol
- 1; /* emulate zero terminator.*/
423 // Section of code for maxSymbolInChunk > numChar removed!
424 if (maxSymbolInChunk
> numChar
) {
425 fprintf(stderr
, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
430 /* bucketing possible, compact alphabet.*/
431 for (i
=0; i
<=maxSymbolInChunk
; i
++) {
432 I
[i
] = 0; /* zero transformation table.*/
434 c
= minSymbolInChunk
;
435 for (i
=a
; i
<=numChar
; i
++) {
436 I
[c
] = 1; /* mark used chunk symbol.*/
437 c
= ((c
& mask
) << maxNumBit
) | (V
[i
] - smallestInputSymbol
+ 1); /* shift in next old symbol in chunk.*/
439 for (i
=1; i
<a
; i
++) { /* handle last r-1 positions.*/
440 I
[c
] = 1; /* mark used chunk symbol.*/
441 c
= (c
& mask
) << maxNumBit
; /* shift in next old symbol in chunk.*/
444 for (i
=0; i
<=maxSymbolInChunk
; i
++) {
446 I
[i
] = newAlphabetSize
;
450 c
= minSymbolInChunk
;
451 for (i
=0, j
=a
; j
<=numChar
; i
++, j
++) {
452 V
[i
] = I
[c
]; /* transform to new alphabet.*/
453 c
= ((c
& mask
) << maxNumBit
) | (V
[j
] - smallestInputSymbol
+ 1); /* shift in next old symbol in chunk.*/
455 for (; i
<numChar
; i
++) { /* handle last a-1 positions.*/
456 V
[i
] = I
[c
]; /* transform to new alphabet.*/
457 c
= (c
& mask
) << maxNumBit
; /* shift right-end zero in chunk.*/
460 V
[numChar
] = 0; /* end-of-string symbol is zero.*/
462 *numSymbolAggregated
= a
;
463 return newAlphabetSize
;