modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / QSufSort.c
blob99452d9d32b5c221585719d6d65ac1102219bdef
1 /* QSufSort.c
3 Original source from qsufsort.c
5 Copyright 1999, N. Jesper Larsson, all rights reserved.
7 This file contains an implementation of the algorithm presented in "Faster
8 Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9 Sadakane (sada@is.s.u-tokyo.ac.jp).
11 This software may be used freely for any purpose. However, when distributed,
12 the original source must be clearly stated, and, when the source code is
13 distributed, the copyright notice must be retained and any alterations in
14 the code must be clearly marked. No warranty is given regarding the quality
15 of this software.
17 Modified by Wong Chi-Kwong, 2004
19 Changes summary: - Used long variable and function names
20 - Removed global variables
21 - Replace pointer references with array references
22 - Used insertion sort in place of selection sort and increased insertion sort threshold
23 - Reconstructing suffix array from inverse becomes an option
24 - Add handling where end-of-text symbol is not necessary < all characters
25 - Removed codes for supporting alphabet size > number of characters
27 No warrenty is given regarding the quality of the modifications.
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include "QSufSort.h"
35 #include "MiscUtilities.h"
37 // Static functions
38 static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
39 const int highestPos, const int numSortedChar);
40 static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
41 const int highestPos, const int numSortedChar);
42 static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
43 const int highestPos, const int numSortedChar);
44 static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize);
45 static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
46 const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated);
49 /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
50 n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
51 contents of x[n] is disregarded, the n-th symbol being regarded as
52 end-of-string smaller than all other symbols.*/
53 void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
54 const int smallestInputSymbol, const int skipTransform) {
56 int i, j;
57 int s, negatedSortedGroupLength;
58 int numSymbolAggregated;
59 int maxNumInputSymbol;
60 int numSortedPos = 1;
61 int newAlphabetSize;
63 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
65 if (!skipTransform) {
66 /* bucketing possible*/
67 newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
68 numChar, &numSymbolAggregated);
69 QSufSortBucketSort(V, I, numChar, newAlphabetSize);
70 I[0] = -1;
71 V[numChar] = 0;
72 numSortedPos = numSymbolAggregated;
75 while ((int)(I[0]) >= -(int)numChar) {
76 i = 0;
77 negatedSortedGroupLength = 0;
78 do {
79 s = I[i];
80 if (s < 0) {
81 i -= s; /* skip over sorted group.*/
82 negatedSortedGroupLength += s;
83 } else {
84 if (negatedSortedGroupLength) {
85 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */
86 negatedSortedGroupLength = 0;
88 j = V[s] + 1;
89 QSufSortSortSplit(V, I, i, j - 1, numSortedPos);
90 i = j;
92 } while (i <= numChar);
93 if (negatedSortedGroupLength) {
94 /* array ends with a sorted group.*/
95 I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/
97 numSortedPos *= 2; /* double sorted-depth.*/
102 void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) {
104 int i;
105 for (i=0; i<=numChar; i++) {
106 I[V[i]] = i + 1;
111 /* Sorting routine called for each unsorted group. Sorts the array of integers
112 (suffix numbers) of length n starting at p. The algorithm is a ternary-split
113 quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
114 Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
115 function is based on Program 7.*/
116 static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
117 const int highestPos, const int numSortedChar) {
119 int a, b, c, d;
120 int l, m;
121 int f, v, s, t;
122 int tmp;
123 int numItem;
125 #ifdef DEBUG
126 if (lowestPos > highestPos) {
127 fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n");
128 exit(1);
130 #endif
132 numItem = highestPos - lowestPos + 1;
134 if (numItem <= INSERT_SORT_NUM_ITEM) {
135 QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar);
136 return;
139 v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar);
141 a = b = lowestPos;
142 c = d = highestPos;
144 while (TRUE) {
145 while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
146 if (f == v) {
147 swap(I[a], I[b], tmp);
148 a++;
150 b++;
152 while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) {
153 if (f == v) {
154 swap(I[c], I[d], tmp);
155 d--;
157 c--;
159 if (b > c) {
160 break;
162 swap(I[b], I[c], tmp);
163 b++;
164 c--;
167 s = a - lowestPos;
168 t = b - a;
169 s = min(s, t);
170 for (l = lowestPos, m = b - s; m < b; l++, m++) {
171 swap(I[l], I[m], tmp);
174 s = d - c;
175 t = highestPos - d;
176 s = min(s, t);
177 for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) {
178 swap(I[l], I[m], tmp);
181 s = b - a;
182 t = d - c;
183 if (s > 0) {
184 QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
187 // Update group number for equal portion
188 a = lowestPos + s;
189 b = highestPos - t;
190 if (a == b) {
191 // Sorted group
192 V[I[a]] = a;
193 I[a] = -1;
194 } else {
195 // Unsorted group
196 for (c=a; c<=b; c++) {
197 V[I[c]] = b;
201 if (t > 0) {
202 QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
207 /* Algorithm by Bentley & McIlroy.*/
208 static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
209 const int highestPos, const int numSortedChar) {
211 int m;
212 int keyl, keym, keyn;
213 int key1, key2, key3;
214 int s;
215 int numItem;
217 #ifdef DEBUG
218 if (lowestPos > highestPos) {
219 fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
220 exit(1);
222 #endif
224 numItem = highestPos - lowestPos + 1;
226 #ifdef DEBUG
227 if (numItem <= INSERT_SORT_NUM_ITEM) {
228 fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
229 exit(1);
231 #endif
233 m = lowestPos + numItem / 2;
235 s = numItem / 8;
236 key1 = KEY(V, I, lowestPos, numSortedChar);
237 key2 = KEY(V, I, lowestPos+s, numSortedChar);
238 key3 = KEY(V, I, lowestPos+2*s, numSortedChar);
239 keyl = med3(key1, key2, key3);
240 key1 = KEY(V, I, m-s, numSortedChar);
241 key2 = KEY(V, I, m, numSortedChar);
242 key3 = KEY(V, I, m+s, numSortedChar);
243 keym = med3(key1, key2, key3);
244 key1 = KEY(V, I, highestPos-2*s, numSortedChar);
245 key2 = KEY(V, I, highestPos-s, numSortedChar);
246 key3 = KEY(V, I, highestPos, numSortedChar);
247 keyn = med3(key1, key2, key3);
249 return med3(keyl, keym, keyn);
254 /* Quadratic sorting method to use for small subarrays. */
255 static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
256 const int highestPos, const int numSortedChar) {
258 int i, j;
259 int tmpKey, tmpPos;
260 int numItem;
261 int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
262 int negativeSortedLength;
263 int groupNum;
265 #ifdef DEBUG
266 if (lowestPos > highestPos) {
267 fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
268 exit(1);
270 #endif
272 numItem = highestPos - lowestPos + 1;
274 #ifdef DEBUG
275 if (numItem > INSERT_SORT_NUM_ITEM) {
276 fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
277 exit(1);
279 #endif
281 for (i=0; i<numItem; i++) {
282 #ifdef DEBUG
283 if (I[lowestPos + i] < 0) {
284 fprintf(stderr, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
285 exit(1);
287 #endif
288 pos[i] = I[lowestPos + i];
289 key[i] = V[pos[i] + numSortedChar];
292 for (i=1; i<numItem; i++) {
293 tmpKey = key[i];
294 tmpPos = pos[i];
295 for (j=i; j>0 && key[j-1] > tmpKey; j--) {
296 key[j] = key[j-1];
297 pos[j] = pos[j-1];
299 key[j] = tmpKey;
300 pos[j] = tmpPos;
303 negativeSortedLength = -1;
305 i = numItem - 1;
306 groupNum = highestPos;
307 while (i > 0) {
308 I[i+lowestPos] = pos[i];
309 V[I[i+lowestPos]] = groupNum;
310 if (key[i-1] == key[i]) {
311 negativeSortedLength = 0;
312 } else {
313 if (negativeSortedLength < 0) {
314 I[i+lowestPos] = negativeSortedLength;
316 groupNum = i + lowestPos - 1;
317 negativeSortedLength--;
319 i--;
322 I[lowestPos] = pos[0];
323 V[I[lowestPos]] = groupNum;
324 if (negativeSortedLength < 0) {
325 I[lowestPos] = negativeSortedLength;
330 /* Bucketsort for first iteration.
332 Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
333 at least once. x[n] is 0. (This is the corresponding output of transform.) k
334 must be at most n+1. p is array of size n+1 whose contents are disregarded.
336 Output: x is V and p is I after the initial sorting stage of the refined
337 suffix sorting algorithm.*/
339 static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) {
341 int i, c;
342 int d;
343 int groupNum;
344 int currentIndex;
346 // mark linked list empty
347 for (i=0; i<alphabetSize; i++) {
348 I[i] = -1;
351 // insert to linked list
352 for (i=0; i<=numChar; i++) {
353 c = V[i];
354 V[i] = (int)(I[c]);
355 I[c] = i;
358 currentIndex = numChar;
359 for (i=alphabetSize; i>0; i--) {
360 c = I[i-1];
361 d = (int)(V[c]);
362 groupNum = currentIndex;
363 V[c] = groupNum;
364 if (d >= 0) {
365 I[currentIndex] = c;
366 while (d >= 0) {
367 c = d;
368 d = V[c];
369 V[c] = groupNum;
370 currentIndex--;
371 I[currentIndex] = c;
373 } else {
374 // sorted group
375 I[currentIndex] = -1;
377 currentIndex--;
382 /* Transforms the alphabet of x by attempting to aggregate several symbols into
383 one, while preserving the suffix order of x. The alphabet may also be
384 compacted, so that x on output comprises all integers of the new alphabet
385 with no skipped numbers.
387 Input: x is an array of size n+1 whose first n elements are positive
388 integers in the range l...k-1. p is array of size n+1, used for temporary
389 storage. q controls aggregation and compaction by defining the maximum intue
390 for any symbol during transformation: q must be at least k-l; if q<=n,
391 compaction is guaranteed; if k-l>n, compaction is never done; if q is
392 INT_MAX, the maximum number of symbols are aggregated into one.
394 Output: Returns an integer j in the range 1...q representing the size of the
395 new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
396 set to the number of old symbols grouped into one. Only x[n] is 0.*/
397 static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
398 const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) {
400 int c, i, j;
401 int a; // numSymbolAggregated
402 int mask;
403 int minSymbolInChunk = 0, maxSymbolInChunk = 0;
404 int newAlphabetSize;
405 int maxNumInputSymbol, maxNumBit, maxSymbol;
407 maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
409 maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol);
410 maxSymbol = INT_MAX >> maxNumBit;
412 c = maxNumInputSymbol;
413 for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
414 minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1);
415 maxSymbolInChunk = c;
416 c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol;
419 mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
420 V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
422 #ifdef DEBUG
423 // Section of code for maxSymbolInChunk > numChar removed!
424 if (maxSymbolInChunk > numChar) {
425 fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
426 exit(1);
428 #endif
430 /* bucketing possible, compact alphabet.*/
431 for (i=0; i<=maxSymbolInChunk; i++) {
432 I[i] = 0; /* zero transformation table.*/
434 c = minSymbolInChunk;
435 for (i=a; i<=numChar; i++) {
436 I[c] = 1; /* mark used chunk symbol.*/
437 c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
439 for (i=1; i<a; i++) { /* handle last r-1 positions.*/
440 I[c] = 1; /* mark used chunk symbol.*/
441 c = (c & mask) << maxNumBit; /* shift in next old symbol in chunk.*/
443 newAlphabetSize = 1;
444 for (i=0; i<=maxSymbolInChunk; i++) {
445 if (I[i]) {
446 I[i] = newAlphabetSize;
447 newAlphabetSize++;
450 c = minSymbolInChunk;
451 for (i=0, j=a; j<=numChar; i++, j++) {
452 V[i] = I[c]; /* transform to new alphabet.*/
453 c = ((c & mask) << maxNumBit) | (V[j] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/
455 for (; i<numChar; i++) { /* handle last a-1 positions.*/
456 V[i] = I[c]; /* transform to new alphabet.*/
457 c = (c & mask) << maxNumBit; /* shift right-end zero in chunk.*/
460 V[numChar] = 0; /* end-of-string symbol is zero.*/
462 *numSymbolAggregated = a;
463 return newAlphabetSize;