Roll src/third_party/WebKit eac3800:0237a66 (svn 202606:202607)
[chromium-blink-merge.git] / courgette / third_party / qsufsort.h
blob399768c474ecf1e76e53a5d46861460fb24d192c
1 /*
2 qsufsort.h -- Suffix array generation.
4 Copyright 2003 Colin Percival
6 For the terms under which this work may be distributed, please see
7 the adjoining file "LICENSE".
9 ChangeLog:
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
11 values throughout.
12 --Benjamin Smedberg <benjamin@smedbergs.us>
13 2010-05-26 - Use a paged array for V and I. The address space may be too
14 fragmented for these big arrays to be contiguous.
15 --Stephen Adams <sra@chromium.org>
16 2015-08-03 - Extrat qsufsort to a separate file as template.
17 --Samuel Huang <huangs@chromium.org>
18 2015-08-19 - Optimize split() and search(), add comments.
19 --Samuel Huang <huangs@chromium.org>
22 #include <algorithm>
23 #include <cstring>
25 namespace courgette {
26 namespace qsuf {
28 // ------------------------------------------------------------------------
30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the
31 // code formatting and variable names. The changes from the original are:
32 // (1) replacing tabs with spaces,
33 // (2) indentation,
34 // (3) using 'const',
35 // (4) changing the V and I parameters from int* to template <typename T>.
36 // (5) optimizing split() and search(); fix styles.
38 // The code appears to be a rewritten version of the suffix array algorithm
39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
40 // Sadakane, special cased for bytes.
42 template <typename T> static void split(T I, T V, int start, int end, int h) {
43 // For small interval, apply selection sort.
44 if (end - start < 6) {
45 for (int i = start; i < end; ) {
46 int skip = 1;
47 int best = V[I[i] + h];
48 for (int j = i + 1; j < end; j++) {
49 int cur = V[I[j] + h];
50 if (best > cur) {
51 best = cur;
52 int tmp = I[i];
53 I[i] = I[j];
54 I[j] = tmp;
55 skip = 1;
56 } else if (best == cur) {
57 int tmp = I[i + skip];
58 I[i + skip] = I[j];
59 I[j] = tmp;
60 ++skip;
63 if (skip == 1) {
64 V[I[i]] = i;
65 I[i] = -1;
66 } else {
67 for (int j = i, jend = i + skip; j < jend; j++)
68 V[I[j]] = jend - 1;
70 i += skip;
72 return;
75 // Split [start, end) into 3 intervals:
76 // [start, j) with secondary keys < pivot,
77 // [j, k) with secondary keys == pivot,
78 // [k, end) with secondary keys > pivot.
79 int pivot = V[I[(start + end) / 2] + h];
80 int j = start;
81 int k = end;
82 for (int i = start; i < k; ) {
83 int cur = V[I[i] + h];
84 if (cur < pivot) {
85 if (i != j) {
86 int tmp = I[i];
87 I[i] = I[j];
88 I[j] = tmp;
90 ++i;
91 ++j;
92 } else if (cur > pivot) {
93 --k;
94 int tmp = I[i];
95 I[i] = I[k];
96 I[k] = tmp;
97 } else {
98 ++i;
102 // Recurse on the "< pivot" piece.
103 if (start < j)
104 split<T>(I, V, start, j, h);
106 // Update the "== pivot" piece.
107 if (j == k - 1) {
108 V[I[j]] = j;
109 I[j] = -1;
110 } else {
111 for (int i = j; i < k; ++i)
112 V[I[i]] = k - 1;
115 // Recurse on the "> pivot" piece.
116 if (k < end)
117 split<T>(I, V, k, end, h);
120 template <class T>
121 static void
122 qsufsort(T I, T V,const unsigned char *old,int oldsize)
124 int buckets[256];
125 int i,h,len;
127 for(i=0;i<256;i++) buckets[i]=0;
128 for(i=0;i<oldsize;i++) buckets[old[i]]++;
129 for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
130 for(i=255;i>0;i--) buckets[i]=buckets[i-1];
131 buckets[0]=0;
133 for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i;
134 I[0]=oldsize;
135 for(i=0;i<oldsize;i++) V[i]=buckets[old[i]];
136 V[oldsize]=0;
137 for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1;
138 I[0]=-1;
140 for(h=1;I[0]!=-(oldsize+1);h+=h) {
141 len=0;
142 for(i=0;i<oldsize+1;) {
143 if(I[i]<0) {
144 len-=I[i];
145 i-=I[i];
146 } else {
147 if(len) I[i-len]=-len;
148 len=V[I[i]]+1-i;
149 split<T>(I,V,i,i+len,h);
150 i+=len;
151 len=0;
154 if(len) I[i-len]=-len;
157 for(i=0;i<oldsize+1;i++) I[V[i]]=i;
160 static int
161 matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize)
163 int i;
165 for(i=0;(i<oldsize)&&(i<newsize);i++)
166 if(old[i]!=newbuf[i]) break;
168 return i;
171 template <class T>
172 static int search(T I, const unsigned char *old, int oldsize,
173 const unsigned char *newbuf, int newsize, int *pos) {
174 int lo = 0;
175 int hi = oldsize;
176 while (hi - lo >= 2) {
177 int mid = (lo + hi) / 2;
178 if(memcmp(old+I[mid],newbuf,std::min(oldsize-I[mid],newsize))<0) {
179 lo = mid;
180 } else {
181 hi = mid;
185 int x = matchlen(old + I[lo], oldsize - I[lo], newbuf, newsize);
186 int y = matchlen(old + I[hi], oldsize - I[hi], newbuf, newsize);
187 if(x > y) {
188 *pos = I[lo];
189 return x;
191 *pos = I[hi];
192 return y;
195 // End of 'verbatim' code.
196 // ------------------------------------------------------------------------
198 } // namespace qsuf
199 } // namespace courgette