2 qsufsort.h -- Suffix array generation.
4 Copyright 2003 Colin Percival
6 For the terms under which this work may be distributed, please see
7 the adjoining file "LICENSE".
10 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit
12 --Benjamin Smedberg <benjamin@smedbergs.us>
13 2010-05-26 - Use a paged array for V and I. The address space may be too
14 fragmented for these big arrays to be contiguous.
15 --Stephen Adams <sra@chromium.org>
16 2015-08-03 - Extrat qsufsort to a separate file as template.
17 --Samuel Huang <huangs@chromium.org>
18 2015-08-19 - Optimize split() and search(), add comments.
19 --Samuel Huang <huangs@chromium.org>
28 // ------------------------------------------------------------------------
30 // The following code is taken verbatim from 'bsdiff.c'. Please keep all the
31 // code formatting and variable names. The changes from the original are:
32 // (1) replacing tabs with spaces,
35 // (4) changing the V and I parameters from int* to template <typename T>.
36 // (5) optimizing split() and search(); fix styles.
38 // The code appears to be a rewritten version of the suffix array algorithm
39 // presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko
40 // Sadakane, special cased for bytes.
42 template <typename T
> static void split(T I
, T V
, int start
, int end
, int h
) {
43 // For small interval, apply selection sort.
44 if (end
- start
< 6) {
45 for (int i
= start
; i
< end
; ) {
47 int best
= V
[I
[i
] + h
];
48 for (int j
= i
+ 1; j
< end
; j
++) {
49 int cur
= V
[I
[j
] + h
];
56 } else if (best
== cur
) {
57 int tmp
= I
[i
+ skip
];
67 for (int j
= i
, jend
= i
+ skip
; j
< jend
; j
++)
75 // Split [start, end) into 3 intervals:
76 // [start, j) with secondary keys < pivot,
77 // [j, k) with secondary keys == pivot,
78 // [k, end) with secondary keys > pivot.
79 int pivot
= V
[I
[(start
+ end
) / 2] + h
];
82 for (int i
= start
; i
< k
; ) {
83 int cur
= V
[I
[i
] + h
];
92 } else if (cur
> pivot
) {
102 // Recurse on the "< pivot" piece.
104 split
<T
>(I
, V
, start
, j
, h
);
106 // Update the "== pivot" piece.
111 for (int i
= j
; i
< k
; ++i
)
115 // Recurse on the "> pivot" piece.
117 split
<T
>(I
, V
, k
, end
, h
);
122 qsufsort(T I
, T V
,const unsigned char *old
,int oldsize
)
127 for(i
=0;i
<256;i
++) buckets
[i
]=0;
128 for(i
=0;i
<oldsize
;i
++) buckets
[old
[i
]]++;
129 for(i
=1;i
<256;i
++) buckets
[i
]+=buckets
[i
-1];
130 for(i
=255;i
>0;i
--) buckets
[i
]=buckets
[i
-1];
133 for(i
=0;i
<oldsize
;i
++) I
[++buckets
[old
[i
]]]=i
;
135 for(i
=0;i
<oldsize
;i
++) V
[i
]=buckets
[old
[i
]];
137 for(i
=1;i
<256;i
++) if(buckets
[i
]==buckets
[i
-1]+1) I
[buckets
[i
]]=-1;
140 for(h
=1;I
[0]!=-(oldsize
+1);h
+=h
) {
142 for(i
=0;i
<oldsize
+1;) {
147 if(len
) I
[i
-len
]=-len
;
149 split
<T
>(I
,V
,i
,i
+len
,h
);
154 if(len
) I
[i
-len
]=-len
;
157 for(i
=0;i
<oldsize
+1;i
++) I
[V
[i
]]=i
;
161 matchlen(const unsigned char *old
,int oldsize
,const unsigned char *newbuf
,int newsize
)
165 for(i
=0;(i
<oldsize
)&&(i
<newsize
);i
++)
166 if(old
[i
]!=newbuf
[i
]) break;
172 static int search(T I
, const unsigned char *old
, int oldsize
,
173 const unsigned char *newbuf
, int newsize
, int *pos
) {
176 while (hi
- lo
>= 2) {
177 int mid
= (lo
+ hi
) / 2;
178 if(memcmp(old
+I
[mid
],newbuf
,std::min(oldsize
-I
[mid
],newsize
))<0) {
185 int x
= matchlen(old
+ I
[lo
], oldsize
- I
[lo
], newbuf
, newsize
);
186 int y
= matchlen(old
+ I
[hi
], oldsize
- I
[hi
], newbuf
, newsize
);
195 // End of 'verbatim' code.
196 // ------------------------------------------------------------------------
199 } // namespace courgette