modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / inverse_bwt.cc
blob61a4cb84f3d3a136c81227fee71fe48a64de59f5
1 // Copyright 2007 Google Inc.
2 //
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Two algorithms for the inverse Burrows-Wheeler transform.
18 // 1. SimpleInverseBWTransformer: This is the algorithm basis in
19 // Seward: Space-time tradeoffs in the Inverse B-W Transform.
20 // 2. FastInverseBWTransformer: This is the algorithm mergeTL
21 // in Sewards paper with some additional heuristics to be
22 // able to handle very large blocks.
24 #include "inverse_bwt.h"
25 #include "stream.h"
27 #include <iostream>
28 #include <string>
29 #include <vector>
30 #include <numeric> // for partial_sum
32 namespace dcsbwt {
34 namespace {
36 //////////////////////////////////////////////////////////////////
37 // SimpleInverseBWTransformer
38 //////////////////////////////////////////////////////////////////
40 class SimpleInverseBWTransformer : public InverseBWTransformer {
41 public:
42 class Options : public InverseBWTransformer::AlgorithmSpecificOptions {
43 public:
44 Options() {}
45 virtual ~Options() {}
47 virtual bool Set(const std::string& options) {
48 return 0 == options.length();
50 virtual std::string Get() const { return std::string(); }
51 virtual int64 MaxBlockSize(int64 memory_budget) const {
52 return memory_budget / 9;
55 virtual InverseBWTransformer* GetTransformer(int64 memory_budget) {
56 return new SimpleInverseBWTransformer;
60 SimpleInverseBWTransformer() {}
61 virtual ~SimpleInverseBWTransformer() {}
63 private:
64 virtual bool DoTransform(const char* bwt, size_t bwt_size,
65 size_t eob_position, OutStream* output);
67 SimpleInverseBWTransformer(const SimpleInverseBWTransformer&);
68 SimpleInverseBWTransformer& operator=(const SimpleInverseBWTransformer&);
72 bool SimpleInverseBWTransformer::DoTransform(
73 const char* bwt, size_t bwt_size,
74 size_t eob_position, OutStream* output)
76 // rank[i] will be the number of occurrences of bwt[i] in bwt[0..i-1]
77 std::vector<uint32> rank(bwt_size, 0);
79 // count[] serves two purposes:
80 // 1. When the scan of the BWT reaches position i,
81 // count[c+1] is the number of occurrences of c in bwt[0..i-1]
82 // (count[0] counts the EOB symbol).
83 // 2. During the ouput generation, count[c] is the total number
84 // of characters smaller than c (including the single EOB symbol).
85 std::vector<uint32> count(257, 0);
87 // count characters before EOB
88 for (uint32 position = 0; position < eob_position; ++position) {
89 int ch = static_cast<unsigned char>(bwt[position]);
90 rank[position] = count[ch + 1];
91 ++count[ch + 1];
93 // count EOB
94 rank[eob_position] = 0;
95 count[0] = 1;
96 // count characters after EOB
97 for (uint32 position = eob_position + 1; position < bwt_size; ++position) {
98 int ch = static_cast<unsigned char>(bwt[position]);
99 rank[position] = count[ch + 1];
100 ++count[ch + 1];
102 std::partial_sum(count.begin(), count.end(), count.begin());
103 assert(count[256] == bwt_size);
105 OutStreamBuffer buffer;
106 buffer.Connect(output);
107 uint32 position = 0;
108 uint32 string_length = 0;
109 while (position != eob_position) {
110 int ch = static_cast<unsigned char>(bwt[position]);
111 buffer.WriteByte(ch);
112 ++string_length;
113 position = count[ch] + rank[position];
115 // If the BWT or the EOB position contain errors (or are garbage),
116 // it is likely that the cycle ends prematurely.
117 // In this case, we return false.
118 buffer.Disconnect();
119 return (string_length + 1 == bwt_size);
122 //////////////////////////////////////////////////////////////////
123 // FastInverseBWTransformer
124 //////////////////////////////////////////////////////////////////
126 class FastInverseBWTransformer : public InverseBWTransformer {
127 public:
128 class Options : public InverseBWTransformer::AlgorithmSpecificOptions {
129 public:
130 Options() {}
131 virtual ~Options() {}
133 virtual bool Set(const std::string& options) {
134 return 0 == options.length();
136 virtual std::string Get() const { return std::string(); }
137 virtual int64 MaxBlockSize(int64 memory_budget) const {
138 memory_budget -= kMemoryOverhead;
139 return memory_budget / sizeof(uint32);
142 virtual InverseBWTransformer* GetTransformer(int64 memory_budget) {
143 return new FastInverseBWTransformer;
145 private:
146 static const int64 kMemoryOverhead = 1 << 20;
149 FastInverseBWTransformer() {}
150 virtual ~FastInverseBWTransformer() {}
152 private:
153 virtual bool DoTransform(const char* bwt, size_t bwt_size,
154 size_t eob_position, OutStream* output);
156 FastInverseBWTransformer(const FastInverseBWTransformer&);
157 FastInverseBWTransformer& operator=(const FastInverseBWTransformer&);
160 bool FastInverseBWTransformer::DoTransform(
161 const char* bwt, size_t bwt_size,
162 size_t eob_position, OutStream* output)
164 // rank[i] will be the number of occurrences of bwt[i] in bwt[0..i-1]
165 std::vector<uint32> bwt_rank_low24(bwt_size);
166 std::vector<uint32> rank_milestone_buffer[256];
168 // count[] serves two purposes:
169 // 1. When the scan of the BWT reaches position i,
170 // count[c+1] is the number of occurrences of c in bwt[0..i-1]
171 // (count[0] counts the EOB symbol).
172 // 2. During the ouput generation, count[c] is the total number
173 // of characters smaller than c (including the single EOB symbol).
174 std::vector<uint32> count(257, 0);
176 // count EOB
177 bwt_rank_low24[eob_position] = 0;
178 count[0] = 1;
179 // count other characters
180 for (uint32 position = 0; position < bwt_size; ++position) {
181 if (position != eob_position) {
182 uint32 ch = static_cast<unsigned char>(bwt[position]);
183 uint32 rank = count[ch + 1];
184 uint32 rank_low24 = rank & 0x00FFFFFF;
185 //uint32 rank_hi8 = rank >> 24;
186 bwt_rank_low24[position] = (ch << 24) + rank_low24;
187 if (0 == rank_low24) {
188 rank_milestone_buffer[ch].push_back(position);
190 ++count[ch + 1];
193 std::partial_sum(count.begin(), count.end(), count.begin());
194 assert(count[256] == bwt_size);
196 for (int ch = 0; ch < 256; ++ch) {
197 rank_milestone_buffer[ch].push_back(bwt_size);
200 OutStreamBuffer buffer;
201 buffer.Connect(output);
202 uint32 position = 0;
203 uint32 string_length = 0;
204 while (position != eob_position) {
205 uint32 bwt_and_rank = bwt_rank_low24[position];
206 uint32 ch = bwt_and_rank >> 24;
207 buffer.WriteByte(ch);
208 ++string_length;
209 uint32 rank_low24 = bwt_and_rank & 0x00FFFFFF;
210 uint32 rank_hi8 = 0;
211 while (rank_milestone_buffer[ch][rank_hi8 + 1] <= position) ++rank_hi8;
212 position = count[ch] + (rank_hi8 << 24) + rank_low24;
214 // If the BWT or the EOB position contain errors (or are garbage),
215 // it is likely that the cycle ends prematurely.
216 // In this case, we return false.
217 buffer.Disconnect();
218 return (string_length + 1 == bwt_size);
221 } // namespace
223 bool InverseBWTransformer::Options::Set(const std::string& options_string) {
224 if (options_string.length() < 1) return false;
225 char algorithm = options_string[0];
226 AlgorithmSpecificOptions* options;
227 switch (algorithm) {
228 case 's':
229 options = new SimpleInverseBWTransformer::Options;
230 break;
231 case 'f':
232 options = new FastInverseBWTransformer::Options;
233 break;
234 default:
235 return false;
237 std::string algorithm_specific_options_string(options_string, 1);
238 bool options_are_valid = options->Set(algorithm_specific_options_string);
239 if (options_are_valid) {
240 algorithm_id_ = algorithm;
241 if (algorithm_specific_options_) delete algorithm_specific_options_;
242 algorithm_specific_options_ = options;
243 } else {
244 delete options;
246 return options_are_valid;
249 bool InverseBWTransformer::Transform(
250 const char* bwt_block, size_t bwt_block_size,
251 size_t eob_position, OutStream* output,
252 Options options,
253 int64 memory_budget)
255 assert(bwt_block_size <= options.MaxBlockSize(memory_budget));
256 InverseBWTransformer* algorithm = options.GetAlgorithm(memory_budget);
257 bool success = algorithm->DoTransform(bwt_block, bwt_block_size,
258 eob_position,
259 output);
260 delete algorithm;
261 return success;
264 } // namespace dcsbwt