modified: makefile
[GalaxyCodeBases.git] / tools / bwt / dcs-bwt / src / bwt_compress.cc
blob8ceb1fce3fb1eae7abff93927ea4d17f2f8993eb
1 // Copyright 2007 Google Inc.
2 //
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
7 //
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 #include <iostream>
18 #include <algorithm> // for min
19 #include <cstdio>
21 #include "bwt_compress.h"
22 #include "prefix_doubling.h"
23 //#include "inverse_bwt.h"
24 //#include "bbb_compress.h"
25 #include "stream.h"
27 namespace dcsbwt {
29 extern int verbosity;
31 namespace {
32 // Integer division that rounds up
33 // REQUIRE: both arguments are positive
34 uint64 DivideAndRoundUp(uint64 numerator, uint64 denominator) {
35 assert(numerator > 0);
36 assert(denominator > 0);
37 return 1 + (numerator - 1) / denominator;
39 } // namespace
41 //const int64 BwtCompressor::kMinMemoryBudget = (1LL << 24);
42 //const int64 BwtCompressor::kDefaultMemoryBudget = (1LL << 30);
43 //const int64 BwtCompressor::kMaxMemoryBudget = (2LL << 30);
45 void BwtCompressor::Compress(InStream* input, int64 input_size,
46 OutStream* output, Options options,
47 int64 max_block_size)
49 assert(input_size >= 0);
50 assert(max_block_size >= 0);
51 assert(max_block_size <= options.MaxBlockSize());
52 // Determine block sizes
53 // There are (at most) two sizes of blocks: small and large
54 int64 num_large_blocks, large_block_size;
55 int64 num_small_blocks, small_block_size;
56 if (0 == input_size) {
57 // Special case for empty input
58 num_large_blocks = num_small_blocks = 0;
59 large_block_size = small_block_size = 0;
60 } else if (max_block_size > 0) {
61 // All blocks except possibly the last one have size max_block_size.
62 large_block_size = std::min(max_block_size, input_size);
63 num_large_blocks = input_size / large_block_size;
64 small_block_size = input_size - num_large_blocks * large_block_size;
65 num_small_blocks = (small_block_size > 0) ? 1 : 0;
66 } else {
67 // max_block_size == 0: use SuggestedBlockSize()
68 max_block_size = options.SuggestedBlockSize();
69 // Minimize the number of blocks and make block sizes as even as possible
70 int64 num_blocks = DivideAndRoundUp(input_size, max_block_size);
71 large_block_size = DivideAndRoundUp(input_size, num_blocks);
72 small_block_size = large_block_size - 1;
73 num_small_blocks = num_blocks * large_block_size - input_size;
74 num_large_blocks = num_blocks - num_small_blocks;
77 assert(num_large_blocks >= 0);
78 assert(num_small_blocks >= 0);
79 assert(input_size == 0 || num_large_blocks > 0);
80 assert(0 <= small_block_size);
81 assert(small_block_size <= large_block_size);
82 assert(large_block_size <= max_block_size);
83 assert(large_block_size <= options.MaxBlockSize());
84 assert(num_small_blocks * small_block_size
85 + num_large_blocks * large_block_size == input_size);
87 if (verbosity > 0) {
88 std::clog << "Compressing " << input_size << " bytes using block size "
89 << large_block_size << std::endl;
92 // Compress
93 BwtCompressor compressor;
94 compressor.Connect(output, options);
95 std::vector<char> block(large_block_size, 0);
96 char* block_begin = &block[0];
97 while (num_large_blocks--) {
98 input->Read(block_begin, large_block_size);
99 // CompressBlock should not fail for this block size
100 bool ok = compressor.CompressBlock(block_begin, large_block_size);
101 assert(ok);
103 std::fill(block_begin + small_block_size, block_begin + large_block_size, 0);
104 while (num_small_blocks--) {
105 input->Read(block_begin, small_block_size);
106 bool ok = compressor.CompressBlock(block_begin, small_block_size);
107 assert(ok);
109 compressor.Disconnect();
112 void BwtCompressor::Connect(OutStream* output, Options options) {
113 assert(compressor_ == NULL);
114 rawout_.Connect(output);
115 rawout_->Write("BWT", 3);
116 compressor_ = StreamCompressor::Connect(output,
117 options.GetCompressionOptions());
118 assert(compressor_ != NULL);
119 options_ = options;
122 OutStream* BwtCompressor::Disconnect() {
123 assert(compressor_ != NULL);
124 compressor_->Disconnect(); compressor_ = NULL;
125 WriteEmptyBlock();
126 return rawout_.Disconnect();
129 bool BwtCompressor::CompressBlock(char* block, size_t block_size) {
130 assert(compressor_ != NULL);
131 assert(block_size > 0);
132 if (block_size > options_.MaxBlockSize()) { return false; }
134 if (verbosity > 2) {
135 std::clog << "Compressing block of size " << block_size << std::endl;
138 WriteUint63(block_size);
139 compressor_->WriteBegin();
140 uint64 eob_position =
141 BWTransformer::Transform(block, block_size,
142 compressor_,
143 options_.GetTransformOptions(),
144 options_.GetMemoryBudget());
145 compressor_->WriteEnd();
146 WriteUint63(eob_position);
147 return true;
150 void BwtCompressor::WriteEmptyBlock() { WriteUint63(0); }
152 void BwtCompressor::WriteUint63(int64 value) {
153 if (options_.UsePrintableCharacters()) return WriteUint63Printable(value);
154 assert(value >= 0);
155 char bytes[9];
156 int position = 9;
157 unsigned char byte = value & 127;
158 value >>= 7;
159 bytes[--position] = byte;
160 while (value > 0) {
161 byte = (value & 127) | 128;
162 value >>= 7;
163 assert(position > 0);
164 bytes[--position] = byte;
166 rawout_->Write(bytes + position, 9 - position);
169 void BwtCompressor::WriteUint63Printable(int64 value) {
170 // Write value in decimal ended by newline.
171 assert(value >= 0);
172 char bytes[12];
173 int length = sprintf(bytes, "%lld", value);
174 bytes[length] = '\n'; // Replace '\0' with '\n'
175 rawout_->Write(bytes, length + 1);
179 ///////////////////////////// BwtDecompressor ///////////////////////////////
181 int64 BwtDecompressor::Decompress(InStreamBuffer* input, OutStream* output,
182 Options options) {
183 BwtDecompressor decompressor;
184 bool success = decompressor.Connect(input);
185 if (!success) return decompressor.GetInputError();
186 while (decompressor.HasData()) {
187 success = decompressor.DecompressBlock(output, options);
188 if (!success) {
189 if (decompressor.GetInputError()) {
190 return decompressor.GetInputError();
191 } else {
192 int64 too_large_block_size =
193 decompressor.GetUncompressedSizeOfNextBlock();
194 decompressor.Disconnect();
195 return too_large_block_size;
199 decompressor.Disconnect();
200 return 0;
203 bool BwtDecompressor::Connect(InStreamBuffer* input) {
204 assert(!low_memory_failure_);
205 input_error_ = 0; // clear previous errors
206 rawin_.Connect(input);
207 // Read the initial "BWT"
208 char magic_number[3];
209 rawin_->Read(magic_number, 3);
210 if (magic_number[0] != 'B' || magic_number[1] != 'W'
211 || magic_number[2] != 'T') {
212 SetError(-1);
213 return false;
215 assert(decompressor_ == NULL);
216 decompressor_ = StreamDecompressor::Connect(input);
217 if (!decompressor_) {
218 SetError(-1);
219 return false;
221 use_printable_characters_ = ('t' == (decompressor_->GetFormat())[0]);
222 uncompressed_block_size_ = ReadUint63();
223 if (uncompressed_block_size_ < 0) {
224 SetError(-1);
225 return false;
227 return true;
230 InStreamBuffer* BwtDecompressor::Disconnect() {
231 assert(decompressor_ != NULL);
232 decompressor_->Disconnect(); decompressor_ = NULL;
233 uncompressed_block_size_ = 0;
234 low_memory_failure_ = false;
235 return rawin_.Disconnect();
238 bool BwtDecompressor::DecompressBlock(OutStream* output, Options options) {
239 assert(uncompressed_block_size_ > 0);
240 assert(decompressor_ != NULL);
241 int64 max_block_size = options.MaxBlockSize(decompressor_->SizeInBytes());
242 if (uncompressed_block_size_ > max_block_size) {
243 assert(!low_memory_failure_);
244 low_memory_failure_ = true;
245 return false;
246 } else {
247 low_memory_failure_ = false;
249 std::vector<char> bwt(uncompressed_block_size_ + 1);
250 decompressor_->ReadBegin();
251 if (verbosity > 1) {
252 std::clog << "Decompressing a block of size " << uncompressed_block_size_
253 << std::endl;
255 decompressor_->Read(&bwt[0], bwt.size());
256 decompressor_->ReadEnd();
257 int64 eob_position = ReadUint63();
258 if (eob_position < 0 || eob_position > uncompressed_block_size_) {
259 SetError(-1);
260 return false;
262 if ( ! InverseBWTransformer::Transform(&bwt[0], bwt.size(),
263 eob_position, output,
264 options.GetTransformOptions(),
265 options.GetMemoryBudget())) {
266 SetError(-1);
267 return false;
270 // Get ready for the next block.
271 uncompressed_block_size_ = ReadUint63();
272 if (uncompressed_block_size_ < 0) {
273 SetError(-1);
274 return false;
276 return true;
279 // Read an integer value in the range [0,2^63) from the input
280 int64 BwtDecompressor::ReadUint63() {
281 if (use_printable_characters_) return ReadUint63Printable();
282 unsigned char byte;
283 int64 value = 0;
284 int num_bytes = 0; // used only for checking
285 do {
286 assert(++num_bytes <= 9);
287 byte = rawin_->ReadByte();
288 value = (value << 7) + (byte & 127);
289 } while (byte > 127);
290 return value;
293 // Read an integer value in the range [0,2^63) from the input
294 int64 BwtDecompressor::ReadUint63Printable() {
295 unsigned char byte;
296 char bytes[32];
297 int num_bytes = 0; // used only for checking
298 do {
299 assert(num_bytes < 32);
300 byte = rawin_->ReadByte();
301 bytes[num_bytes++] = byte;
302 } while ('\n' != byte);
303 char* next;
304 int64 value = strtoll(bytes, &next, 10);
305 return value;
308 // Store the error and clean up
309 void BwtDecompressor::SetError(int error) {
310 assert(error < 0);
311 input_error_ = error;
312 uncompressed_block_size_ = 0;
313 low_memory_failure_ = false;
314 if (decompressor_) {
315 decompressor_->Disconnect(); decompressor_ = NULL;
317 rawin_.Disconnect();
320 } // namespace dcsbwt