1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 #include <algorithm> // for min
21 #include "bwt_compress.h"
22 #include "prefix_doubling.h"
23 //#include "inverse_bwt.h"
24 //#include "bbb_compress.h"
32 // Integer division that rounds up
33 // REQUIRE: both arguments are positive
34 uint64
DivideAndRoundUp(uint64 numerator
, uint64 denominator
) {
35 assert(numerator
> 0);
36 assert(denominator
> 0);
37 return 1 + (numerator
- 1) / denominator
;
41 //const int64 BwtCompressor::kMinMemoryBudget = (1LL << 24);
42 //const int64 BwtCompressor::kDefaultMemoryBudget = (1LL << 30);
43 //const int64 BwtCompressor::kMaxMemoryBudget = (2LL << 30);
45 void BwtCompressor::Compress(InStream
* input
, int64 input_size
,
46 OutStream
* output
, Options options
,
49 assert(input_size
>= 0);
50 assert(max_block_size
>= 0);
51 assert(max_block_size
<= options
.MaxBlockSize());
52 // Determine block sizes
53 // There are (at most) two sizes of blocks: small and large
54 int64 num_large_blocks
, large_block_size
;
55 int64 num_small_blocks
, small_block_size
;
56 if (0 == input_size
) {
57 // Special case for empty input
58 num_large_blocks
= num_small_blocks
= 0;
59 large_block_size
= small_block_size
= 0;
60 } else if (max_block_size
> 0) {
61 // All blocks except possibly the last one have size max_block_size.
62 large_block_size
= std::min(max_block_size
, input_size
);
63 num_large_blocks
= input_size
/ large_block_size
;
64 small_block_size
= input_size
- num_large_blocks
* large_block_size
;
65 num_small_blocks
= (small_block_size
> 0) ? 1 : 0;
67 // max_block_size == 0: use SuggestedBlockSize()
68 max_block_size
= options
.SuggestedBlockSize();
69 // Minimize the number of blocks and make block sizes as even as possible
70 int64 num_blocks
= DivideAndRoundUp(input_size
, max_block_size
);
71 large_block_size
= DivideAndRoundUp(input_size
, num_blocks
);
72 small_block_size
= large_block_size
- 1;
73 num_small_blocks
= num_blocks
* large_block_size
- input_size
;
74 num_large_blocks
= num_blocks
- num_small_blocks
;
77 assert(num_large_blocks
>= 0);
78 assert(num_small_blocks
>= 0);
79 assert(input_size
== 0 || num_large_blocks
> 0);
80 assert(0 <= small_block_size
);
81 assert(small_block_size
<= large_block_size
);
82 assert(large_block_size
<= max_block_size
);
83 assert(large_block_size
<= options
.MaxBlockSize());
84 assert(num_small_blocks
* small_block_size
85 + num_large_blocks
* large_block_size
== input_size
);
88 std::clog
<< "Compressing " << input_size
<< " bytes using block size "
89 << large_block_size
<< std::endl
;
93 BwtCompressor compressor
;
94 compressor
.Connect(output
, options
);
95 std::vector
<char> block(large_block_size
, 0);
96 char* block_begin
= &block
[0];
97 while (num_large_blocks
--) {
98 input
->Read(block_begin
, large_block_size
);
99 // CompressBlock should not fail for this block size
100 bool ok
= compressor
.CompressBlock(block_begin
, large_block_size
);
103 std::fill(block_begin
+ small_block_size
, block_begin
+ large_block_size
, 0);
104 while (num_small_blocks
--) {
105 input
->Read(block_begin
, small_block_size
);
106 bool ok
= compressor
.CompressBlock(block_begin
, small_block_size
);
109 compressor
.Disconnect();
112 void BwtCompressor::Connect(OutStream
* output
, Options options
) {
113 assert(compressor_
== NULL
);
114 rawout_
.Connect(output
);
115 rawout_
->Write("BWT", 3);
116 compressor_
= StreamCompressor::Connect(output
,
117 options
.GetCompressionOptions());
118 assert(compressor_
!= NULL
);
122 OutStream
* BwtCompressor::Disconnect() {
123 assert(compressor_
!= NULL
);
124 compressor_
->Disconnect(); compressor_
= NULL
;
126 return rawout_
.Disconnect();
129 bool BwtCompressor::CompressBlock(char* block
, size_t block_size
) {
130 assert(compressor_
!= NULL
);
131 assert(block_size
> 0);
132 if (block_size
> options_
.MaxBlockSize()) { return false; }
135 std::clog
<< "Compressing block of size " << block_size
<< std::endl
;
138 WriteUint63(block_size
);
139 compressor_
->WriteBegin();
140 uint64 eob_position
=
141 BWTransformer::Transform(block
, block_size
,
143 options_
.GetTransformOptions(),
144 options_
.GetMemoryBudget());
145 compressor_
->WriteEnd();
146 WriteUint63(eob_position
);
150 void BwtCompressor::WriteEmptyBlock() { WriteUint63(0); }
152 void BwtCompressor::WriteUint63(int64 value
) {
153 if (options_
.UsePrintableCharacters()) return WriteUint63Printable(value
);
157 unsigned char byte
= value
& 127;
159 bytes
[--position
] = byte
;
161 byte
= (value
& 127) | 128;
163 assert(position
> 0);
164 bytes
[--position
] = byte
;
166 rawout_
->Write(bytes
+ position
, 9 - position
);
169 void BwtCompressor::WriteUint63Printable(int64 value
) {
170 // Write value in decimal ended by newline.
173 int length
= sprintf(bytes
, "%lld", value
);
174 bytes
[length
] = '\n'; // Replace '\0' with '\n'
175 rawout_
->Write(bytes
, length
+ 1);
179 ///////////////////////////// BwtDecompressor ///////////////////////////////
181 int64
BwtDecompressor::Decompress(InStreamBuffer
* input
, OutStream
* output
,
183 BwtDecompressor decompressor
;
184 bool success
= decompressor
.Connect(input
);
185 if (!success
) return decompressor
.GetInputError();
186 while (decompressor
.HasData()) {
187 success
= decompressor
.DecompressBlock(output
, options
);
189 if (decompressor
.GetInputError()) {
190 return decompressor
.GetInputError();
192 int64 too_large_block_size
=
193 decompressor
.GetUncompressedSizeOfNextBlock();
194 decompressor
.Disconnect();
195 return too_large_block_size
;
199 decompressor
.Disconnect();
203 bool BwtDecompressor::Connect(InStreamBuffer
* input
) {
204 assert(!low_memory_failure_
);
205 input_error_
= 0; // clear previous errors
206 rawin_
.Connect(input
);
207 // Read the initial "BWT"
208 char magic_number
[3];
209 rawin_
->Read(magic_number
, 3);
210 if (magic_number
[0] != 'B' || magic_number
[1] != 'W'
211 || magic_number
[2] != 'T') {
215 assert(decompressor_
== NULL
);
216 decompressor_
= StreamDecompressor::Connect(input
);
217 if (!decompressor_
) {
221 use_printable_characters_
= ('t' == (decompressor_
->GetFormat())[0]);
222 uncompressed_block_size_
= ReadUint63();
223 if (uncompressed_block_size_
< 0) {
230 InStreamBuffer
* BwtDecompressor::Disconnect() {
231 assert(decompressor_
!= NULL
);
232 decompressor_
->Disconnect(); decompressor_
= NULL
;
233 uncompressed_block_size_
= 0;
234 low_memory_failure_
= false;
235 return rawin_
.Disconnect();
238 bool BwtDecompressor::DecompressBlock(OutStream
* output
, Options options
) {
239 assert(uncompressed_block_size_
> 0);
240 assert(decompressor_
!= NULL
);
241 int64 max_block_size
= options
.MaxBlockSize(decompressor_
->SizeInBytes());
242 if (uncompressed_block_size_
> max_block_size
) {
243 assert(!low_memory_failure_
);
244 low_memory_failure_
= true;
247 low_memory_failure_
= false;
249 std::vector
<char> bwt(uncompressed_block_size_
+ 1);
250 decompressor_
->ReadBegin();
252 std::clog
<< "Decompressing a block of size " << uncompressed_block_size_
255 decompressor_
->Read(&bwt
[0], bwt
.size());
256 decompressor_
->ReadEnd();
257 int64 eob_position
= ReadUint63();
258 if (eob_position
< 0 || eob_position
> uncompressed_block_size_
) {
262 if ( ! InverseBWTransformer::Transform(&bwt
[0], bwt
.size(),
263 eob_position
, output
,
264 options
.GetTransformOptions(),
265 options
.GetMemoryBudget())) {
270 // Get ready for the next block.
271 uncompressed_block_size_
= ReadUint63();
272 if (uncompressed_block_size_
< 0) {
279 // Read an integer value in the range [0,2^63) from the input
280 int64
BwtDecompressor::ReadUint63() {
281 if (use_printable_characters_
) return ReadUint63Printable();
284 int num_bytes
= 0; // used only for checking
286 assert(++num_bytes
<= 9);
287 byte
= rawin_
->ReadByte();
288 value
= (value
<< 7) + (byte
& 127);
289 } while (byte
> 127);
293 // Read an integer value in the range [0,2^63) from the input
294 int64
BwtDecompressor::ReadUint63Printable() {
297 int num_bytes
= 0; // used only for checking
299 assert(num_bytes
< 32);
300 byte
= rawin_
->ReadByte();
301 bytes
[num_bytes
++] = byte
;
302 } while ('\n' != byte
);
304 int64 value
= strtoll(bytes
, &next
, 10);
308 // Store the error and clean up
309 void BwtDecompressor::SetError(int error
) {
311 input_error_
= error
;
312 uncompressed_block_size_
= 0;
313 low_memory_failure_
= false;
315 decompressor_
->Disconnect(); decompressor_
= NULL
;
320 } // namespace dcsbwt