1 // Copyright 2007 Google Inc.
3 // This program is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU General Public License
5 // as published by the Free Software Foundation; either version 2
6 // of the License, or (at your option) any later version.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 // Two algorithms for the inverse Burrows-Wheeler transform.
18 // 1. SimpleInverseBWTransformer: This is the algorithm basis in
19 // Seward: Space-time tradeoffs in the Inverse B-W Transform.
20 // 2. FastInverseBWTransformer: This is the algorithm mergeTL
21 // in Sewards paper with some additional heuristics to be
22 // able to handle very large blocks.
24 #include "inverse_bwt.h"
30 #include <numeric> // for partial_sum
36 //////////////////////////////////////////////////////////////////
37 // SimpleInverseBWTransformer
38 //////////////////////////////////////////////////////////////////
40 class SimpleInverseBWTransformer
: public InverseBWTransformer
{
42 class Options
: public InverseBWTransformer::AlgorithmSpecificOptions
{
47 virtual bool Set(const std::string
& options
) {
48 return 0 == options
.length();
50 virtual std::string
Get() const { return std::string(); }
51 virtual int64
MaxBlockSize(int64 memory_budget
) const {
52 return memory_budget
/ 9;
55 virtual InverseBWTransformer
* GetTransformer(int64 memory_budget
) {
56 return new SimpleInverseBWTransformer
;
60 SimpleInverseBWTransformer() {}
61 virtual ~SimpleInverseBWTransformer() {}
64 virtual bool DoTransform(const char* bwt
, size_t bwt_size
,
65 size_t eob_position
, OutStream
* output
);
67 SimpleInverseBWTransformer(const SimpleInverseBWTransformer
&);
68 SimpleInverseBWTransformer
& operator=(const SimpleInverseBWTransformer
&);
72 bool SimpleInverseBWTransformer::DoTransform(
73 const char* bwt
, size_t bwt_size
,
74 size_t eob_position
, OutStream
* output
)
76 // rank[i] will be the number of occurrences of bwt[i] in bwt[0..i-1]
77 std::vector
<uint32
> rank(bwt_size
, 0);
79 // count[] serves two purposes:
80 // 1. When the scan of the BWT reaches position i,
81 // count[c+1] is the number of occurrences of c in bwt[0..i-1]
82 // (count[0] counts the EOB symbol).
83 // 2. During the ouput generation, count[c] is the total number
84 // of characters smaller than c (including the single EOB symbol).
85 std::vector
<uint32
> count(257, 0);
87 // count characters before EOB
88 for (uint32 position
= 0; position
< eob_position
; ++position
) {
89 int ch
= static_cast<unsigned char>(bwt
[position
]);
90 rank
[position
] = count
[ch
+ 1];
94 rank
[eob_position
] = 0;
96 // count characters after EOB
97 for (uint32 position
= eob_position
+ 1; position
< bwt_size
; ++position
) {
98 int ch
= static_cast<unsigned char>(bwt
[position
]);
99 rank
[position
] = count
[ch
+ 1];
102 std::partial_sum(count
.begin(), count
.end(), count
.begin());
103 assert(count
[256] == bwt_size
);
105 OutStreamBuffer buffer
;
106 buffer
.Connect(output
);
108 uint32 string_length
= 0;
109 while (position
!= eob_position
) {
110 int ch
= static_cast<unsigned char>(bwt
[position
]);
111 buffer
.WriteByte(ch
);
113 position
= count
[ch
] + rank
[position
];
115 // If the BWT or the EOB position contain errors (or are garbage),
116 // it is likely that the cycle ends prematurely.
117 // In this case, we return false.
119 return (string_length
+ 1 == bwt_size
);
122 //////////////////////////////////////////////////////////////////
123 // FastInverseBWTransformer
124 //////////////////////////////////////////////////////////////////
126 class FastInverseBWTransformer
: public InverseBWTransformer
{
128 class Options
: public InverseBWTransformer::AlgorithmSpecificOptions
{
131 virtual ~Options() {}
133 virtual bool Set(const std::string
& options
) {
134 return 0 == options
.length();
136 virtual std::string
Get() const { return std::string(); }
137 virtual int64
MaxBlockSize(int64 memory_budget
) const {
138 memory_budget
-= kMemoryOverhead
;
139 return memory_budget
/ sizeof(uint32
);
142 virtual InverseBWTransformer
* GetTransformer(int64 memory_budget
) {
143 return new FastInverseBWTransformer
;
146 static const int64 kMemoryOverhead
= 1 << 20;
149 FastInverseBWTransformer() {}
150 virtual ~FastInverseBWTransformer() {}
153 virtual bool DoTransform(const char* bwt
, size_t bwt_size
,
154 size_t eob_position
, OutStream
* output
);
156 FastInverseBWTransformer(const FastInverseBWTransformer
&);
157 FastInverseBWTransformer
& operator=(const FastInverseBWTransformer
&);
160 bool FastInverseBWTransformer::DoTransform(
161 const char* bwt
, size_t bwt_size
,
162 size_t eob_position
, OutStream
* output
)
164 // rank[i] will be the number of occurrences of bwt[i] in bwt[0..i-1]
165 std::vector
<uint32
> bwt_rank_low24(bwt_size
);
166 std::vector
<uint32
> rank_milestone_buffer
[256];
168 // count[] serves two purposes:
169 // 1. When the scan of the BWT reaches position i,
170 // count[c+1] is the number of occurrences of c in bwt[0..i-1]
171 // (count[0] counts the EOB symbol).
172 // 2. During the ouput generation, count[c] is the total number
173 // of characters smaller than c (including the single EOB symbol).
174 std::vector
<uint32
> count(257, 0);
177 bwt_rank_low24
[eob_position
] = 0;
179 // count other characters
180 for (uint32 position
= 0; position
< bwt_size
; ++position
) {
181 if (position
!= eob_position
) {
182 uint32 ch
= static_cast<unsigned char>(bwt
[position
]);
183 uint32 rank
= count
[ch
+ 1];
184 uint32 rank_low24
= rank
& 0x00FFFFFF;
185 //uint32 rank_hi8 = rank >> 24;
186 bwt_rank_low24
[position
] = (ch
<< 24) + rank_low24
;
187 if (0 == rank_low24
) {
188 rank_milestone_buffer
[ch
].push_back(position
);
193 std::partial_sum(count
.begin(), count
.end(), count
.begin());
194 assert(count
[256] == bwt_size
);
196 for (int ch
= 0; ch
< 256; ++ch
) {
197 rank_milestone_buffer
[ch
].push_back(bwt_size
);
200 OutStreamBuffer buffer
;
201 buffer
.Connect(output
);
203 uint32 string_length
= 0;
204 while (position
!= eob_position
) {
205 uint32 bwt_and_rank
= bwt_rank_low24
[position
];
206 uint32 ch
= bwt_and_rank
>> 24;
207 buffer
.WriteByte(ch
);
209 uint32 rank_low24
= bwt_and_rank
& 0x00FFFFFF;
211 while (rank_milestone_buffer
[ch
][rank_hi8
+ 1] <= position
) ++rank_hi8
;
212 position
= count
[ch
] + (rank_hi8
<< 24) + rank_low24
;
214 // If the BWT or the EOB position contain errors (or are garbage),
215 // it is likely that the cycle ends prematurely.
216 // In this case, we return false.
218 return (string_length
+ 1 == bwt_size
);
223 bool InverseBWTransformer::Options::Set(const std::string
& options_string
) {
224 if (options_string
.length() < 1) return false;
225 char algorithm
= options_string
[0];
226 AlgorithmSpecificOptions
* options
;
229 options
= new SimpleInverseBWTransformer::Options
;
232 options
= new FastInverseBWTransformer::Options
;
237 std::string
algorithm_specific_options_string(options_string
, 1);
238 bool options_are_valid
= options
->Set(algorithm_specific_options_string
);
239 if (options_are_valid
) {
240 algorithm_id_
= algorithm
;
241 if (algorithm_specific_options_
) delete algorithm_specific_options_
;
242 algorithm_specific_options_
= options
;
246 return options_are_valid
;
249 bool InverseBWTransformer::Transform(
250 const char* bwt_block
, size_t bwt_block_size
,
251 size_t eob_position
, OutStream
* output
,
255 assert(bwt_block_size
<= options
.MaxBlockSize(memory_budget
));
256 InverseBWTransformer
* algorithm
= options
.GetAlgorithm(memory_budget
);
257 bool success
= algorithm
->DoTransform(bwt_block
, bwt_block_size
,
264 } // namespace dcsbwt