1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Remember a subset of a sequence of values, using a modest amount of memory
9 Accumulate in powers of three, using 3-way median to collapse entries.
10 At any given time, there is one most-dense (highest power of 3) range of
11 entries and a series of less-dense ranges that hold 0..2 entries each. There
12 is a bounded-size storage array of S cells for all the entries.
14 The overflow detect is set up so that a new higher power of 3, K+1, is
15 triggered precisely when range K has 3n entries and all ranges < K have
18 In general, think of the range sizes as a multi-digit base 3 number, except
19 the highest digit may exceed 2:
21 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
22 0 0 0 0 3n-1 2 2 unused:1
24 There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at
25 one more than that, and we add a new 3**0 entry and "carry" by performing
26 medians on any group of 3 elements:
28 3**6 3**5 3**4 3**3 3**2 3**1 3**0 K=2
29 0 0 0 0 3n-1 2 3 unused:0
30 0 0 0 0 3n-1 3 0 carry unused:2
31 0 0 0 0 3n 0 0 carry unused:4
33 To accumulate 2 entries at all levels < K and 3 just before the first carry at
34 level 0, we need 2*K + 1 unused cells after doing all carries, or five cells
35 in this case. Since we only have 4 cells in the example above, we need to
36 make room by starting a new power of three:
38 3**6 3**5 3**4 3**3 3**2 3**1 3**0
39 0 0 0 0 3n 0 0 K=2 unused:4
40 0 0 0 n 0 0 0 K=3 unused:2n+4
42 In the code below, we don't worry about overflow from the topmost place.
47 #include "encodings/compact_lang_det/subsetsequence.h"
50 #include "encodings/compact_lang_det/win/cld_logging.h"
52 void DumpInts(const char* label
, const int* v
, int n
) {
54 for (int i
= 0; i
< n
; ++i
) {
60 void DumpUint8s(const char* label
, const uint8
* v
, int n
) {
62 for (int i
= 0; i
< n
; ++i
) {
68 // Return median of seq_[sub] .. seq_[sub+2], favoring middle element
69 uint8
SubsetSequence::Median3(int sub
) {
70 if (seq_
[sub
] == seq_
[sub
+ 1]) {
73 if (seq_
[sub
] == seq_
[sub
+ 2]) {
79 void SubsetSequence::Init() {
85 seq_
[0] = 0; // Default value if no calls to Add
87 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
88 int reserve
= (2 * k_
+ 1);
89 level_limit_e_
= kMaxSeq_
- reserve
;
90 level_limit_e_
= (level_limit_e_
/ 3) * 3; // Round down to multiple of 3
91 limit_e_
= level_limit_e_
;
94 // Compress level k by 3x, creating level k+1
95 void SubsetSequence::NewLevel() {
96 // printf("NewLevel 3 ** %d\n", k_ + 1);
97 //DumpUint8s("count[k]", count_, k_ + 1);
98 //DumpUint8s("seq[next]", seq_, next_e_);
100 // Incoming level must be an exact multiple of three in size
101 CHECK((count_
[k_
] % 3) == 0);
102 int k_size
= count_
[k_
];
103 int new_size
= k_size
/ 3;
105 // Compress down by 3x, via median
106 for (int j
= 0; j
< new_size
; ++j
) {
107 seq_
[j
] = Median3(j
* 3);
112 // Else Overflow -- just continue with 3x dense Level K
113 if (k_
< (kMaxLevel_
- 1)) {++k_
;}
114 count_
[k_
] = new_size
;
118 limit_e_
= next_e_
+ 3;
120 // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3
121 int reserve
= (2 * k_
+ 1);
122 level_limit_e_
= kMaxSeq_
- reserve
;
123 level_limit_e_
= (level_limit_e_
/ 3) * 3; // Round down to multiple of 3
125 //DumpUint8s("after: count[k]", count_, k_ + 1);
126 //DumpUint8s("after: seq[next]", seq_, next_e_);
129 void SubsetSequence::DoCarries() {
130 CHECK(count_
[k_
] > 3); // We depend on count_[k_] being > 3 to stop while
131 // Make room by carrying
133 //DumpUint8s("DoCarries count[k]", count_, k_ + 1);
134 //DumpUint8s("DoCarries seq[next]", seq_, next_e_);
137 while (count_
[i
] == 3) {
139 seq_
[next_e_
] = Median3(next_e_
);
145 limit_e_
= next_e_
+ 3;
147 //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1);
148 //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_);
150 // If we just fully carried into level K,
151 // Make sure there is now enough room, else start level K + 1
153 CHECK(count_
[k_
] == next_e_
);
154 if (next_e_
>= level_limit_e_
) {
160 void SubsetSequence::Add(uint8 e
) {
161 // Add an entry then carry as needed
166 if (next_e_
>= limit_e_
) {
172 // Collapse tail end by simple median across disparate-weight values,
173 // dropping or duplicating last value if need be.
174 // This routine is idempotent.
175 void SubsetSequence::Flush() {
176 // printf("Flush %d\n", count_[k_]);
177 int start_tail
= count_
[k_
];
178 int size_tail
= next_e_
- start_tail
;
179 if ((size_tail
% 3) == 2) {
180 seq_
[next_e_
] = seq_
[next_e_
- 1]; // Duplicate last value
184 // Compress tail down by 3x, via median
185 int new_size
= size_tail
/ 3; // May delete last value
186 for (int j
= 0; j
< new_size
; ++j
) {
187 seq_
[start_tail
+ j
] = Median3(start_tail
+ j
* 3);
190 next_e_
= start_tail
+ new_size
;
191 count_
[k_
] = next_e_
;
195 // Extract representative pattern of exactly N values into dst[0..n-1]
196 // This routine may be called multiple times, but it may downsample as a
197 // side effect, causing subsequent calls with larger N to get poor answers.
198 void SubsetSequence::Extract(int to_n
, uint8
* dst
) {
199 // Collapse partial-carries in tail
202 // Just use Bresenham to resample
203 int from_n
= next_e_
;
204 if (to_n
>= from_n
) {
205 // Up-sample from_n => to_n
206 int err
= to_n
- 1; // bias toward no overshoot
208 for (int i
= 0; i
< to_n
; ++i
) {
217 // Get to the point that the number of samples is <= 3 * to_n
218 while (next_e_
> (to_n
* 3)) {
219 // Compress down by 3x, via median
220 // printf("Extract, median %d / 3\n", next_e_);
221 if ((next_e_
% 3) == 2) {
222 seq_
[next_e_
] = seq_
[next_e_
- 1]; // Duplicate last value
225 int new_size
= next_e_
/ 3; // May delete last value
226 for (int j
= 0; j
< new_size
; ++j
) {
227 seq_
[j
] = Median3(j
* 3);
230 count_
[k_
] = next_e_
;
234 if (to_n
== from_n
) {
236 for (int i
= 0; i
< to_n
; ++i
) {
242 // Down-sample from_n => to_n, using medians
243 int err
= 0; // Bias to immediate median sample
245 for (int i
= 0; i
< from_n
; ++i
) {
248 if (i
<= (next_e_
- 2)) {