1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/tote.h"
6 #include <string.h> // memset
8 #include "encodings/compact_lang_det/win/cld_logging.h"
11 // Take a set of <key, value> pairs and tote them up.
12 // After explicitly sorting, retrieve top key, value pairs
17 memset(key_
, 0, sizeof(key_
));
18 // No need to initialize values
28 memset(key_
, 0, sizeof(key_
));
29 // No need to initialize values
32 // Increment count of quadgrams/trigrams/unigrams scored
33 void Tote::AddGram() {
37 // Three-way associative, guaranteeing that the largest two counts are always
38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
39 // subscript calculations here, which are for 8 sets of 3-way associative
40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever
42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin
43 // script), this entry matches and we update/return. The second probe is
44 // the other of [N] and [N+8]. The third probe is only used as a fallback to
45 // these two, and is there only for the rare case that there are three or more
46 // languages with Language enum values equal mod 8, contending within the same
47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
48 // the other scripts have fewer than 17 languages total.
49 // If you change kMaxSize, change the constants 7/8/15/16 below
50 void Tote::Add(uint8 ikey
, int idelta
) {
54 // Look for existing entry
56 if (key_
[sub0
] == ikey
) {
57 value_
[sub0
] += idelta
;
61 if (key_
[sub1
] == ikey
) {
62 value_
[sub1
] += idelta
;
65 int sub2
= (ikey
& 7) + 16;
66 if (key_
[sub2
] == ikey
) {
67 value_
[sub2
] += idelta
;
73 if (key_
[sub0
] == 0) {
75 } else if (key_
[sub1
] == 0) {
77 } else if (key_
[sub2
] == 0) {
80 // All choices allocated, need to replace smallest one
82 if (value_
[sub1
] < value_
[alloc
]) {alloc
= sub1
;}
83 if (value_
[sub2
] < value_
[alloc
]) {alloc
= sub2
;}
86 value_
[alloc
] = idelta
;
90 // Return current top key
91 int Tote::CurrentTopKey() {
94 for (int sub
= 0; sub
< kMaxSize_
; ++sub
) {
95 if (key_
[sub
] == 0) {continue;}
96 if (top_value
< value_
[sub
]) {
97 top_value
= value_
[sub
];
105 // Sort first n entries by decreasing order of value
106 // If key==0 other fields are not valid, treat value as -1
107 void Tote::Sort(int n
) {
108 // This is n**2, but n is small
109 for (int sub
= 0; sub
< n
; ++sub
) {
110 if (key_
[sub
] == 0) {value_
[sub
] = -1;}
112 // Bubble sort key[sub] and entry[sub]
113 for (int sub2
= sub
+ 1; sub2
< kMaxSize_
; ++sub2
) {
114 if (key_
[sub2
] == 0) {value_
[sub2
] = -1;}
115 if (value_
[sub
] < value_
[sub2
]) {
117 uint8 tmpk
= key_
[sub
];
118 key_
[sub
] = key_
[sub2
];
120 int tmpv
= value_
[sub
];
121 value_
[sub
] = value_
[sub2
];
128 void Tote::Dump(FILE* f
) {
129 for (int sub
= 0; sub
< kMaxSize_
; ++sub
) {
131 fprintf(f
, "[%2d] %3d %8d\n", sub
, key_
[sub
], value_
[sub
]);
134 fprintf(f
, "%d %d %d\n", gram_count_
, incr_count_
, byte_count_
);
140 // Take a set of <key, value> pairs and tote them up.
141 // After explicitly sorting, retrieve top key, value pairs
142 ToteWithReliability::ToteWithReliability() {
143 // No need to initialize score_ or value_
146 memset(closepair_
, 0, sizeof(closepair_
));
147 memset(key_
, 0, sizeof(key_
));
150 ToteWithReliability::~ToteWithReliability() {
153 void ToteWithReliability::Reinit() {
154 // No need to initialize score_ or value_
157 memset(closepair_
, 0, sizeof(closepair_
));
158 memset(key_
, 0, sizeof(key_
));
162 // Weight reliability by ibytes
163 // Also see three-way associative comments above for Tote
164 void ToteWithReliability::Add(uint8 ikey
, int ibytes
,
165 int score
, int ireliability
) {
170 // Look for existing entry
171 int sub0
= ikey
& 15;
172 if (key_
[sub0
] == ikey
) {
173 value_
[sub0
] += ibytes
;
174 score_
[sub0
] += score
;
175 reliability_
[sub0
] += ireliability
* ibytes
;
179 if (key_
[sub1
] == ikey
) {
180 value_
[sub1
] += ibytes
;
181 score_
[sub1
] += score
;
182 reliability_
[sub1
] += ireliability
* ibytes
;
185 int sub2
= (ikey
& 7) + 16;
186 if (key_
[sub2
] == ikey
) {
187 value_
[sub2
] += ibytes
;
188 score_
[sub2
] += score
;
189 reliability_
[sub2
] += ireliability
* ibytes
;
193 // Allocate new entry
195 if (key_
[sub0
] == 0) {
197 } else if (key_
[sub1
] == 0) {
199 } else if (key_
[sub2
] == 0) {
202 // All choices allocated, need to replace smallest one
204 if (value_
[sub1
] < value_
[alloc
]) {alloc
= sub1
;}
205 if (value_
[sub2
] < value_
[alloc
]) {alloc
= sub2
;}
208 value_
[alloc
] = ibytes
;
209 score_
[alloc
] = score
;
210 reliability_
[alloc
] = ireliability
* ibytes
;
214 // Find subscript of a given packed language, or -1
215 int ToteWithReliability::Find(uint8 ikey
) {
219 // Linear search if sorted
220 for (int sub
= 0; sub
< kMaxSize_
; ++sub
) {
221 if (key_
[sub
] == ikey
) {return sub
;}
226 // Look for existing entry
227 int sub0
= ikey
& 15;
228 if (key_
[sub0
] == ikey
) {
232 if (key_
[sub1
] == ikey
) {
235 int sub2
= (ikey
& 7) + 16;
236 if (key_
[sub2
] == ikey
) {
243 // Return current top key
244 int ToteWithReliability::CurrentTopKey() {
247 for (int sub
= 0; sub
< kMaxSize_
; ++sub
) {
248 if (key_
[sub
] == 0) {continue;}
249 if (top_value
< value_
[sub
]) {
250 top_value
= value_
[sub
];
258 // Sort first n entries by decreasing order of value
259 // If key==0 other fields are not valid, treat value as -1
260 void ToteWithReliability::Sort(int n
) {
261 // This is n**2, but n is small
262 for (int sub
= 0; sub
< n
; ++sub
) {
263 if (key_
[sub
] == 0) {value_
[sub
] = -1;}
265 // Bubble sort key[sub] and entry[sub]
266 for (int sub2
= sub
+ 1; sub2
< kMaxSize_
; ++sub2
) {
267 if (key_
[sub2
] == 0) {value_
[sub2
] = -1;}
268 if (value_
[sub
] < value_
[sub2
]) {
270 uint8 tmpk
= key_
[sub
];
271 key_
[sub
] = key_
[sub2
];
274 int tmpv
= value_
[sub
];
275 value_
[sub
] = value_
[sub2
];
278 int tmps
= score_
[sub
];
279 score_
[sub
] = score_
[sub2
];
282 int tmpr
= reliability_
[sub
];
283 reliability_
[sub
] = reliability_
[sub2
];
284 reliability_
[sub2
] = tmpr
;
291 void ToteWithReliability::Dump(FILE* f
) {
292 for (int sub
= 0; sub
< kMaxSize_
; ++sub
) {
294 fprintf(f
, "[%2d] %3d %6d %5d %4d\n",
295 sub
, key_
[sub
], value_
[sub
], score_
[sub
], reliability_
[sub
]);
298 fprintf(f
, " %d#\n", incr_count_
);