Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / cld / encodings / compact_lang_det / compact_lang_det.cc
blobe5af2005df638f4ec135914d37ba2bfaa08fecf2
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/compact_lang_det.h"
6 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
7 #include "encodings/compact_lang_det/win/cld_basictypes.h"
// Version string, formatted as "code_version - data_scrape_date".
// Declared as a const array (not a reassignable pointer) so the symbol can
// never be re-pointed at run time; it still decays to const char* for callers.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
12 // Large-table version for all ~160 languages (all Tiers)
14 // Scan interchange-valid UTF-8 bytes and detect most likely language
15 Language CompactLangDet::DetectLanguage(
16 const DetectionTables* tables,
17 const char* buffer,
18 int buffer_length,
19 bool is_plain_text,
20 bool* is_reliable) {
21 bool allow_extended_lang = false;
22 Language language3[3];
23 int percent3[3];
24 double normalized_score3[3];
25 int text_bytes;
26 int flags = 0;
27 Language plus_one = UNKNOWN_LANGUAGE;
28 const char* tld_hint = "";
29 int encoding_hint = UNKNOWN_ENCODING;
30 Language language_hint = UNKNOWN_LANGUAGE;
32 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
33 tables,
34 buffer,
35 buffer_length,
36 is_plain_text,
37 tld_hint, // "id" boosts Indonesian
38 encoding_hint, // SJS boosts Japanese
39 language_hint, // ITALIAN boosts it
40 allow_extended_lang,
41 flags,
42 plus_one,
43 language3,
44 percent3,
45 normalized_score3,
46 &text_bytes,
47 is_reliable);
48 // Default to English.
49 if (lang == UNKNOWN_LANGUAGE) {
50 lang = ENGLISH;
52 return lang;
55 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
56 Language CompactLangDet::DetectLanguageSummary(
57 const DetectionTables* tables,
58 const char* buffer,
59 int buffer_length,
60 bool is_plain_text,
61 Language* language3,
62 int* percent3,
63 int* text_bytes,
64 bool* is_reliable) {
65 double normalized_score3[3];
66 bool allow_extended_lang = false;
67 int flags = 0;
68 Language plus_one = UNKNOWN_LANGUAGE;
69 const char* tld_hint = "";
70 int encoding_hint = UNKNOWN_ENCODING;
71 Language language_hint = UNKNOWN_LANGUAGE;
73 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
74 tables,
75 buffer,
76 buffer_length,
77 is_plain_text,
78 tld_hint, // "id" boosts Indonesian
79 encoding_hint, // SJS boosts Japanese
80 language_hint, // ITALIAN boosts it
81 allow_extended_lang,
82 flags,
83 plus_one,
84 language3,
85 percent3,
86 normalized_score3,
87 text_bytes,
88 is_reliable);
89 // Default to English
90 if (lang == UNKNOWN_LANGUAGE) {
91 lang = ENGLISH;
93 return lang;
96 // Same as above, with hints supplied
97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
98 Language CompactLangDet::DetectLanguageSummary(
99 const DetectionTables* tables,
100 const char* buffer,
101 int buffer_length,
102 bool is_plain_text,
103 const char* tld_hint, // "id" boosts Indonesian
104 int encoding_hint, // SJS boosts Japanese
105 Language language_hint, // ITALIAN boosts it
106 Language* language3,
107 int* percent3,
108 int* text_bytes,
109 bool* is_reliable) {
110 double normalized_score3[3];
111 bool allow_extended_lang = false;
112 int flags = 0;
113 Language plus_one = UNKNOWN_LANGUAGE;
115 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
116 tables,
117 buffer,
118 buffer_length,
119 is_plain_text,
120 tld_hint, // "id" boosts Indonesian
121 encoding_hint, // SJS boosts Japanese
122 language_hint, // ITALIAN boosts it
123 allow_extended_lang,
124 flags,
125 plus_one,
126 language3,
127 percent3,
128 normalized_score3,
129 text_bytes,
130 is_reliable);
131 // Default to English
132 if (lang == UNKNOWN_LANGUAGE) {
133 lang = ENGLISH;
135 return lang;
139 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
140 // languages.
141 // Extended languages are additional Google interface languages and Unicode
142 // single-language scripts, from ext_lang_enc.h
143 Language CompactLangDet::ExtDetectLanguageSummary(
144 const DetectionTables* tables,
145 const char* buffer,
146 int buffer_length,
147 bool is_plain_text,
148 Language* language3,
149 int* percent3,
150 int* text_bytes,
151 bool* is_reliable) {
152 double normalized_score3[3];
153 bool allow_extended_lang = true;
154 int flags = 0;
155 Language plus_one = UNKNOWN_LANGUAGE;
156 const char* tld_hint = "";
157 int encoding_hint = UNKNOWN_ENCODING;
158 Language language_hint = UNKNOWN_LANGUAGE;
160 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
161 tables,
162 buffer,
163 buffer_length,
164 is_plain_text,
165 tld_hint, // "id" boosts Indonesian
166 encoding_hint, // SJS boosts Japanese
167 language_hint, // ITALIAN boosts it
168 allow_extended_lang,
169 flags,
170 plus_one,
171 language3,
172 percent3,
173 normalized_score3,
174 text_bytes,
175 is_reliable);
176 // Do not default to English
177 return lang;
180 // Same as above, with hints supplied
181 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
182 // languages.
183 // Extended languages are additional Google interface languages and Unicode
184 // single-language scripts, from ext_lang_enc.h
185 Language CompactLangDet::ExtDetectLanguageSummary(
186 const DetectionTables* tables,
187 const char* buffer,
188 int buffer_length,
189 bool is_plain_text,
190 const char* tld_hint, // "id" boosts Indonesian
191 int encoding_hint, // SJS boosts Japanese
192 Language language_hint, // ITALIAN boosts it
193 Language* language3,
194 int* percent3,
195 int* text_bytes,
196 bool* is_reliable) {
197 double normalized_score3[3];
198 bool allow_extended_lang = true;
199 int flags = 0;
200 Language plus_one = UNKNOWN_LANGUAGE;
202 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
203 tables,
204 buffer,
205 buffer_length,
206 is_plain_text,
207 tld_hint, // "id" boosts Indonesian
208 encoding_hint, // SJS boosts Japanese
209 language_hint, // ITALIAN boosts it
210 allow_extended_lang,
211 flags,
212 plus_one,
213 language3,
214 percent3,
215 normalized_score3,
216 text_bytes,
217 is_reliable);
218 // Do not default to English
219 return lang;
222 // Same as above, and also returns internal language scores as a ratio to
223 // normal score for real text in that language. Scores close to 1.0 indicate
224 // normal text, while scores far away from 1.0 indicate badly-skewed text or
225 // gibberish
227 Language CompactLangDet::ExtDetectLanguageSummary(
228 const DetectionTables* tables,
229 const char* buffer,
230 int buffer_length,
231 bool is_plain_text,
232 const char* tld_hint, // "id" boosts Indonesian
233 int encoding_hint, // SJS boosts Japanese
234 Language language_hint, // ITALIAN boosts it
235 Language* language3,
236 int* percent3,
237 double* normalized_score3,
238 int* text_bytes,
239 bool* is_reliable) {
240 bool allow_extended_lang = true;
241 int flags = 0;
242 Language plus_one = UNKNOWN_LANGUAGE;
244 Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
245 tables,
246 buffer,
247 buffer_length,
248 is_plain_text,
249 tld_hint, // "id" boosts Indonesian
250 encoding_hint, // SJS boosts Japanese
251 language_hint, // ITALIAN boosts it
252 allow_extended_lang,
253 flags,
254 plus_one,
255 language3,
256 percent3,
257 normalized_score3,
258 text_bytes,
259 is_reliable);
260 // Do not default to English
261 return lang;
266 // Return version text string
267 // String is "code_version - data_scrape_date"
268 const char* CompactLangDet::DetectLanguageVersion() {
269 return kDetectLanguageVersion;