1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "encodings/compact_lang_det/compact_lang_det.h"
6 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
7 #include "encodings/compact_lang_det/win/cld_basictypes.h"
// Version string reported by CompactLangDet::DetectLanguageVersion().
// String is "code_version - data_scrape_date".
// Declared as a const array rather than a `const char*` so that neither the
// characters nor the binding itself can be modified after initialization; it
// still decays to `const char*` wherever a pointer is expected.
static const char kDetectLanguageVersion[] = "V1.6 - 20081121";
12 // Large-table version for all ~160 languages (all Tiers)
14 // Scan interchange-valid UTF-8 bytes and detect most likely language
15 Language
CompactLangDet::DetectLanguage(
16 const DetectionTables
* tables
,
21 bool allow_extended_lang
= false;
22 Language language3
[3];
24 double normalized_score3
[3];
27 Language plus_one
= UNKNOWN_LANGUAGE
;
28 const char* tld_hint
= "";
29 int encoding_hint
= UNKNOWN_ENCODING
;
30 Language language_hint
= UNKNOWN_LANGUAGE
;
32 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
37 tld_hint
, // "id" boosts Indonesian
38 encoding_hint
, // SJS boosts Japanese
39 language_hint
, // ITALIAN boosts it
48 // Default to English.
49 if (lang
== UNKNOWN_LANGUAGE
) {
55 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
56 Language
CompactLangDet::DetectLanguageSummary(
57 const DetectionTables
* tables
,
65 double normalized_score3
[3];
66 bool allow_extended_lang
= false;
68 Language plus_one
= UNKNOWN_LANGUAGE
;
69 const char* tld_hint
= "";
70 int encoding_hint
= UNKNOWN_ENCODING
;
71 Language language_hint
= UNKNOWN_LANGUAGE
;
73 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
78 tld_hint
, // "id" boosts Indonesian
79 encoding_hint
, // SJS boosts Japanese
80 language_hint
, // ITALIAN boosts it
90 if (lang
== UNKNOWN_LANGUAGE
) {
96 // Same as above, with hints supplied
97 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
98 Language
CompactLangDet::DetectLanguageSummary(
99 const DetectionTables
* tables
,
103 const char* tld_hint
, // "id" boosts Indonesian
104 int encoding_hint
, // SJS boosts Japanese
105 Language language_hint
, // ITALIAN boosts it
110 double normalized_score3
[3];
111 bool allow_extended_lang
= false;
113 Language plus_one
= UNKNOWN_LANGUAGE
;
115 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
120 tld_hint
, // "id" boosts Indonesian
121 encoding_hint
, // SJS boosts Japanese
122 language_hint
, // ITALIAN boosts it
131 // Default to English
132 if (lang
== UNKNOWN_LANGUAGE
) {
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
143 Language
CompactLangDet::ExtDetectLanguageSummary(
144 const DetectionTables
* tables
,
152 double normalized_score3
[3];
153 bool allow_extended_lang
= true;
155 Language plus_one
= UNKNOWN_LANGUAGE
;
156 const char* tld_hint
= "";
157 int encoding_hint
= UNKNOWN_ENCODING
;
158 Language language_hint
= UNKNOWN_LANGUAGE
;
160 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
165 tld_hint
, // "id" boosts Indonesian
166 encoding_hint
, // SJS boosts Japanese
167 language_hint
, // ITALIAN boosts it
176 // Do not default to English
180 // Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
185 Language
CompactLangDet::ExtDetectLanguageSummary(
186 const DetectionTables
* tables
,
190 const char* tld_hint
, // "id" boosts Indonesian
191 int encoding_hint
, // SJS boosts Japanese
192 Language language_hint
, // ITALIAN boosts it
197 double normalized_score3
[3];
198 bool allow_extended_lang
= true;
200 Language plus_one
= UNKNOWN_LANGUAGE
;
202 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
207 tld_hint
, // "id" boosts Indonesian
208 encoding_hint
, // SJS boosts Japanese
209 language_hint
, // ITALIAN boosts it
218 // Do not default to English
// Same as above, and also returns internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish.
227 Language
CompactLangDet::ExtDetectLanguageSummary(
228 const DetectionTables
* tables
,
232 const char* tld_hint
, // "id" boosts Indonesian
233 int encoding_hint
, // SJS boosts Japanese
234 Language language_hint
, // ITALIAN boosts it
237 double* normalized_score3
,
240 bool allow_extended_lang
= true;
242 Language plus_one
= UNKNOWN_LANGUAGE
;
244 Language lang
= CompactLangDetImpl::DetectLanguageSummaryV25(
249 tld_hint
, // "id" boosts Indonesian
250 encoding_hint
, // SJS boosts Japanese
251 language_hint
, // ITALIAN boosts it
260 // Do not default to English
266 // Return version text string
267 // String is "code_version - data_scrape_date"
268 const char* CompactLangDet::DetectLanguageVersion() {
269 return kDetectLanguageVersion
;