1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/translate/core/language_detection/language_detection_util.h"
7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h"
13 #include "components/translate/core/common/translate_constants.h"
14 #include "components/translate/core/common/translate_metrics.h"
15 #include "components/translate/core/common/translate_util.h"
17 #if !defined(CLD_VERSION) || CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
22 #if !defined(CLD_VERSION) || CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h"
28 // Similar language code list. Some languages are very similar and difficult
29 // for CLD to distinguish.
// One entry of that list. GetSimilarLanguageGroupCode() reads the |code| and
// |group| members of each entry.
30 struct SimilarLanguageCode
{
// Language code that GetSimilarLanguageGroupCode() looks for at the very
// start of the language string it is given.
31 const char* const code
;
// Table of similar-language entries. GetSimilarLanguageGroupCode() scans it
// linearly from index 0 up to arraysize(kSimilarLanguageCodes).
35 const SimilarLanguageCode kSimilarLanguageCodes
[] = {
42 // Checks |kSimilarLanguageCodes| and returns group code.
// |language| is compared against each table entry in table order; the value
// returned for a matching entry is that entry's |group| member.
43 int GetSimilarLanguageGroupCode(const std::string
& language
) {
44 for (size_t i
= 0; i
< arraysize(kSimilarLanguageCodes
); ++i
) {
// find() yields 0 only when |language| begins with the entry's code, so this
// condition is true for every entry whose code is NOT a prefix of |language|.
45 if (language
.find(kSimilarLanguageCodes
[i
].code
) != 0)
47 return kSimilarLanguageCodes
[i
].group
;
52 // Well-known languages which often have wrong server configuration of
53 // Content-Language: en.
// MaybeServerWrongConfiguration() compares the CLD-detected language against
// each entry of this table with an exact string comparison.
54 // TODO(toyoshim): Move these static tables and caller functions to
55 // translate/common, and implement them as std::set<>.
56 const char* kWellKnownCodesOnWrongConfiguration
[] = {
57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
60 // Applies a series of language code modifications in proper order.
// |code| is an in/out parameter: it is passed to CorrectLanguageCodeTypo(),
// checked with IsValidLanguageCode(), and passed to
// ToTranslateLanguageSynonym(), all operating on the same string.
61 void ApplyLanguageCodeCorrection(std::string
* code
) {
62 // Correct well-known format errors.
63 translate::CorrectLanguageCodeTypo(code
);
// A code that still fails validation after typo correction is replaced by the
// empty string.
65 if (!translate::IsValidLanguageCode(*code
)) {
66 *code
= std::string();
70 translate::ToTranslateLanguageSynonym(code
);
// Returns the CLD major version to use; DetermineTextLanguage() switches on
// the result to choose between the CLD1 and CLD2 code paths.
73 int GetCLDMajorVersion() {
// When CLD_VERSION is not fixed at build time, the decision is driven by the
// group name of the "CLD1VsCLD2" field trial.
74 #if !defined(CLD_VERSION)
75 std::string group_name
= base::FieldTrialList::FindFullName("CLD1VsCLD2");
76 if (group_name
== "CLD2")
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
// |language| starts out as translate::kUnknownLanguageCode. The language-code
// lookup further down runs only when CLD reports a reliable result, at least
// 100 bytes of text were used, and the detected language is a valid one.
88 std::string
DetermineTextLanguage(const base::string16
& text
,
89 bool* is_cld_reliable
) {
90 std::string language
= translate::kUnknownLanguageCode
;
92 bool is_reliable
= false;
94 // Language or CLD2::Language
96 bool is_valid_language
= false;
98 switch (GetCLDMajorVersion()) {
// CLD1 path: detection runs directly on the UTF-16 buffer of |text|.
99 #if !defined(CLD_VERSION) || CLD_VERSION==1
101 int num_languages
= 0;
103 DetectLanguageOfUnicodeText(NULL
, text
.c_str(), true, &is_reliable
,
104 &num_languages
, NULL
, &text_bytes
);
// The NUM_LANGUAGES / UNKNOWN_LANGUAGE / TG_UNKNOWN_LANGUAGE values are
// treated as "no usable language".
105 is_valid_language
= cld_language
!= NUM_LANGUAGES
&&
106 cld_language
!= UNKNOWN_LANGUAGE
&&
107 cld_language
!= TG_UNKNOWN_LANGUAGE
;
// CLD2 path: |text| is converted to UTF-8 first, and the first element of
// |language3| is taken as the detected language.
111 #if !defined(CLD_VERSION) || CLD_VERSION==2
113 std::string
utf8_text(base::UTF16ToUTF8(text
));
114 CLD2::Language language3
[3];
116 CLD2::DetectLanguageSummary(
117 utf8_text
.c_str(), (int)utf8_text
.size(), true, language3
, percent3
,
118 &text_bytes
, &is_reliable
);
119 cld_language
= language3
[0];
120 is_valid_language
= cld_language
!= CLD2::NUM_LANGUAGES
&&
121 cld_language
!= CLD2::UNKNOWN_LANGUAGE
&&
122 cld_language
!= CLD2::TG_UNKNOWN_LANGUAGE
;
// |is_cld_reliable| is optional; it is written only when the caller passed a
// non-NULL pointer.
130 if (is_cld_reliable
!= NULL
)
131 *is_cld_reliable
= is_reliable
;
133 // We don't trust the result if the CLD reports that the detection is not
134 // reliable, or if the actual text used to detect the language was less than
135 // 100 bytes (short texts can often lead to wrong results).
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
137 // the determined language code is correct with 50% confidence. Chrome should
138 // handle the real confidence value to judge.
139 if (is_reliable
&& text_bytes
>= 100 && is_valid_language
) {
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all
141 // the languages CLD can detect. As a result, it'll return the invalid
142 // language code for traditional Chinese among others.
143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
145 // for Simplified Chinese.
146 switch (GetCLDMajorVersion()) {
147 #if !defined(CLD_VERSION) || CLD_VERSION==1
150 LanguageCodeWithDialects(static_cast<Language
>(cld_language
));
153 #if !defined(CLD_VERSION) || CLD_VERSION==2
155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
156 // CLD2::CHINESE, but Translate server doesn't accept it. This is
157 // converted to 'zh-CN' in the same way as CLD1's
158 // LanguageCodeWithDialects.
160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
161 // CLD2::CHINESE_T. This is technically more precise for the language
162 // code of traditional Chinese, while Translate server hasn't accepted
// CLD2::CHINESE and CLD2::CHINESE_T are special-cased as described in (1) and
// (2); every other language goes through CLD2::LanguageCode().
164 if (cld_language
== CLD2::CHINESE
) {
166 } else if (cld_language
== CLD2::CHINESE_T
) {
170 CLD2::LanguageCode(static_cast<CLD2::Language
>(cld_language
));
// Verbose (level 9) log of the chosen language code together with the full
// input text.
178 VLOG(9) << "Detected lang_id: " << language
<< ", from Text:\n" << text
179 << "\n*************************************\n";
183 // Checks if CLD can complement a sub code when the page language doesn't know
// which dialect is used. The only case handled is a |page_language| of exactly
// "zh" combined with a |cld_language| that carries a "zh-" prefix.
185 bool CanCLDComplementSubCode(
186 const std::string
& page_language
, const std::string
& cld_language
) {
187 // Translate server cannot treat general Chinese. If Content-Language and
188 // CLD agree that the language is Chinese and Content-Language doesn't know
189 // which dialect is used, CLD language has priority.
190 // TODO(hajimehoshi): How about the other dialects like zh-MO?
191 return page_language
== "zh" && StartsWithASCII(cld_language
, "zh-", false);
196 namespace translate
{
// Determines the page language from three inputs: |code| (the
// Content-Language value), |html_lang| (the html lang attribute) and
// |contents| (the text handed to CLD via DetermineTextLanguage()).
// |cld_language_p| and |is_cld_reliable_p| are optional outputs that receive
// the CLD result and its reliability flag when they are non-NULL.
// Each verification outcome below is reported through
// ReportLanguageVerification().
198 std::string
DeterminePageLanguage(const std::string
& code
,
199 const std::string
& html_lang
,
200 const base::string16
& contents
,
201 std::string
* cld_language_p
,
202 bool* is_cld_reliable_p
) {
// Run CLD on |contents| and report how long the detection took.
203 base::TimeTicks begin_time
= base::TimeTicks::Now();
204 bool is_cld_reliable
;
205 std::string cld_language
= DetermineTextLanguage(contents
, &is_cld_reliable
);
206 translate::ReportLanguageDetectionTime(begin_time
, base::TimeTicks::Now());
// The optional outputs are filled before |cld_language| is converted to its
// Translate synonym, so callers receive the unconverted CLD result.
208 if (cld_language_p
!= NULL
)
209 *cld_language_p
= cld_language
;
210 if (is_cld_reliable_p
!= NULL
)
211 *is_cld_reliable_p
= is_cld_reliable
;
212 translate::ToTranslateLanguageSynonym(&cld_language
);
214 // Check if html lang attribute is valid.
215 std::string modified_html_lang
;
216 if (!html_lang
.empty()) {
217 modified_html_lang
= html_lang
;
218 ApplyLanguageCodeCorrection(&modified_html_lang
);
// Both the raw and the corrected value are reported.
219 translate::ReportHtmlLang(html_lang
, modified_html_lang
);
220 VLOG(9) << "html lang based language code: " << modified_html_lang
;
223 // Check if Content-Language is valid.
224 std::string modified_code
;
226 modified_code
= code
;
227 ApplyLanguageCodeCorrection(&modified_code
);
228 translate::ReportContentLanguage(code
, modified_code
);
231 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
// An empty |modified_html_lang| selects |modified_code|.
233 std::string language
= modified_html_lang
.empty() ? modified_code
:
236 // If |language| is empty, just use CLD result even though it might be
237 // translate::kUnknownLanguageCode.
238 if (language
.empty()) {
239 translate::ReportLanguageVerification(
240 translate::LANGUAGE_VERIFICATION_CLD_ONLY
);
// CLD could not determine a language.
244 if (cld_language
== kUnknownLanguageCode
) {
245 translate::ReportLanguageVerification(
246 translate::LANGUAGE_VERIFICATION_UNKNOWN
);
// The page language is "zh" and CLD supplies a "zh-" prefixed code.
250 if (CanCLDComplementSubCode(language
, cld_language
)) {
251 translate::ReportLanguageVerification(
252 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE
);
// The page language and the CLD language are the same or in the same
// similar-language group.
256 if (IsSameOrSimilarLanguages(language
, cld_language
)) {
257 translate::ReportLanguageVerification(
258 translate::LANGUAGE_VERIFICATION_CLD_AGREE
);
// The page claims an "en" prefixed language but CLD detected one of the
// well-known languages that are often served with a wrong Content-Language.
262 if (MaybeServerWrongConfiguration(language
, cld_language
)) {
263 translate::ReportLanguageVerification(
264 translate::LANGUAGE_VERIFICATION_TRUST_CLD
);
268 // Content-Language value might be wrong because CLD says that this page is
269 // written in another language with confidence. In this case, Chrome doesn't
270 // rely on any of the language codes, and gives up suggesting a translation.
271 translate::ReportLanguageVerification(
272 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE
);
273 return kUnknownLanguageCode
;
// Normalizes |code| in place. The visible steps are: keep only the text before
// the first comma, trim ASCII whitespace from both ends, replace the first
// underscore with a dash, and fix the letter case.
276 void CorrectLanguageCodeTypo(std::string
* code
) {
279 size_t coma_index
= code
->find(',');
280 if (coma_index
!= std::string::npos
) {
281 // There are more than 1 language specified, just keep the first one.
282 *code
= code
->substr(0, coma_index
);
// The input and output arguments are the same string, so the trim is applied
// in place.
284 base::TrimWhitespaceASCII(*code
, base::TRIM_ALL
, code
);
286 // An underscore instead of a dash is a frequent mistake.
// Only the first underscore found is replaced.
287 size_t underscore_index
= code
->find('_');
288 if (underscore_index
!= std::string::npos
)
289 (*code
)[underscore_index
] = '-';
291 // Change everything up to a dash to lower-case and everything after to upper.
// The split happens at the first dash; substr(dash_index) keeps the dash itself
// in the upper-cased part.
292 size_t dash_index
= code
->find('-');
293 if (dash_index
!= std::string::npos
) {
294 *code
= base::StringToLowerASCII(code
->substr(0, dash_index
)) +
295 StringToUpperASCII(code
->substr(dash_index
));
297 *code
= base::StringToLowerASCII(*code
);
// Validates the shape of |code|: it is split on '-' into a main code and an
// optional sub code, and each part is checked for length and for consisting of
// ASCII letters only.
301 bool IsValidLanguageCode(const std::string
& code
) {
302 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
303 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
304 std::vector
<std::string
> chunks
;
305 base::SplitString(code
, '-', &chunks
);
// Only one or two dash-separated chunks are accepted.
307 if (chunks
.size() < 1 || 2 < chunks
.size())
310 const std::string
& main_code
= chunks
[0];
// NOTE(review): this length check lets a 1-character main code through, which
// is looser than the {2,3} stated in the pattern comment above.
312 if (main_code
.size() < 1 || 3 < main_code
.size())
315 for (std::string::const_iterator it
= main_code
.begin();
316 it
!= main_code
.end(); ++it
) {
317 if (!IsAsciiAlpha(*it
))
// A single chunk means there is no sub code to check.
321 if (chunks
.size() == 1)
324 const std::string
& sub_code
= chunks
[1];
// The sub code must be exactly two characters long.
326 if (sub_code
.size() != 2)
329 for (std::string::const_iterator it
= sub_code
.begin();
330 it
!= sub_code
.end(); ++it
) {
331 if (!IsAsciiAlpha(*it
))
// Compares |page_language| with |cld_language|. The main parts (the text before
// the first '-') are compared first; if they differ, the similar-language
// group codes of the two full language strings are compared instead. Either
// way the outcome is reported through ReportSimilarLanguageMatch().
338 bool IsSameOrSimilarLanguages(const std::string
& page_language
,
339 const std::string
& cld_language
) {
340 std::vector
<std::string
> chunks
;
// |chunks| is reused for both splits; the first chunk of each split is copied
// out before the next split runs.
342 base::SplitString(page_language
, '-', &chunks
);
343 if (chunks
.size() == 0)
345 std::string page_language_main_part
= chunks
[0];
347 base::SplitString(cld_language
, '-', &chunks
);
348 if (chunks
.size() == 0)
350 std::string cld_language_main_part
= chunks
[0];
352 // Language code part of |page_language| is matched to one of |cld_language|.
353 // Country code is ignored here.
354 if (page_language_main_part
== cld_language_main_part
) {
355 // Languages are matched strictly. Reports false to metrics, but returns
357 translate::ReportSimilarLanguageMatch(false);
361 // Check if |page_language| and |cld_language| are in the similar language
362 // list and belong to the same language group.
// A group code of 0 is treated as "not in the list", so |match| requires a
// non-zero code that is equal for both languages.
363 int page_code
= GetSimilarLanguageGroupCode(page_language
);
364 bool match
= page_code
!= 0 &&
365 page_code
== GetSimilarLanguageGroupCode(cld_language
);
367 translate::ReportSimilarLanguageMatch(match
);
// Checks whether an "en" prefixed |page_language| is likely a server default
// rather than the real page language, by looking up |cld_language| in
// |kWellKnownCodesOnWrongConfiguration|.
371 bool MaybeServerWrongConfiguration(const std::string
& page_language
,
372 const std::string
& cld_language
) {
373 // If |page_language| is not "en-*", respect it and just return false here.
// NOTE(review): the prefix tested is "en", not "en-", so presumably any code
// beginning with those two letters passes this check -- confirm against
// StartsWithASCII().
374 if (!StartsWithASCII(page_language
, "en", false))
377 // A server provides a language meta information representing "en-*". But it
378 // might be just a default value due to missing user configuration.
379 // Let's trust |cld_language| if the determined language is not difficult to
380 // distinguish from English, and the language is one of well-known languages
381 // which often provide "en-*" meta information mistakenly.
// |cld_language| must equal a table entry exactly; the comparison is a full
// std::string equality, not a prefix match.
382 for (size_t i
= 0; i
< arraysize(kWellKnownCodesOnWrongConfiguration
); ++i
) {
383 if (cld_language
== kWellKnownCodesOnWrongConfiguration
[i
])
389 } // namespace translate