1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/translate/core/language_detection/language_detection_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h"
10 #include "base/strings/utf_string_conversions.h"
11 #include "base/time/time.h"
12 #include "components/translate/core/common/translate_constants.h"
13 #include "components/translate/core/common/translate_metrics.h"
14 #include "components/translate/core/common/translate_util.h"
17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
22 #include "third_party/cld_2/src/public/compact_lang_det.h"
23 #include "third_party/cld_2/src/public/encodings.h"
// Similar language code list. Some languages are very similar and difficult
// for CLD to distinguish.
//
// One table entry: a language code together with the id of the group of
// mutually similar languages it belongs to. IsSameOrSimilarLanguages() treats
// a group id of 0 as "not in any group", so entries should use non-zero ids.
struct SimilarLanguageCode {
  // Language code; GetSimilarLanguageGroupCode() compares it against the
  // beginning of the language code being looked up.
  const char* const code;
  // Id of the similar-language group this code belongs to.
  int group;
};
35 const SimilarLanguageCode kSimilarLanguageCodes
[] = {
42 // Checks |kSimilarLanguageCodes| and returns group code.
43 int GetSimilarLanguageGroupCode(const std::string
& language
) {
44 for (size_t i
= 0; i
< arraysize(kSimilarLanguageCodes
); ++i
) {
45 if (language
.find(kSimilarLanguageCodes
[i
].code
) != 0)
47 return kSimilarLanguageCodes
[i
].group
;
// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// The table is read-only, so both the strings and the pointers are const.
// TODO(toyoshim): Remove these static tables and caller functions to
// translate/common, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
60 // Applies a series of language code modification in proper order.
61 void ApplyLanguageCodeCorrection(std::string
* code
) {
62 // Correct well-known format errors.
63 translate::CorrectLanguageCodeTypo(code
);
65 if (!translate::IsValidLanguageCode(*code
)) {
66 *code
= std::string();
70 translate::ToTranslateLanguageSynonym(code
);
73 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
75 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
76 std::string
DetermineTextLanguage(const base::string16
& text
,
77 bool* is_cld_reliable
,
79 std::string
& html_lang
) {
80 std::string language
= translate::kUnknownLanguageCode
;
81 int num_bytes_evaluated
= 0;
82 bool is_reliable
= false;
83 const bool is_plain_text
= true;
85 // Language or CLD2::Language
87 bool is_valid_language
= false;
90 int num_languages
= 0;
91 cld_language
= DetectLanguageOfUnicodeText(NULL
, text
.c_str(), is_plain_text
,
92 &is_reliable
, &num_languages
, NULL
,
93 &num_bytes_evaluated
);
94 is_valid_language
= cld_language
!= NUM_LANGUAGES
&&
95 cld_language
!= UNKNOWN_LANGUAGE
&&
96 cld_language
!= TG_UNKNOWN_LANGUAGE
;
98 const std::string
utf8_text(base::UTF16ToUTF8(text
));
99 const int num_utf8_bytes
= static_cast<int>(utf8_text
.size());
100 const char* raw_utf8_bytes
= utf8_text
.c_str();
102 CLD2::Language language3
[3];
104 int flags
= 0; // No flags, see compact_lang_det.h for details.
105 int text_bytes
; // Amount of non-tag/letters-only text (assumed 0).
106 double normalized_score3
[3];
108 const char* tld_hint
= "";
109 int encoding_hint
= CLD2::UNKNOWN_ENCODING
;
110 CLD2::Language language_hint
= CLD2::GetLanguageFromName(html_lang
.c_str());
111 CLD2::CLDHints cldhints
= {code
.c_str(), tld_hint
, encoding_hint
,
114 cld_language
= CLD2::ExtDetectLanguageSummaryCheckUTF8(
115 raw_utf8_bytes
, num_utf8_bytes
, is_plain_text
, &cldhints
, flags
,
116 language3
, percent3
, normalized_score3
,
117 nullptr /* No ResultChunkVector used */, &text_bytes
, &is_reliable
,
118 &num_bytes_evaluated
);
120 if (num_bytes_evaluated
< num_utf8_bytes
&&
121 cld_language
== CLD2::UNKNOWN_LANGUAGE
) {
122 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
123 // Retry using only the valid characters. This time the check for valid
124 // UTF8 can be skipped since the precise number of valid bytes is known.
125 cld_language
= CLD2::ExtDetectLanguageSummary(
126 raw_utf8_bytes
, num_utf8_bytes
, is_plain_text
, &cldhints
, flags
,
127 language3
, percent3
, normalized_score3
,
128 nullptr /* No ResultChunkVector used */, &text_bytes
, &is_reliable
);
130 is_valid_language
= cld_language
!= CLD2::NUM_LANGUAGES
&&
131 cld_language
!= CLD2::UNKNOWN_LANGUAGE
&&
132 cld_language
!= CLD2::TG_UNKNOWN_LANGUAGE
;
134 // Choose top language.
135 cld_language
= language3
[0];
137 # error "CLD_VERSION must be 1 or 2"
140 if (is_cld_reliable
!= NULL
)
141 *is_cld_reliable
= is_reliable
;
143 // We don't trust the result if the CLD reports that the detection is not
144 // reliable, or if the actual text used to detect the language was less than
145 // 100 bytes (short texts can often lead to wrong results).
146 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
147 // the determined language code is correct with 50% confidence. Chrome should
148 // handle the real confidence value to judge.
149 if (is_reliable
&& num_bytes_evaluated
>= 100 && is_valid_language
) {
150 // We should not use LanguageCode_ISO_639_1 because it does not cover all
151 // the languages CLD can detect. As a result, it'll return the invalid
152 // language code for traditional Chinese among others.
153 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
154 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
155 // for Simplified Chinese.
157 language
= LanguageCodeWithDialects(static_cast<Language
>(cld_language
));
159 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
160 // CLD2::CHINESE, but Translate server doesn't accept it. This is
161 // converted to 'zh-CN' in the same way as CLD1's
162 // LanguageCodeWithDialects.
164 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
165 // CLD2::CHINESE_T. This is technically more precise for the language
166 // code of traditional Chinese, while Translate server hasn't accepted
168 if (cld_language
== CLD2::CHINESE
)
170 else if (cld_language
== CLD2::CHINESE_T
)
173 language
= CLD2::LanguageCode(static_cast<CLD2::Language
>(cld_language
));
175 # error "CLD_VERSION must be 1 or 2"
178 VLOG(9) << "Detected lang_id: " << language
<< ", from Text:\n" << text
179 << "\n*************************************\n";
183 // Checks if CLD can complement a sub code when the page language doesn't know
185 bool CanCLDComplementSubCode(
186 const std::string
& page_language
, const std::string
& cld_language
) {
187 // Translate server cannot treat general Chinese. If Content-Language and
188 // CLD agree that the language is Chinese and Content-Language doesn't know
189 // which dialect is used, CLD language has priority.
190 // TODO(hajimehoshi): How about the other dialects like zh-MO?
191 return page_language
== "zh" &&
192 base::StartsWith(cld_language
, "zh-",
193 base::CompareCase::INSENSITIVE_ASCII
);
198 namespace translate
{
200 std::string
DeterminePageLanguage(const std::string
& code
,
201 const std::string
& html_lang
,
202 const base::string16
& contents
,
203 std::string
* cld_language_p
,
204 bool* is_cld_reliable_p
) {
205 base::TimeTicks begin_time
= base::TimeTicks::Now();
206 bool is_cld_reliable
;
207 // Check if html lang attribute is valid.
208 std::string modified_html_lang
;
209 if (!html_lang
.empty()) {
210 modified_html_lang
= html_lang
;
211 ApplyLanguageCodeCorrection(&modified_html_lang
);
212 translate::ReportHtmlLang(html_lang
, modified_html_lang
);
213 VLOG(9) << "html lang based language code: " << modified_html_lang
;
216 // Check if Content-Language is valid.
217 std::string modified_code
;
219 modified_code
= code
;
220 ApplyLanguageCodeCorrection(&modified_code
);
221 translate::ReportContentLanguage(code
, modified_code
);
224 std::string cld_language
= DetermineTextLanguage(
225 contents
, &is_cld_reliable
, modified_code
, modified_html_lang
);
226 translate::ReportLanguageDetectionTime(begin_time
, base::TimeTicks::Now());
228 if (cld_language_p
!= NULL
)
229 *cld_language_p
= cld_language
;
230 if (is_cld_reliable_p
!= NULL
)
231 *is_cld_reliable_p
= is_cld_reliable
;
232 translate::ToTranslateLanguageSynonym(&cld_language
);
234 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
236 std::string language
= modified_html_lang
.empty() ? modified_code
:
239 // If |language| is empty, just use CLD result even though it might be
240 // translate::kUnknownLanguageCode.
241 if (language
.empty()) {
242 translate::ReportLanguageVerification(
243 translate::LANGUAGE_VERIFICATION_CLD_ONLY
);
247 if (cld_language
== kUnknownLanguageCode
) {
248 translate::ReportLanguageVerification(
249 translate::LANGUAGE_VERIFICATION_UNKNOWN
);
253 if (CanCLDComplementSubCode(language
, cld_language
)) {
254 translate::ReportLanguageVerification(
255 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE
);
259 if (IsSameOrSimilarLanguages(language
, cld_language
)) {
260 translate::ReportLanguageVerification(
261 translate::LANGUAGE_VERIFICATION_CLD_AGREE
);
265 if (MaybeServerWrongConfiguration(language
, cld_language
)) {
266 translate::ReportLanguageVerification(
267 translate::LANGUAGE_VERIFICATION_TRUST_CLD
);
271 // Content-Language value might be wrong because CLD says that this page is
272 // written in another language with confidence. In this case, Chrome doesn't
273 // rely on any of the language codes, and gives up suggesting a translation.
274 translate::ReportLanguageVerification(
275 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE
);
276 return kUnknownLanguageCode
;
279 void CorrectLanguageCodeTypo(std::string
* code
) {
282 size_t coma_index
= code
->find(',');
283 if (coma_index
!= std::string::npos
) {
284 // There are more than 1 language specified, just keep the first one.
285 *code
= code
->substr(0, coma_index
);
287 base::TrimWhitespaceASCII(*code
, base::TRIM_ALL
, code
);
289 // An underscore instead of a dash is a frequent mistake.
290 size_t underscore_index
= code
->find('_');
291 if (underscore_index
!= std::string::npos
)
292 (*code
)[underscore_index
] = '-';
294 // Change everything up to a dash to lower-case and everything after to upper.
295 size_t dash_index
= code
->find('-');
296 if (dash_index
!= std::string::npos
) {
297 *code
= base::ToLowerASCII(code
->substr(0, dash_index
)) +
298 base::ToUpperASCII(code
->substr(dash_index
));
300 *code
= base::ToLowerASCII(*code
);
304 bool IsValidLanguageCode(const std::string
& code
) {
305 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
306 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
307 std::vector
<base::StringPiece
> chunks
= base::SplitStringPiece(
308 code
, "-", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
310 if (chunks
.size() < 1 || 2 < chunks
.size())
313 const base::StringPiece
& main_code
= chunks
[0];
315 if (main_code
.size() < 1 || 3 < main_code
.size())
318 for (char c
: main_code
) {
319 if (!base::IsAsciiAlpha(c
))
323 if (chunks
.size() == 1)
326 const base::StringPiece
& sub_code
= chunks
[1];
328 if (sub_code
.size() != 2)
331 for (char c
: sub_code
) {
332 if (!base::IsAsciiAlpha(c
))
339 bool IsSameOrSimilarLanguages(const std::string
& page_language
,
340 const std::string
& cld_language
) {
341 std::vector
<std::string
> chunks
= base::SplitString(
342 page_language
, "-", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
343 if (chunks
.size() == 0)
345 std::string page_language_main_part
= chunks
[0]; // Need copy.
347 chunks
= base::SplitString(
348 cld_language
, "-", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
349 if (chunks
.size() == 0)
351 const std::string
& cld_language_main_part
= chunks
[0];
353 // Language code part of |page_language| is matched to one of |cld_language|.
354 // Country code is ignored here.
355 if (page_language_main_part
== cld_language_main_part
) {
356 // Languages are matched strictly. Reports false to metrics, but returns
358 translate::ReportSimilarLanguageMatch(false);
362 // Check if |page_language| and |cld_language| are in the similar language
363 // list and belong to the same language group.
364 int page_code
= GetSimilarLanguageGroupCode(page_language
);
365 bool match
= page_code
!= 0 &&
366 page_code
== GetSimilarLanguageGroupCode(cld_language
);
368 translate::ReportSimilarLanguageMatch(match
);
372 bool MaybeServerWrongConfiguration(const std::string
& page_language
,
373 const std::string
& cld_language
) {
374 // If |page_language| is not "en-*", respect it and just return false here.
375 if (!base::StartsWith(page_language
, "en",
376 base::CompareCase::INSENSITIVE_ASCII
))
379 // A server provides a language meta information representing "en-*". But it
380 // might be just a default value due to missing user configuration.
381 // Let's trust |cld_language| if the determined language is not difficult to
382 // distinguish from English, and the language is one of well-known languages
383 // which often provide "en-*" meta information mistakenly.
384 for (size_t i
= 0; i
< arraysize(kWellKnownCodesOnWrongConfiguration
); ++i
) {
385 if (cld_language
== kWellKnownCodesOnWrongConfiguration
[i
])
391 } // namespace translate