Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / components / translate / core / language_detection / language_detection_util.cc
blobc4699d4d7d1d01893be63fe4e0f104afea37f8d3
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/translate/core/language_detection/language_detection_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h"
10 #include "base/strings/utf_string_conversions.h"
11 #include "base/time/time.h"
12 #include "components/translate/core/common/translate_constants.h"
13 #include "components/translate/core/common/translate_metrics.h"
14 #include "components/translate/core/common/translate_util.h"
16 #if CLD_VERSION==1
17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
19 #endif
21 #if CLD_VERSION==2
22 #include "third_party/cld_2/src/public/compact_lang_det.h"
23 #include "third_party/cld_2/src/public/encodings.h"
24 #endif
26 namespace {
// Similar language code list. Some languages are very similar and difficult
// for CLD to distinguish.
struct SimilarLanguageCode {
  // Language code that identifies a member of a similar-language group.
  const char* const code;
  // Identifier shared by every entry that belongs to the same group. The
  // table below only uses non-zero values; 0 is reserved for "no group".
  int group;
};

// Entries with the same |group| value are treated as similar languages.
const SimilarLanguageCode kSimilarLanguageCodes[] = {
  {"bs", 1},
  {"hr", 1},
  {"hi", 2},
  {"ne", 2},
};

// Checks |kSimilarLanguageCodes| and returns group code.
// Returns the group of the first entry whose code is a prefix of |language|,
// or 0 when no entry matches.
int GetSimilarLanguageGroupCode(const std::string& language) {
  for (const SimilarLanguageCode& entry : kSimilarLanguageCodes) {
    // find() returning 0 means |language| starts with the entry's code, so a
    // value carrying a sub code (e.g. "hr-HR") still matches.
    if (language.find(entry.code) == 0)
      return entry.group;
  }
  return 0;
}
// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// TODO(toyoshim): Remove these static tables and caller functions to
// translate/common, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
60 // Applies a series of language code modification in proper order.
61 void ApplyLanguageCodeCorrection(std::string* code) {
62 // Correct well-known format errors.
63 translate::CorrectLanguageCodeTypo(code);
65 if (!translate::IsValidLanguageCode(*code)) {
66 *code = std::string();
67 return;
70 translate::ToTranslateLanguageSynonym(code);
73 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
74 // failed.
75 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
76 std::string DetermineTextLanguage(const base::string16& text,
77 bool* is_cld_reliable,
78 std::string& code,
79 std::string& html_lang) {
80 std::string language = translate::kUnknownLanguageCode;
81 int num_bytes_evaluated = 0;
82 bool is_reliable = false;
83 const bool is_plain_text = true;
85 // Language or CLD2::Language
86 int cld_language = 0;
87 bool is_valid_language = false;
89 #if CLD_VERSION==1
90 int num_languages = 0;
91 cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), is_plain_text,
92 &is_reliable, &num_languages, NULL,
93 &num_bytes_evaluated);
94 is_valid_language = cld_language != NUM_LANGUAGES &&
95 cld_language != UNKNOWN_LANGUAGE &&
96 cld_language != TG_UNKNOWN_LANGUAGE;
97 #elif CLD_VERSION==2
98 const std::string utf8_text(base::UTF16ToUTF8(text));
99 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
100 const char* raw_utf8_bytes = utf8_text.c_str();
102 CLD2::Language language3[3];
103 int percent3[3];
104 int flags = 0; // No flags, see compact_lang_det.h for details.
105 int text_bytes; // Amount of non-tag/letters-only text (assumed 0).
106 double normalized_score3[3];
108 const char* tld_hint = "";
109 int encoding_hint = CLD2::UNKNOWN_ENCODING;
110 CLD2::Language language_hint = CLD2::GetLanguageFromName(html_lang.c_str());
111 CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint,
112 language_hint};
114 cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8(
115 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
116 language3, percent3, normalized_score3,
117 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable,
118 &num_bytes_evaluated);
120 if (num_bytes_evaluated < num_utf8_bytes &&
121 cld_language == CLD2::UNKNOWN_LANGUAGE) {
122 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
123 // Retry using only the valid characters. This time the check for valid
124 // UTF8 can be skipped since the precise number of valid bytes is known.
125 cld_language = CLD2::ExtDetectLanguageSummary(
126 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
127 language3, percent3, normalized_score3,
128 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable);
130 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
131 cld_language != CLD2::UNKNOWN_LANGUAGE &&
132 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
134 // Choose top language.
135 cld_language = language3[0];
136 #else
137 # error "CLD_VERSION must be 1 or 2"
138 #endif
140 if (is_cld_reliable != NULL)
141 *is_cld_reliable = is_reliable;
143 // We don't trust the result if the CLD reports that the detection is not
144 // reliable, or if the actual text used to detect the language was less than
145 // 100 bytes (short texts can often lead to wrong results).
146 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
147 // the determined language code is correct with 50% confidence. Chrome should
148 // handle the real confidence value to judge.
149 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
150 // We should not use LanguageCode_ISO_639_1 because it does not cover all
151 // the languages CLD can detect. As a result, it'll return the invalid
152 // language code for tradtional Chinese among others.
153 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
154 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
155 // for Simplified Chinese.
156 #if CLD_VERSION==1
157 language = LanguageCodeWithDialects(static_cast<Language>(cld_language));
158 #elif CLD_VERSION==2
159 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
160 // CLD2::CHINESE, but Translate server doesn't accept it. This is
161 // converted to 'zh-CN' in the same way as CLD1's
162 // LanguageCodeWithDialects.
164 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
165 // CLD2::CHINESE_T. This is technically more precise for the language
166 // code of traditional Chinese, while Translate server hasn't accepted
167 // zh-Hant yet.
168 if (cld_language == CLD2::CHINESE)
169 language = "zh-CN";
170 else if (cld_language == CLD2::CHINESE_T)
171 language = "zh-TW";
172 else
173 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
174 #else
175 # error "CLD_VERSION must be 1 or 2"
176 #endif
178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
179 << "\n*************************************\n";
180 return language;
183 // Checks if CLD can complement a sub code when the page language doesn't know
184 // the sub code.
185 bool CanCLDComplementSubCode(
186 const std::string& page_language, const std::string& cld_language) {
187 // Translate server cannot treat general Chinese. If Content-Language and
188 // CLD agree that the language is Chinese and Content-Language doesn't know
189 // which dialect is used, CLD language has priority.
190 // TODO(hajimehoshi): How about the other dialects like zh-MO?
191 return page_language == "zh" &&
192 base::StartsWith(cld_language, "zh-",
193 base::CompareCase::INSENSITIVE_ASCII);
196 } // namespace
198 namespace translate {
200 std::string DeterminePageLanguage(const std::string& code,
201 const std::string& html_lang,
202 const base::string16& contents,
203 std::string* cld_language_p,
204 bool* is_cld_reliable_p) {
205 base::TimeTicks begin_time = base::TimeTicks::Now();
206 bool is_cld_reliable;
207 // Check if html lang attribute is valid.
208 std::string modified_html_lang;
209 if (!html_lang.empty()) {
210 modified_html_lang = html_lang;
211 ApplyLanguageCodeCorrection(&modified_html_lang);
212 translate::ReportHtmlLang(html_lang, modified_html_lang);
213 VLOG(9) << "html lang based language code: " << modified_html_lang;
216 // Check if Content-Language is valid.
217 std::string modified_code;
218 if (!code.empty()) {
219 modified_code = code;
220 ApplyLanguageCodeCorrection(&modified_code);
221 translate::ReportContentLanguage(code, modified_code);
224 std::string cld_language = DetermineTextLanguage(
225 contents, &is_cld_reliable, modified_code, modified_html_lang);
226 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
228 if (cld_language_p != NULL)
229 *cld_language_p = cld_language;
230 if (is_cld_reliable_p != NULL)
231 *is_cld_reliable_p = is_cld_reliable;
232 translate::ToTranslateLanguageSynonym(&cld_language);
234 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
235 // |modified_code|.
236 std::string language = modified_html_lang.empty() ? modified_code :
237 modified_html_lang;
239 // If |language| is empty, just use CLD result even though it might be
240 // translate::kUnknownLanguageCode.
241 if (language.empty()) {
242 translate::ReportLanguageVerification(
243 translate::LANGUAGE_VERIFICATION_CLD_ONLY);
244 return cld_language;
247 if (cld_language == kUnknownLanguageCode) {
248 translate::ReportLanguageVerification(
249 translate::LANGUAGE_VERIFICATION_UNKNOWN);
250 return language;
253 if (CanCLDComplementSubCode(language, cld_language)) {
254 translate::ReportLanguageVerification(
255 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
256 return cld_language;
259 if (IsSameOrSimilarLanguages(language, cld_language)) {
260 translate::ReportLanguageVerification(
261 translate::LANGUAGE_VERIFICATION_CLD_AGREE);
262 return language;
265 if (MaybeServerWrongConfiguration(language, cld_language)) {
266 translate::ReportLanguageVerification(
267 translate::LANGUAGE_VERIFICATION_TRUST_CLD);
268 return cld_language;
271 // Content-Language value might be wrong because CLD says that this page is
272 // written in another language with confidence. In this case, Chrome doesn't
273 // rely on any of the language codes, and gives up suggesting a translation.
274 translate::ReportLanguageVerification(
275 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
276 return kUnknownLanguageCode;
279 void CorrectLanguageCodeTypo(std::string* code) {
280 DCHECK(code);
282 size_t coma_index = code->find(',');
283 if (coma_index != std::string::npos) {
284 // There are more than 1 language specified, just keep the first one.
285 *code = code->substr(0, coma_index);
287 base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code);
289 // An underscore instead of a dash is a frequent mistake.
290 size_t underscore_index = code->find('_');
291 if (underscore_index != std::string::npos)
292 (*code)[underscore_index] = '-';
294 // Change everything up to a dash to lower-case and everything after to upper.
295 size_t dash_index = code->find('-');
296 if (dash_index != std::string::npos) {
297 *code = base::ToLowerASCII(code->substr(0, dash_index)) +
298 base::ToUpperASCII(code->substr(dash_index));
299 } else {
300 *code = base::ToLowerASCII(*code);
304 bool IsValidLanguageCode(const std::string& code) {
305 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
306 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
307 std::vector<base::StringPiece> chunks = base::SplitStringPiece(
308 code, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
310 if (chunks.size() < 1 || 2 < chunks.size())
311 return false;
313 const base::StringPiece& main_code = chunks[0];
315 if (main_code.size() < 1 || 3 < main_code.size())
316 return false;
318 for (char c : main_code) {
319 if (!base::IsAsciiAlpha(c))
320 return false;
323 if (chunks.size() == 1)
324 return true;
326 const base::StringPiece& sub_code = chunks[1];
328 if (sub_code.size() != 2)
329 return false;
331 for (char c : sub_code) {
332 if (!base::IsAsciiAlpha(c))
333 return false;
336 return true;
339 bool IsSameOrSimilarLanguages(const std::string& page_language,
340 const std::string& cld_language) {
341 std::vector<std::string> chunks = base::SplitString(
342 page_language, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
343 if (chunks.size() == 0)
344 return false;
345 std::string page_language_main_part = chunks[0]; // Need copy.
347 chunks = base::SplitString(
348 cld_language, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
349 if (chunks.size() == 0)
350 return false;
351 const std::string& cld_language_main_part = chunks[0];
353 // Language code part of |page_language| is matched to one of |cld_language|.
354 // Country code is ignored here.
355 if (page_language_main_part == cld_language_main_part) {
356 // Languages are matched strictly. Reports false to metrics, but returns
357 // true.
358 translate::ReportSimilarLanguageMatch(false);
359 return true;
362 // Check if |page_language| and |cld_language| are in the similar language
363 // list and belong to the same language group.
364 int page_code = GetSimilarLanguageGroupCode(page_language);
365 bool match = page_code != 0 &&
366 page_code == GetSimilarLanguageGroupCode(cld_language);
368 translate::ReportSimilarLanguageMatch(match);
369 return match;
372 bool MaybeServerWrongConfiguration(const std::string& page_language,
373 const std::string& cld_language) {
374 // If |page_language| is not "en-*", respect it and just return false here.
375 if (!base::StartsWith(page_language, "en",
376 base::CompareCase::INSENSITIVE_ASCII))
377 return false;
379 // A server provides a language meta information representing "en-*". But it
380 // might be just a default value due to missing user configuration.
381 // Let's trust |cld_language| if the determined language is not difficult to
382 // distinguish from English, and the language is one of well-known languages
383 // which often provide "en-*" meta information mistakenly.
384 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
385 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
386 return true;
388 return false;
391 } // namespace translate