1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Baybayin (ancient script of the Philippines) is detected as TAGALOG.
6 // Chu Nom (Vietnamese ancient Han characters) is detected as VIETNAMESE.
7 // HAITIAN_CREOLE is detected as such.
8 // NORWEGIAN and NORWEGIAN_N are detected separately (but not robustly)
9 // PORTUGUESE, PORTUGUESE_P, and PORTUGUESE_B are all detected as PORTUGUESE.
10 // ROMANIAN-Latin is detected as ROMANIAN; ROMANIAN-Cyrillic as MOLDAVIAN.
11 // SERBO_CROATIAN, BOSNIAN, CROATIAN, SERBIAN, MONTENEGRIN in the Latin script
12 // are all detected as CROATIAN; in the Cyrillic script as SERBIAN.
13 // Zhuang is detected in the Latin script only.
15 // The Google interface languages X_PIG_LATIN and X_KLINGON are detected in the
16 // extended calls ExtDetectLanguageSummary(). BorkBorkBork, ElmerFudd, and
17 // Hacker are not detected (too little training data).
19 // UNKNOWN_LANGUAGE is returned if no language's internal reliability measure
20 // is high enough. This happens with non-text input such as the bytes of a
21 // JPEG, and also with some text in languages outside the Google Language
22 // enum, such as Ilonggo.
24 // The following languages are detected in multiple scripts:
25 // AZERBAIJANI (Latin, Cyrillic*, Arabic*)
26 // BURMESE (Latin, Myanmar)
27 // HAUSA (Latin, Arabic)
28 // KASHMIRI (Arabic, Devanagari)
29 // KAZAKH (Latin, Cyrillic, Arabic)
30 // KURDISH (Latin*, Arabic)
31 // KYRGYZ (Cyrillic, Arabic)
32 // LIMBU (Devanagari, Limbu)
33 // MONGOLIAN (Cyrillic, Mongolian)
34 // SANSKRIT (Latin, Devanagari)
35 // SINDHI (Arabic, Devanagari)
36 // TAGALOG (Latin, Tagalog)
37 // TAJIK (Cyrillic, Arabic*)
38 // TATAR (Latin, Cyrillic, Arabic)
39 // TURKMEN (Latin, Cyrillic, Arabic)
40 // UIGHUR (Latin, Cyrillic, Arabic)
41 // UZBEK (Latin, Cyrillic, Arabic)
43 // * Due to a shortage of training text, AZERBAIJANI is not currently detected
44 // in Arabic or Cyrillic scripts, nor KURDISH in Latin script, nor TAJIK in
45 // Arabic script.
48 #ifndef ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
49 #define ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_
51 #include "languages/public/languages.h"
52 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
// Forward declaration of the table-summary type; no definition is visible here.
// NOTE(review): DetectionTables below names this type as cld::CLDTableSummary,
// but no enclosing `namespace cld` is visible in this view -- confirm the
// intended scope of this declaration.
55 struct CLDTableSummary
;
58 namespace CompactLangDet
{
59 // Scan interchange-valid UTF-8 bytes and detect most likely language,
60 // or set of languages.
63 // Skip over big stretches of HTML tags
64 // Able to return ranges of different languages
65 // Relatively small tables and relatively fast processing
68 // For HTML documents, tags are skipped, along with <script> ... </script>
69 // and <style> ... </style> sequences, and entities are expanded.
71 // We distinguish between bytes of the raw input buffer and bytes of non-tag
72 // text letters. Since tags can be over 50% of the bytes of an HTML Page,
73 // and are nearly all seven-bit ASCII English, we prefer to distinguish
74 // language mixture fractions based on just the non-tag text.
76 // Inputs: text and text_length
77 // Code skips HTML tags and expands HTML entities, unless
78 // is_plain_text is true
80 // language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
81 // percent3 is an array of the text percentages 0..100 of the top 3 languages
82 // text_bytes is the amount of non-tag/letters-only text found
83 // is_reliable set true if the returned Language is some amount more
84 // probable than the second-best Language. Calculation is a complex function
85 // of the length of the text and the different-script runs of text.
86 // Return value: the most likely Language for the majority of the input text
87 // Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
88 // defaults to ENGLISH.
90 // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
91 // backwards compatibility with LLD.
93 // The third version may return UNKNOWN_LANGUAGE, and also returns extended
94 // language codes from ext_lang_enc.h
96 // Subsetting: For fast detection over large documents, these routines will
97 // scan non-tag text of the initial part of a document, then will
98 // skip 4-16 bytes and subsample text in the rest of the document, up to a
99 // fixed limit (currently 160KB of non-tag letters).
102 struct DetectionTables
{
103 const cld::CLDTableSummary
* quadgram_obj
;
104 const UTF8PropObj
* unigram_obj
;
107 // Scan interchange-valid UTF-8 bytes and detect most likely language
108 Language
DetectLanguage(const DetectionTables
* tables
,
114 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
115 // language3[0] is also the return value
116 Language
DetectLanguageSummary(
117 const DetectionTables
* tables
,
126 // Same as above, with hints supplied
127 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
128 // language3[0] is also the return value
129 Language
DetectLanguageSummary(
130 const DetectionTables
* tables
,
134 const char* tld_hint
, // "id" boosts Indonesian
135 int encoding_hint
, // SJS boosts Japanese
136 Language language_hint
, // ITALIAN boosts it
142 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
143 // languages.
145 // Extended languages are additional Google interface languages and Unicode
146 // single-language scripts, from ext_lang_enc.h. They are experimental and
147 // this call may be removed.
149 // language3[0] is also the return value
150 Language
ExtDetectLanguageSummary(
151 const DetectionTables
* tables
,
160 // Same as above, with hints supplied
161 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
162 // languages.
164 // Extended languages are additional Google interface languages and Unicode
165 // single-language scripts, from ext_lang_enc.h. They are experimental and
166 // this call may be removed.
168 // language3[0] is also the return value
169 Language
ExtDetectLanguageSummary(
170 const DetectionTables
* tables
,
174 const char* tld_hint
, // "id" boosts Indonesian
175 int encoding_hint
, // SJS boosts Japanese
176 Language language_hint
, // ITALIAN boosts it
182 // Same as above, and also returns internal language scores as a ratio to
183 // normal score for real text in that language. Scores close to 1.0 indicate
184 // normal text, while scores far away from 1.0 indicate badly-skewed text or
187 Language
ExtDetectLanguageSummary(
188 const DetectionTables
* tables
,
192 const char* tld_hint
, // "id" boosts Indonesian
193 int encoding_hint
, // SJS boosts Japanese
194 Language language_hint
, // ITALIAN boosts it
197 double* normalized_score3
,
201 // Returns the version text string for the language detector.
202 // The string has the form "code_version - data_scrape_date".
203 const char* DetectLanguageVersion();
204 }; // End namespace CompactLangDet
206 #endif // ENCODINGS_COMPACT_LANG_DET_COMPACT_LANG_DET_H_