1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/i18n/icu_encoding_detection.h"
9 #include "base/strings/string_util.h"
10 #include "third_party/icu/source/i18n/unicode/ucsdet.h"
14 bool DetectEncoding(const std::string
& text
, std::string
* encoding
) {
15 if (IsStringASCII(text
)) {
16 *encoding
= std::string();
20 UErrorCode status
= U_ZERO_ERROR
;
21 UCharsetDetector
* detector
= ucsdet_open(&status
);
22 ucsdet_setText(detector
, text
.data(), static_cast<int32_t>(text
.length()),
24 const UCharsetMatch
* match
= ucsdet_detect(detector
, &status
);
27 const char* detected_encoding
= ucsdet_getName(match
, &status
);
28 ucsdet_close(detector
);
30 if (U_FAILURE(status
))
33 *encoding
= detected_encoding
;
37 bool DetectAllEncodings(const std::string
& text
,
38 std::vector
<std::string
>* encodings
) {
39 UErrorCode status
= U_ZERO_ERROR
;
40 UCharsetDetector
* detector
= ucsdet_open(&status
);
41 ucsdet_setText(detector
, text
.data(), static_cast<int32_t>(text
.length()),
43 int matches_count
= 0;
44 const UCharsetMatch
** matches
= ucsdet_detectAll(detector
,
47 if (U_FAILURE(status
)) {
48 ucsdet_close(detector
);
52 // ICU has some heuristics for encoding detection, such that the more likely
53 // encodings should be returned first. However, it doesn't always return
54 // all encodings that properly decode |text|, so we'll append more encodings
55 // later. To make that efficient, keep track of encodings sniffed in this
57 std::set
<std::string
> sniffed_encodings
;
60 for (int i
= 0; i
< matches_count
; i
++) {
61 UErrorCode get_name_status
= U_ZERO_ERROR
;
62 const char* encoding_name
= ucsdet_getName(matches
[i
], &get_name_status
);
64 // If we failed to get the encoding's name, ignore the error.
65 if (U_FAILURE(get_name_status
))
68 int32_t confidence
= ucsdet_getConfidence(matches
[i
], &get_name_status
);
70 // We also treat this error as non-fatal.
71 if (U_FAILURE(get_name_status
))
74 // A confidence level >= 10 means that the encoding is expected to properly
75 // decode the text. Drop all encodings with lower confidence level.
79 encodings
->push_back(encoding_name
);
80 sniffed_encodings
.insert(encoding_name
);
83 // Append all encodings not included earlier, in arbitrary order.
84 // TODO(jshin): This shouldn't be necessary, possible ICU bug.
85 // See also http://crbug.com/65917.
86 UEnumeration
* detectable_encodings
= ucsdet_getAllDetectableCharsets(detector
,
88 int detectable_count
= uenum_count(detectable_encodings
, &status
);
89 for (int i
= 0; i
< detectable_count
; i
++) {
91 const char* name_raw
= uenum_next(detectable_encodings
,
94 std::string
name(name_raw
, name_length
);
95 if (sniffed_encodings
.find(name
) == sniffed_encodings
.end())
96 encodings
->push_back(name
);
98 uenum_close(detectable_encodings
);
100 ucsdet_close(detector
);
101 return !encodings
->empty();