1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/renderer/hyphenator/hyphenator.h"
7 #include "base/files/memory_mapped_file.h"
8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h"
10 #include "base/string_util.h"
11 #include "base/utf_string_conversions.h"
12 #include "content/common/hyphenator_messages.h"
13 #include "content/public/renderer/render_thread.h"
14 #include "third_party/hyphen/hyphen.h"
15 #include "third_party/icu/public/common/unicode/uscript.h"
19 // A class that converts a sequence of UTF-8 characters to UTF-16 ones and holds
20 // only the length of converted UTF-16 characters. This class is used for
21 // creating a mapping from the position of a UTF-8 string to a position of a
22 // UTF-16 string without unnecessary conversions. Even though the following
23 // snippet produces the same mapping, it needs to convert the same characters
24 // many times. This class incrementally counts the number of converted UTF-16
25 // characters to avoid this problem.
27 // scoped_ptr<size_t[]> position(new size_t[text.length()]);
28 // for (size_t i = 0; i < text.length(); ++i)
29 // position[i] = UTF8ToUTF16(text.substr(0, i)).length();
31 class UTF16TextLength
{
36 // Returns the current position.
37 int utf16_length() const { return utf16_length_
; }
39 // Appends one UTF-8 character to this converter and advances the converted
40 // position. This converter increases the position by one when it finishes
41 // reading a BMP character and increases by two when it finishes reading a
46 // The length of the converted UTF-16 text.
49 // The buffer that stores UTF-8 characters being converted.
50 std::string utf8_text_
;
52 DISALLOW_COPY_AND_ASSIGN(UTF16TextLength
);
55 UTF16TextLength::UTF16TextLength()
59 UTF16TextLength::~UTF16TextLength() {
62 void UTF16TextLength::Append(char c
) {
63 // Append the given character and try converting the UTF-8 characters in this
64 // buffer to Unicode codepoints. If this buffer includes a Unicode codepoint,
65 // get the number of UTF-16 characters representing this codepoint and advance
69 utf8_text_
.push_back(c
);
70 U8_NEXT(utf8_text_
.data(), index
, static_cast<int>(utf8_text_
.length()),
72 if (code
!= U_SENTINEL
) {
74 utf16_length_
+= U16_LENGTH(code
);
78 // A class that encapsulates a hyphenation query. This class owns resources
79 // temporarily needed for hyphenating one word, and deletes them when it is
80 // deleted as listed in the following snippet.
82 // std::vector<int> hyphens;
83 // Query query(UTF8ToUTF16("hyphenate"));
84 // query.Hyphenate(dict, &hyphens);
88 explicit Query(const string16
& word
);
91 // Hyphenates a word with the specified dictionary. This function hyphenates
92 // the word provided to its constructor and returns a list of hyphenation
93 // points, positions where we can insert hyphens.
94 bool Hyphenate(HyphenDict
* dictionary
, std::vector
<int>* hyphen_offsets
);
97 // A word to be hyphenated.
98 std::string word_utf8_
;
100 // Return variables from the hyphen library.
101 scoped_ptr
<char[]> hyphen_vector_
;
106 DISALLOW_COPY_AND_ASSIGN(Query
);
109 Query::Query(const string16
& word
)
113 // Remove trailing punctuation characters. WebKit does not remove these
114 // characters when it hyphenates a word. These characters prevent the hyphen
115 // library from applying some rules, i.e. they prevent the library from adding
117 DCHECK(!word
.empty());
118 const char16
* data
= word
.data();
119 int length
= static_cast<int>(word
.length());
121 int previous
= length
;
123 U16_PREV(data
, 0, previous
, code
);
124 UErrorCode error
= U_ZERO_ERROR
;
125 if (uscript_getScript(code
, &error
) != USCRIPT_COMMON
)
129 UTF16ToUTF8(word
.c_str(), length
, &word_utf8_
);
130 // Create a hyphen vector used by hnj_hyphen_hyphenate2(). We allocate a
131 // buffer of |word_utf8_.length()| + 5 as written in Line 112 of
132 // <http://cs.chromium.org/src/third_party/hyphen/hyphen.h>.
133 hyphen_vector_
.reset(new char[word_utf8_
.length() + 5]);
138 for (size_t i
= 0; i
< word_utf8_
.length(); ++i
) {
150 bool Query::Hyphenate(HyphenDict
* dictionary
,
151 std::vector
<int>* hyphen_offsets
) {
153 DCHECK(hyphen_offsets
);
155 int error_code
= hnj_hyphen_hyphenate2(dictionary
,
157 static_cast<int>(word_utf8_
.length()),
158 hyphen_vector_
.get(),
166 // WebKit needs hyphenation points counted in UTF-16 characters. On the other
167 // hand, the hyphen library returns hyphenation points counted in UTF-8
168 // characters. We incrementally convert hyphenation points in UTF-8
169 // characters to hyphenation points in UTF-16 characters and write the
170 // converted hyphenation points to the output vector.
171 UTF16TextLength text_length
;
172 hyphen_offsets
->clear();
173 for (size_t i
= 0; i
< word_utf8_
.length(); ++i
) {
174 text_length
.Append(word_utf8_
[i
]);
175 if (hyphen_vector_
[i
] & 1)
176 hyphen_offsets
->push_back(text_length
.utf16_length());
178 return !hyphen_offsets
->empty();
185 Hyphenator::Hyphenator(base::PlatformFile file
)
187 dictionary_file_(base::FdopenPlatformFile(file
, "r")),
191 Hyphenator::~Hyphenator() {
193 hnj_hyphen_free(dictionary_
);
196 bool Hyphenator::Initialize() {
200 if (!dictionary_file_
.get())
202 dictionary_
= hnj_hyphen_load_file(dictionary_file_
.get());
203 return !!dictionary_
;
206 bool Hyphenator::Attach(RenderThread
* thread
, const string16
& locale
) {
209 locale_
.assign(locale
);
210 thread
->AddObserver(this);
211 return thread
->Send(new HyphenatorHostMsg_OpenDictionary(locale
));
214 bool Hyphenator::CanHyphenate(const string16
& locale
) {
215 return !locale_
.compare(locale
);
218 size_t Hyphenator::ComputeLastHyphenLocation(const string16
& word
,
219 size_t before_index
) {
220 if (!Initialize() || word
.empty())
223 // Call the hyphen library to get all hyphenation points, i.e. positions where
224 // we can insert hyphens. When WebKit finds a line-break, it calls this
225 // function twice or more with the same word to find the best hyphenation
226 // point. To avoid calling the hyphen library twice or more with the same
227 // word, we cache the last query.
231 result_
= query
.Hyphenate(dictionary_
, &hyphen_offsets_
);
235 for (std::vector
<int>::reverse_iterator it
= hyphen_offsets_
.rbegin();
236 it
!= hyphen_offsets_
.rend(); ++it
) {
237 if (static_cast<size_t>(*it
) < before_index
)
243 bool Hyphenator::OnControlMessageReceived(const IPC::Message
& message
) {
245 IPC_BEGIN_MESSAGE_MAP(Hyphenator
, message
)
246 IPC_MESSAGE_HANDLER(HyphenatorMsg_SetDictionary
, OnSetDictionary
)
247 IPC_MESSAGE_UNHANDLED(handled
= false)
248 IPC_END_MESSAGE_MAP()
252 void Hyphenator::OnSetDictionary(IPC::PlatformFileForTransit file
) {
253 base::PlatformFile rule_file
=
254 IPC::PlatformFileForTransitToPlatformFile(file
);
255 if (rule_file
== base::kInvalidPlatformFileValue
)
257 // Delete the current dictionary and save the given file to this object. We
258 // initialize the hyphen library the first time when WebKit actually
259 // hyphenates a word, i.e. when WebKit calls the ComputeLastHyphenLocation
260 // function. (WebKit does not always hyphenate words even when it calls the
261 // CanHyphenate function, e.g. WebKit does not have to hyphenate words when it
262 // does not have to break text into lines.)
264 hnj_hyphen_free(dictionary_
);
267 dictionary_file_
.Set(base::FdopenPlatformFile(rule_file
, "r"));
270 } // namespace content