1 // Copyright 2013 Google Inc.
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
15 #include "third_party/liblouis/nacl_wrapper/liblouis_wrapper.h"
19 #include "third_party/liblouis/overrides/liblouis/liblouis.h"
23 // Decodes UTF-8 into 16-bit wide characters.
24 // This implementation is very permissive and may miss encoding errors.
25 // It ignores characters which are not in the Unicode Basic Multilingual Plane.
26 // TODO(jbroman): Handle more than BMP if liblouis changes to accept UTF-16.
// |in| holds the UTF-8 bytes to decode and |out| is the caller-supplied
// vector of widechar for the decoded text. Translate() checks the result with
// `!DecodeUtf8(...)`, so a false return signals failure.
27 static bool DecodeUtf8(const std::string
& in
, std::vector
<widechar
>* out
) {
// The byte count is narrowed to int here; the branch conditions below compare
// the index |i| against it.
28 int len
= in
.length();
29 std::vector
<widechar
> result
;
// Fetch the next byte as an unsigned value (0-255) and advance the index.
33 int ch
= static_cast<unsigned char>(in
[i
++]);
// Lead byte 0xxxxxxx: a single-byte (ASCII) character.
35 if ((ch
& 0x80) == 0x00) { // U+0000 - U+007F
// Lead byte 110xxxxx: two-byte sequence. `i < len` requires that one more
// byte is available.
37 } else if ((ch
& 0xe0) == 0xc0 && i
< len
) { // U+0080 - U+07FF
// The low 5 bits of the lead byte become bits 6-10 of the code point.
38 cp
= (ch
& 0x1f) << 6;
// Fetch the continuation byte.
39 ch
= static_cast<unsigned char>(in
[i
++]);
// Lead byte 1110xxxx: three-byte sequence. `i+1 < len` requires that two more
// bytes are available.
41 } else if ((ch
& 0xf0) == 0xe0 && i
+1 < len
) { // U+0800 - U+FFFF
// The low 4 bits of the lead byte become bits 12-15 of the code point.
42 cp
= (ch
& 0x0f) << 12;
43 ch
= static_cast<unsigned char>(in
[i
++]);
// The low 6 bits of the first continuation byte become bits 6-11.
// NOTE(review): continuation bytes are masked with 0x3f without first
// verifying the 10xxxxxx pattern here -- consistent with the "very
// permissive" note in the function comment, but confirm this is acceptable.
44 cp
|= (ch
& 0x3f) << 6;
45 ch
= static_cast<unsigned char>(in
[i
++]);
// Lead bytes 11110xxx, 111110xx and 1111110x start 4-, 5- and 6-byte
// sequences, i.e. code points above U+FFFF (outside the BMP). Each condition
// also requires that the remaining bytes of the sequence are present.
47 } else if ((ch
& 0xf8) == 0xf0 && i
+2 < len
) { // U+10000 - U+1FFFFF
50 } else if ((ch
& 0xfc) == 0xf8 && i
+3 < len
) { // U+200000 - U+3FFFFFF
53 } else if ((ch
& 0xfe) == 0xfc && i
+4 < len
) { // U+4000000 - U+7FFFFFFF
57 // Invalid first code point.
66 // Encodes 16-bit wide characters into UTF-8.
67 // This implementation is very permissive and may miss invalid code points.
69 // TODO(jbroman): Handle more than BMP if widechar ever becomes larger.
// |in| holds the wide characters to encode and |out| is the caller-supplied
// string for the UTF-8 bytes. BackTranslate() checks the result with
// `!EncodeUtf8(...)`, so a false return signals failure.
70 static bool EncodeUtf8(const std::vector
<widechar
>& in
, std::string
* out
) {
// Reserve two bytes per input character as an initial size estimate; the
// branches below emit between one and three bytes per character.
72 result
.reserve(in
.size() * 2);
73 for (std::vector
<widechar
>::const_iterator it
= in
.begin(); it
!= in
.end();
// Widen to unsigned int so the range comparisons and shifts below operate on
// a non-negative value.
75 unsigned int cp
= *it
;
// One byte: 0xxxxxxx.
76 if (cp
<= 0x007f) { // U+0000 - U+007F
77 result
.push_back(static_cast<char>(cp
));
// Two bytes: 110xxxxx 10xxxxxx (top 5 bits, then low 6 bits).
78 } else if (cp
<= 0x07ff) { // U+0080 - U+07FF
79 result
.push_back(static_cast<char>(0xc0 | ((cp
>> 6) & 0x1f)));
80 result
.push_back(static_cast<char>(0x80 | (cp
& 0x3f)));
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx (top 4 bits, middle 6, low 6).
// NOTE(review): surrogate values (U+D800 - U+DFFF) fall in this range and are
// encoded like any other value -- confirm this is intended.
81 } else if (cp
<= 0xffff) { // U+0800 - U+FFFF
82 result
.push_back(static_cast<char>(0xe0 | ((cp
>> 12) & 0x0f)));
83 result
.push_back(static_cast<char>(0x80 | ((cp
>> 6) & 0x3f)));
84 result
.push_back(static_cast<char>(0x80 | (cp
& 0x3f)));
86 // This can't happen if widechar is 16 bits wide.
87 // TODO(jbroman): assert this
97 namespace liblouis_nacl
{
// Constructs the wrapper and sets "/" as the liblouis data path.
99 LibLouisWrapper::LibLouisWrapper() {
// The path is copied into a mutable local array instead of passing the string
// literal directly.
100 char data_path
[] = "/"; // Needed because lou_setDataPath takes a char*.
101 lou_setDataPath(data_path
);
// Destructor for the liblouis wrapper.
104 LibLouisWrapper::~LibLouisWrapper() {
// Returns the fixed path "/liblouis/tables". The returned pointer refers to a
// string literal, so callers must not modify or free it.
108 const char* LibLouisWrapper::tables_dir() const {
109 return "/liblouis/tables";
// Returns true when lou_getTable() yields a non-NULL result for |table_names|,
// and false otherwise.
112 bool LibLouisWrapper::CheckTable(const std::string
& table_names
) {
113 return lou_getTable(table_names
.c_str()) != NULL
;
// Translates |params.text| (UTF-8) with lou_translate() in dotsIO mode, using
// the tables named in |params.table_names|. On completion the cells, both
// position-mapping vectors and the cursor position are stored into |out|.
116 bool LibLouisWrapper::Translate(const TranslationParams
& params
,
117 TranslationResult
* out
) {
118 // Convert the character set of the input text.
119 std::vector
<widechar
> inbuf
;
120 if (!DecodeUtf8(params
.text
, &inbuf
)) {
121 // TODO(jbroman): log this
124 // To avoid unsigned/signed comparison warnings.
125 int inbufsize
= inbuf
.size();
127 std::vector
<widechar
> outbuf
;
// text_to_braille has one entry per decoded input character; braille_to_text
// is sized inside the loop to match the output allocation.
// NOTE(review): when the decoded input is empty, inbuf and text_to_braille
// have size 0, so the `&inbuf[0]` and `&text_to_braille[0]` expressions in the
// lou_translate() call index empty vectors -- confirm empty input is rejected
// earlier or guard against it.
128 std::vector
<int> text_to_braille(inbuf
.size());
129 std::vector
<int> braille_to_text
;
132 // Compute the cursor position pointer to pass to liblouis.
133 int out_cursor_position
;
134 int* out_cursor_position_ptr
;
// A negative requested position means "no cursor": liblouis gets a NULL
// pointer and -1 is what ends up in the result.
135 if (params
.cursor_position
< 0) {
136 out_cursor_position
= -1;
137 out_cursor_position_ptr
= NULL
;
// Otherwise liblouis receives a pointer to a local copy of the requested
// position, which is later stored into |out|.
139 out_cursor_position
= params
.cursor_position
;
140 out_cursor_position_ptr
= &out_cursor_position
;
143 // Invoke liblouis. Do this in a loop since we can't precalculate the
144 // translated size. We add an extra slot in the output buffer so that
145 // common cases like single digits or capital letters won't always trigger
146 // retranslations (see the comments above the second exit condition inside
147 // the loop). We also set an arbitrary upper bound for the allocation
148 // to make sure the loop exits without running out of memory.
// outalloc takes the values 2x, 4x and 8x (inbufsize + 1), so at most three
// attempts are made.
149 for (int outalloc
= (inbufsize
+ 1) * 2, maxoutalloc
= (inbufsize
+ 1) * 8;
150 outalloc
<= maxoutalloc
; outalloc
*= 2) {
// inlen is passed by pointer below, so it is reset to the full input size
// before every attempt.
151 int inlen
= inbufsize
;
153 outbuf
.resize(outalloc
);
154 braille_to_text
.resize(outalloc
);
155 int result
= lou_translate(params
.table_names
.c_str(),
156 &inbuf
[0], &inlen
, &outbuf
[0], &outlen
,
157 NULL
/* typeform */, NULL
/* spacing */,
158 &text_to_braille
[0], &braille_to_text
[0],
159 out_cursor_position_ptr
, dotsIO
/* mode */);
161 // TODO(jbroman): log this
164 // If all of inbuf was not consumed, the output buffer must be too small
165 // and we have to retry with a larger buffer.
166 // In addition, if all of outbuf was exhausted, there's no way to know if
167 // more space was needed, so we'll have to retry the translation in that
168 // corner case as well.
169 if (inlen
== inbufsize
&& outlen
< outalloc
)
172 braille_to_text
.clear();
175 // Massage the result.
// Each output widechar is narrowed to unsigned char when copied into |cells|,
// which keeps only its low 8 bits.
176 std::vector
<unsigned char> cells
;
177 cells
.reserve(outlen
);
178 for (int i
= 0; i
< outlen
; i
++) {
179 cells
.push_back(outbuf
[i
]);
// Trim the mapping from the allocated size down to the produced length.
181 braille_to_text
.resize(outlen
);
183 // Return the translation result.
// The vectors are handed over with swap(), so no element copies are made.
184 out
->cells
.swap(cells
);
185 out
->text_to_braille
.swap(text_to_braille
);
186 out
->braille_to_text
.swap(braille_to_text
);
187 out
->cursor_position
= out_cursor_position
;
// Back-translates braille |cells| with lou_backTranslateString() in dotsIO
// mode, using the tables named in |table_names|. The wide-character output is
// converted to UTF-8 with EncodeUtf8().
191 bool LibLouisWrapper::BackTranslate(const std::string
& table_names
,
192 const std::vector
<unsigned char>& cells
, std::string
* out
) {
// Widen every cell to a widechar for liblouis.
// NOTE(review): when |cells| is empty, inbuf stays empty and the `&inbuf[0]`
// expression in the liblouis call indexes an empty vector -- confirm empty
// input is rejected earlier or guard against it.
193 std::vector
<widechar
> inbuf
;
194 inbuf
.reserve(cells
.size());
195 for (std::vector
<unsigned char>::const_iterator it
= cells
.begin();
196 it
!= cells
.end(); ++it
) {
197 // Set the high-order bit to prevent liblouis from dropping empty cells.
198 inbuf
.push_back(*it
| 0x8000);
200 // To avoid unsigned/signed comparison warnings.
201 int inbufsize
= inbuf
.size();
202 std::vector
<widechar
> outbuf
;
205 // Invoke liblouis. Do this in a loop since we can't precalculate the
206 // translated size. We add an extra slot in the output buffer so that
207 // common cases like single digits or capital letters won't always trigger
208 // retranslations (see the comments above the second exit condition inside
209 // the loop). We also set an arbitrary upper bound for the allocation
210 // to make sure the loop exits without running out of memory.
// outalloc takes the values 2x, 4x and 8x (inbufsize + 1), so at most three
// attempts are made.
211 for (int outalloc
= (inbufsize
+ 1) * 2, maxoutalloc
= (inbufsize
+ 1) * 8;
212 outalloc
<= maxoutalloc
; outalloc
*= 2) {
// inlen is passed by pointer below, so it is reset to the full input size
// before every attempt.
213 int inlen
= inbufsize
;
215 outbuf
.resize(outalloc
);
217 int result
= lou_backTranslateString(
218 table_names
.c_str(), &inbuf
[0], &inlen
, &outbuf
[0], &outlen
,
219 NULL
/* typeform */, NULL
/* spacing */, dotsIO
/* mode */);
221 // TODO(jbroman): log this
225 // If all of inbuf was not consumed, the output buffer must be too small
226 // and we have to retry with a larger buffer.
227 // In addition, if all of outbuf was exhausted, there's no way to know if
228 // more space was needed, so we'll have to retry the translation in that
229 // corner case as well.
230 if (inlen
== inbufsize
&& outlen
< outalloc
)
235 // Massage the result.
// Trim the buffer from the allocated size down to the produced length before
// encoding it.
236 outbuf
.resize(outlen
);
238 if (!EncodeUtf8(outbuf
, &text
)) {
239 // TODO(jbroman): log this
243 // Return the back translation result.
248 } // namespace liblouis_nacl