Merge branch '138-toggle-free-look-with-hotkey' into 'main/atys-live'
[ryzomcore.git] / nel / src / misc / utf_string_view.cpp
blob8808930b4a1d32e94f9846e858d5f92f8b236d43
1 // NeL - MMORPG Framework <https://wiki.ryzom.dev/>
2 // Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #include "stdmisc.h"
19 // Project includes
20 #include <nel/misc/utf_string_view.h>
21 #include <nel/misc/stream.h>
23 // References:
24 // - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
25 // - https://www.compart.com/en/unicode/U+1F30D
26 // - 0xF0 0x9F 0x8C 0x8D
27 // - 0xD83C 0xDF0D
28 // - 0x0001F30D
30 namespace NLMISC
33 NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
35 if (c < 0x80)
37 // Encode as 1 byte
38 str += (char)c;
40 else if (c < 0x0800)
42 // Encode as 2 bytes
43 str += (char)((c & 0x07C0) >> 6) | 0xC0;
44 str += (char)(c & 0x3F) | 0x80;
46 else if (c < 0x010000)
48 // Encode as 3 bytes
49 str += (char)((c & 0xF000) >> 12) | 0xE0;
50 str += (char)((c & 0x0FC0) >> 6) | 0x80;
51 str += (char)(c & 0x3F) | 0x80;
53 else if (c < 0x110000)
55 // Encode as 4 bytes
56 str += (char)((c & 0x1C0000) >> 18) | 0xF0;
57 str += (char)((c & 0x03F000) >> 12) | 0x80;
58 str += (char)((c & 0x0FC0) >> 6) | 0x80;
59 str += (char)(c & 0x3F) | 0x80;
61 else
63 // Replacement character �
64 str += "\xEF\xB\xBD";
68 void CUtfStringView::append(std::string &str, u32char c)
70 appendUtf8(str, c);
73 void CUtfStringView::append(IStream &s, u32char c)
75 nlassert(!s.isReading());
76 std::string tmp;
77 tmp.reserve(4);
78 append(tmp, c);
79 s.serialBuffer((uint8 *)&tmp[0], tmp.size());
82 u32char CUtfStringView::get(IStream &s)
84 nlassert(s.isReading());
86 std::string tmp;
87 tmp.reserve(4);
88 uint8 c;
89 s.serial(c);
91 // Single byte
92 if (c < 0x80)
93 return c;
95 // Do a fast check of length
96 tmp += (char)c;
97 size_t len;
98 if ((c & 0xF0) == 0xF0) len = 4;
99 if ((c & 0xE0) == 0xE0) len = 3;
100 else len = 2;
102 // Read from stream
103 tmp.resize(len);
104 s.serialBuffer((uint8 *)&tmp[1], len - 1);
106 // Decode
107 const void *str = tmp.c_str();
108 return utf8Iterator(&str);
111 std::string CUtfStringView::toUtf8(bool reEncode) const
113 // Decode UTF and encode UTF-8
114 // This implementation makes no attempt at fixing invalid codepoints
115 if (m_Iterator == utf8Iterator && !reEncode)
116 return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
117 std::string res;
118 res.reserve(m_Size);
119 for (iterator it(begin()), end(this->end()); it != end; ++it)
121 appendUtf8(res, *it);
123 return res;
126 ucstring CUtfStringView::toUtf16(bool reEncode) const
128 if (m_Iterator == utf16Iterator && !reEncode)
129 return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size));
130 ucstring res;
131 res.reserve(m_Size << 1);
132 for (iterator it(begin()), end(this->end()); it != end; ++it)
134 u32char c = *it;
135 if (c < 0x10000)
137 res += c;
139 else
141 c -= 0x10000;
142 res += (c >> 10) | 0xD800;
143 res += (c & 0x3FF) | 0xDC00;
146 return res;
149 ::u32string CUtfStringView::toUtf32() const
151 // Decode any UTF
152 // This implementation makes no attempt at fixing bad encoding
153 if (m_Iterator == utf32Iterator)
154 return ::u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size));
155 ::u32string res;
156 res.reserve(m_Size << 2);
157 for (iterator it(begin()), end(this->end()); it != end; ++it)
158 res += *it;
159 return res;
162 std::string CUtfStringView::toAscii() const
164 std::string res;
165 res.reserve(m_Size);
166 for (iterator it(begin()), end(this->end()); it != end; ++it)
168 u32char c = *it;
169 if (c < 0x80)
170 res += c;
171 else
172 res += '?';
174 return res;
177 std::string CUtfStringView::fromAscii(const std::string &str)
179 std::string res;
180 res.reserve(str.size());
181 for (std::string::const_iterator it(str.begin()), end(str.end()); it != end; ++it)
183 unsigned char c = *it;
184 if (c < 0x80)
185 res += (char)c;
186 else
187 res += '?';
189 return res;
192 std::wstring CUtfStringView::toWide() const
194 #ifdef NL_OS_WINDOWS
195 if (m_Iterator == utf16Iterator)
196 return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
197 std::wstring res;
198 res.reserve(m_Size << 1);
199 for (iterator it(begin()), end(this->end()); it != end; ++it)
201 u32char c = *it;
202 if (c < 0x10000)
204 res += c;
206 else
208 c -= 0x10000;
209 res += (c >> 10) | 0xD800;
210 res += (c & 0x3FF) | 0xDC00;
213 return res;
214 #else
215 if (m_Iterator == utf32Iterator)
216 return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
217 std::wstring res;
218 res.reserve(m_Size << 2);
219 for (iterator it(begin()), end(this->end()); it != end; ++it)
220 res += *it;
221 return res;
222 #endif
225 size_t CUtfStringView::count() const
227 size_t res = 0;
228 for (iterator it(begin()), end(this->end()); it != end; ++it)
229 ++res;
230 return res;
233 ptrdiff_t CUtfStringView::offset(ptrdiff_t i)
235 size_t res = 0;
236 for (iterator it(begin()), end(this->end()); it != end; ++it)
238 if (res == i)
239 return (ptrdiff_t)it.ptr() - (ptrdiff_t)ptr();
240 ++res;
242 return res;
245 u32char CUtfStringView::utf8Iterator(const void **addr)
247 // Decode UTF-8
248 // This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
249 // Invalid characters are replaced with the replacement character
250 const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
251 u32char c0 = **pp;
252 ++(*pp);
253 if (c0 >= 0x80)
255 if (c0 < 0xC0)
257 // Replacement character �
258 return 0xFFFD;
260 uint8 cx = **pp;
261 if ((cx & 0xC0) == 0x80)
263 ++(*pp);
264 c0 &= 0x3F; // Drop first two bits
265 c0 <<= 6;
266 c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
267 if (c0 & 0x800)
269 cx = **pp;
270 if ((cx & 0xC0) == 0x80)
272 ++(*pp);
273 c0 &= 0x07FF; // Drop first bit
274 c0 <<= 6;
275 c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
276 if (c0 & 0x10000)
278 cx = **pp;
279 if ((cx & 0xC0) == 0x80)
281 ++(*pp);
282 c0 &= 0xFFFF; // Drop first bit
283 c0 <<= 6;
284 c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
285 if (c0 >= 0x110000)
287 // Replacement character �
288 return 0xFFFD;
290 else if (c0 < 0x10000)
292 // Invalid encoding
293 // Replacement character �
294 return 0xFFFD;
297 else
299 // Replacement character �
300 return 0xFFFD;
303 else if ((c0 & 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8
305 uint8 cy;
306 if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80)
308 // Lower bits of nutcase UTF-16 encoded as UTF-8
309 (*pp) += 3;
310 uint16 c1 = (cx & 0x0F);
311 c1 <<= 6;
312 c1 |= (cy & 0x3F);
313 c0 &= 0x03FF;
314 c0 <<= 10;
315 c0 |= (c1 & 0x03FF);
316 c0 += 0x10000;
318 else
320 // Replacement character �
321 return 0xFFFD;
324 else if ((c0 & 0xFC00) == 0xDC00) // Lower bits of nutcase UTF-16 encoded as UTF-8
326 // Replacement character �
327 return 0xFFFD;
329 else if (c0 < 0x0800)
331 // Invalid encoding
332 // Replacement character �
333 return 0xFFFD;
336 else
338 // Replacement character �
339 return 0xFFFD;
342 else if (c0 < 0x80)
344 // Invalid encoding
345 // Replacement character �
346 return 0xFFFD;
349 else
351 // Replacement character �
352 return 0xFFFD;
355 return c0;
358 u32char CUtfStringView::utf16Iterator(const void **addr)
360 // Decode UTF-16
361 // This implementation makes no attempt at fixing bad encoding
362 const uint16 **pp = reinterpret_cast<const uint16 **>(addr);
363 u32char c0 = **pp;
364 ++(*pp);
365 if ((c0 & 0xFC00) == 0xD800) // Higher bits
367 uint16 c1 = **pp;
368 if ((c1 & 0xFC00) == 0xDC00) // Lower bits
370 ++(*pp);
371 c0 &= 0x03FF;
372 c0 <<= 10;
373 c0 |= (c1 & 0x03FF);
374 c0 += 0x10000;
377 return c0;
380 u32char CUtfStringView::utf32Iterator(const void **addr)
382 // UTF-32
383 // This implementation makes no attempt at fixing bad encoding
384 const u32char **pp = reinterpret_cast<const u32char **>(addr);
385 u32char c = **pp;
386 ++(*pp);
387 return c;
390 } /* namespace NLMISC */
392 /* end of file */