1 // NeL - MMORPG Framework <https://wiki.ryzom.dev/>
2 // Copyright (C) 2020 Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #include <nel/misc/utf_string_view.h>
21 #include <nel/misc/stream.h>
24 // - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
25 // - https://www.compart.com/en/unicode/U+1F30D
26 // - 0xF0 0x9F 0x8C 0x8D
// Appends the UTF-8 encoding of code point `c` to `str`.
// The visible branches emit:
//   - two bytes:   0xC0 | bits 6-10, then 0x80 | bits 0-5
//   - three bytes (c < 0x010000): 0xE0 | bits 12-15, then two continuation bytes
//   - four bytes  (c < 0x110000): 0xF0 | bits 18-20, then three continuation bytes
// Every continuation byte is 0x80 | six payload bits.
// A final fallback is labelled "Replacement character".
// NOTE(review): the conditions guarding the first branches and the fallback body
// are not visible in this excerpt.
// Why the casts look odd: `(char)(x) | 0xC0` applies the cast before the OR, so the
// right-hand side is an int that `+=` converts back to char.
33 NL_FORCE_INLINE
void appendUtf8(std::string
&str
, u32char c
)
// Two-byte form: lead byte, then one continuation byte
43 str
+= (char)((c
& 0x07C0) >> 6) | 0xC0;
44 str
+= (char)(c
& 0x3F) | 0x80;
46 else if (c
< 0x010000)
// Three-byte form: lead byte, then two continuation bytes
49 str
+= (char)((c
& 0xF000) >> 12) | 0xE0;
50 str
+= (char)((c
& 0x0FC0) >> 6) | 0x80;
51 str
+= (char)(c
& 0x3F) | 0x80;
53 else if (c
< 0x110000)
// Four-byte form: lead byte, then three continuation bytes
56 str
+= (char)((c
& 0x1C0000) >> 18) | 0xF0;
57 str
+= (char)((c
& 0x03F000) >> 12) | 0x80;
58 str
+= (char)((c
& 0x0FC0) >> 6) | 0x80;
59 str
+= (char)(c
& 0x3F) | 0x80;
63 // Replacement character �
// Overload of append taking a std::string destination and a code point.
// NOTE(review): the body is not visible in this excerpt; presumably it forwards
// to appendUtf8 — confirm.
68 void CUtfStringView::append(std::string
&str
, u32char c
)
// Overload of append taking a stream destination and a code point.
// Asserts that `s` is not in reading mode, then serializes the tmp.size() bytes
// of `tmp` to `s` as a raw buffer.
// NOTE(review): how `tmp` is filled is not visible here; presumably it holds the
// UTF-8 encoding of `c` — confirm.
73 void CUtfStringView::append(IStream
&s
, u32char c
)
75 nlassert(!s
.isReading());
79 s
.serialBuffer((uint8
*)&tmp
[0], tmp
.size());
// Reads one UTF-8 encoded code point from stream `s` and returns it decoded.
// The stream must be in reading mode (asserted).
// The sequence length `len` is chosen from the byte `c`; the remaining len - 1
// bytes are read into `tmp` starting at index 1, and the buffer is then decoded
// through utf8Iterator.
// NOTE(review): the statements that read the first byte into `tmp`/`c` and size
// `tmp` are not visible in this excerpt.
82 u32char
CUtfStringView::get(IStream
&s
)
84 nlassert(s
.isReading());
95 // Do a fast check of length
// The lead-byte tests must be mutually exclusive: a byte matching the 0xF0 mask
// also matches the 0xE0 mask, so without `else` a four-byte lead byte would be
// overwritten with len = 3 and the stream would be left one byte out of step.
98 if ((c
& 0xF0) == 0xF0) len
= 4;
99 else if ((c
& 0xE0) == 0xE0) len
= 3;
// Read the continuation bytes after the lead byte already stored at tmp[0]
104 s
.serialBuffer((uint8
*)&tmp
[1], len
- 1);
// Decode the buffered sequence
107 const void *str
= tmp
.c_str();
108 return utf8Iterator(&str
);
// Returns the viewed text as a UTF-8 std::string.
// Fast path: when the view already iterates as UTF-8 and `reEncode` is false,
// the m_Size bytes starting at m_Str are copied verbatim.
// Otherwise every code point yielded by the view's iterator is re-encoded into
// `res` with appendUtf8.
111 std::string
CUtfStringView::toUtf8(bool reEncode
) const
113 // Decode UTF and encode UTF-8
114 // This implementation makes no attempt at fixing invalid codepoints
115 if (m_Iterator
== utf8Iterator
&& !reEncode
)
116 return std::string((const char *)m_Str
, (const char *)((ptrdiff_t)m_Str
+ m_Size
));
119 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
121 appendUtf8(res
, *it
);
// Returns the viewed text as a UTF-16 ucstring.
// Fast path: when the view already iterates as UTF-16 and `reEncode` is false,
// the m_Size bytes starting at m_Str are copied verbatim as ucchar units.
// Otherwise `res` reserves m_Size << 1 elements and the view is walked code
// point by code point.
// The visible append lines write a surrogate pair:
//   high = (c >> 10) | 0xD800, low = (c & 0x3FF) | 0xDC00.
// NOTE(review): that formula is only a valid surrogate pair if 0x10000 was
// subtracted from `c` beforehand; that statement and the non-surrogate branch
// are not visible in this excerpt — confirm.
126 ucstring
CUtfStringView::toUtf16(bool reEncode
) const
128 if (m_Iterator
== utf16Iterator
&& !reEncode
)
129 return ucstring((const ucchar
*)m_Str
, (const ucchar
*)((ptrdiff_t)m_Str
+ m_Size
));
131 res
.reserve(m_Size
<< 1);
132 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
142 res
+= (c
>> 10) | 0xD800;
143 res
+= (c
& 0x3FF) | 0xDC00;
// Returns the viewed text as a UTF-32 ::u32string.
// Fast path: when the view already iterates as UTF-32, the m_Size bytes starting
// at m_Str are copied verbatim as u32char units.
// Otherwise `res` reserves m_Size << 2 elements and the view is walked code
// point by code point (loop body not visible in this excerpt).
// NOTE(review): m_Size is used as a byte count in the pointer arithmetic above,
// so reserving m_Size << 2 elements looks larger than the number of code points
// can ever be — confirm whether this over-reservation is intended.
149 ::u32string
CUtfStringView::toUtf32() const
152 // This implementation makes no attempt at fixing bad encoding
153 if (m_Iterator
== utf32Iterator
)
154 return ::u32string((const u32char
*)m_Str
, (const u32char
*)((ptrdiff_t)m_Str
+ m_Size
));
156 res
.reserve(m_Size
<< 2);
157 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
// Returns a std::string built by walking the view code point by code point.
// NOTE(review): the loop body is not visible in this excerpt, so how code points
// outside the ASCII range are represented cannot be stated here — confirm.
162 std::string
CUtfStringView::toAscii() const
166 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
// Builds a std::string from `str`, visiting it one byte at a time.
// `res` reserves str.size() up front; each element is read as an unsigned char
// so that values >= 0x80 are not seen as negative.
// NOTE(review): what is appended for each byte is not visible in this excerpt.
177 std::string
CUtfStringView::fromAscii(const std::string
&str
)
180 res
.reserve(str
.size());
181 for (std::string::const_iterator
it(str
.begin()), end(str
.end()); it
!= end
; ++it
)
183 unsigned char c
= *it
;
// Returns the viewed text as a std::wstring.
// Two variants are visible:
//   - wchar_t treated as UTF-16 units: raw copy when the view is already UTF-16,
//     otherwise reserve m_Size << 1 and append code points, writing surrogate
//     pairs as (c >> 10) | 0xD800 and (c & 0x3FF) | 0xDC00.
//   - wchar_t treated as UTF-32 units: raw copy when the view is already UTF-32,
//     otherwise reserve m_Size << 2 and append code points.
// NOTE(review): presumably the variant is selected by a preprocessor test on the
// platform's wchar_t width; that test is not visible in this excerpt — confirm.
// NOTE(review): the surrogate formula presumes 0x10000 was subtracted from `c`
// beforehand; that statement is not visible here.
192 std::wstring
CUtfStringView::toWide() const
195 if (m_Iterator
== utf16Iterator
)
196 return std::wstring((const wchar_t *)m_Str
, (const wchar_t *)((ptrdiff_t)m_Str
+ m_Size
));
198 res
.reserve(m_Size
<< 1);
199 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
209 res
+= (c
>> 10) | 0xD800;
210 res
+= (c
& 0x3FF) | 0xDC00;
215 if (m_Iterator
== utf32Iterator
)
216 return std::wstring((const wchar_t *)m_Str
, (const wchar_t *)((ptrdiff_t)m_Str
+ m_Size
));
218 res
.reserve(m_Size
<< 2);
219 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
// Walks the whole view with its iterator, from begin() to end().
// NOTE(review): presumably returns the number of code points visited; the
// counter and return statement are not visible in this excerpt — confirm.
225 size_t CUtfStringView::count() const
228 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
// Walks the view with its iterator and returns a byte distance: the iterator's
// current pointer minus the view's own ptr(), both converted to ptrdiff_t.
// NOTE(review): presumably the return fires once `i` code points have been
// passed; that condition and the result when the end is reached first are not
// visible in this excerpt — confirm.
233 ptrdiff_t CUtfStringView::offset(ptrdiff_t i
)
236 for (iterator
it(begin()), end(this->end()); it
!= end
; ++it
)
239 return (ptrdiff_t)it
.ptr() - (ptrdiff_t)ptr();
// Decodes one code point from UTF-8 input.
// `addr` points to the caller's read pointer, reinterpreted here as a pointer to
// a const uint8 pointer.
// Visible decoding steps: each following byte `cx` must be a continuation byte
// ((cx & 0xC0) == 0x80); its low six bits are OR-ed into the accumulator `c0`
// after the lead-byte marker bits have been masked off.
// Several branches are labelled "Replacement character"; per the comments below
// these cover input that is rejected as invalid.
// A decoded high surrogate (0xD800 range) is checked for a following UTF-8
// encoded low surrogate (0xED, 0xB0-0xBF, continuation byte); a lone low
// surrogate (0xDC00 range) falls into a replacement branch.
// NOTE(review): the statements that advance *pp, shift `c0` and return are not
// visible in this excerpt.
245 u32char
CUtfStringView::utf8Iterator(const void **addr
)
248 // This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
249 // Invalid characters are replaced with the replacement character
250 const uint8
**pp
= reinterpret_cast<const uint8
**>(addr
);
257 // Replacement character �
261 if ((cx
& 0xC0) == 0x80)
264 c0
&= 0x3F; // Drop first two bits
266 c0
|= (cx
& 0x3F); // 12 bits now (6 + 6), 2-byte encoding
270 if ((cx
& 0xC0) == 0x80)
273 c0
&= 0x07FF; // Drop first bit
275 c0
|= (cx
& 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
279 if ((cx
& 0xC0) == 0x80)
282 c0
&= 0xFFFF; // Drop first bit
284 c0
|= (cx
& 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
287 // Replacement character �
290 else if (c0
< 0x10000)
293 // Replacement character �
299 // Replacement character �
303 else if ((c0
& 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8
306 if ((*pp
)[0] == 0xED && ((cx
= (*pp
)[1]) & 0xF0) == 0xB0 && ((cy
= (*pp
)[2]) & 0xC0) == 0x80)
308 // Lower bits of nutcase UTF-16 encoded as UTF-8
310 uint16 c1
= (cx
& 0x0F);
320 // Replacement character �
324 else if ((c0
& 0xFC00) == 0xDC00) // Lower bits of nutcase UTF-16 encoded as UTF-8
326 // Replacement character �
329 else if (c0
< 0x0800)
332 // Replacement character �
338 // Replacement character �
345 // Replacement character �
351 // Replacement character �
// Decodes one code point from UTF-16 input.
// `addr` points to the caller's read pointer, reinterpreted here as a pointer to
// a const uint16 pointer.
// Visible logic: a unit `c0` in the high-surrogate range (0xD800-0xDBFF) is
// checked against a unit `c1` in the low-surrogate range (0xDC00-0xDFFF).
// NOTE(review): how the pair is combined, how *pp is advanced and what happens
// to an unpaired surrogate are not visible in this excerpt.
358 u32char
CUtfStringView::utf16Iterator(const void **addr
)
361 // This implementation makes no attempt at fixing bad encoding
362 const uint16
**pp
= reinterpret_cast<const uint16
**>(addr
);
365 if ((c0
& 0xFC00) == 0xD800) // Higher bits
368 if ((c1
& 0xFC00) == 0xDC00) // Lower bits
// Iterator callback for UTF-32 input.
// `addr` points to the caller's read pointer, reinterpreted here as a pointer to
// a const u32char pointer.
// NOTE(review): presumably returns the current u32char and advances *pp by one
// element; those statements are not visible in this excerpt — confirm.
380 u32char
CUtfStringView::utf32Iterator(const void **addr
)
383 // This implementation makes no attempt at fixing bad encoding
384 const u32char
**pp
= reinterpret_cast<const u32char
**>(addr
);
390 } /* namespace NLMISC */