2 * IPRT - Unicode Code Points.
6 * Copyright (C) 2006-2024 Oracle and/or its affiliates.
8 * This file is part of VirtualBox base platform packages, as
9 * available from https://www.virtualbox.org.
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation, in version 3 of the
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 * The contents of this file may alternatively be used under the terms
25 * of the Common Development and Distribution License Version 1.0
26 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
27 * in the VirtualBox distribution, in which case the provisions of the
28 * CDDL are applicable instead of those of the GPL.
30 * You may elect to license modified versions of this file under the
31 * terms and conditions of either the GPL or the CDDL or both.
33 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
36 #ifndef IPRT_INCLUDED_uni_h
37 #define IPRT_INCLUDED_uni_h
38 #ifndef RT_WITHOUT_PRAGMA_ONCE
42 /** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
47 /** @def RTUNI_USE_WCTYPE
48 * Define RTUNI_USE_WCTYPE to not use the IPRT unicode data but the
49 * data which the C runtime library provides. */
50 #ifdef DOXYGEN_RUNNING
51 # define RTUNI_USE_WCTYPE
54 #include <iprt/types.h>
55 #ifdef RTUNI_USE_WCTYPE
62 #ifndef RTUNI_USE_WCTYPE
65 * A unicode flags range.
68 typedef struct RTUNIFLAGSRANGE
70 /** The first code point of the range. */
72 /** The last + 1 code point of the range. */
74 /** Pointer to the array of case folded code points. */
75 const uint8_t *pafFlags
;
77 /** Pointer to a flags range.
79 typedef RTUNIFLAGSRANGE
*PRTUNIFLAGSRANGE
;
80 /** Pointer to a const flags range.
82 typedef const RTUNIFLAGSRANGE
*PCRTUNIFLAGSRANGE
;
85 * A unicode case folded range.
88 typedef struct RTUNICASERANGE
90 /** The first code point of the range. */
92 /** The last + 1 code point of the range. */
94 /** Pointer to the array of case folded code points. */
95 PCRTUNICP paFoldedCPs
;
97 /** Pointer to a case folded range.
99 typedef RTUNICASERANGE
*PRTUNICASERANGE
;
100 /** Pointer to a const case folded range.
102 typedef const RTUNICASERANGE
*PCRTUNICASERANGE
;
/** @name Unicode Code Point Flags.
 * Bits of the flag mask returned by rtUniCpFlags().
 * @{ */
/** Upper case; tested by RTUniCpIsUpper() and RTUniCpIsFoldable(). */
#define RTUNI_UPPER     RT_BIT(0)
/** Lower case; tested by RTUniCpIsLower() and RTUniCpIsFoldable(). */
#define RTUNI_LOWER     RT_BIT(1)
/** Alphabetic; tested by RTUniCpIsAlphabetic(). */
#define RTUNI_ALPHA     RT_BIT(2)
/** Hexadecimal digit; tested by RTUniCpIsHexDigit(). */
#define RTUNI_XDIGIT    RT_BIT(3)
/** Decimal digit; tested by RTUniCpIsDecDigit(). */
#define RTUNI_DDIGIT    RT_BIT(4)
/** White space; tested by RTUniCpIsSpace(). */
#define RTUNI_WSPACE    RT_BIT(5)
/*#define RTUNI_BSPACE  RT_BIT(6) - later */
/** When set, the codepoint requires further checking wrt NFC and NFD
 * normalization.  I.e. set when either of QC_NFD and QC_NFC are not Y. */
#define RTUNI_QC_NFX    RT_BIT(7)
/** @} */
121 * Array of flags ranges.
124 extern RTDATADECL(const RTUNIFLAGSRANGE
) g_aRTUniFlagsRanges
[];
127 * Gets the flags for a unicode code point.
129 * @returns The flag mask. (RTUNI_*)
130 * @param CodePoint The unicode code point.
133 DECLINLINE(RTUNICP
) rtUniCpFlags(RTUNICP CodePoint
)
135 PCRTUNIFLAGSRANGE pCur
= &g_aRTUniFlagsRanges
[0];
138 if (pCur
->EndCP
> CodePoint
)
140 if (pCur
->BeginCP
<= CodePoint
)
141 return pCur
->pafFlags
[CodePoint
- pCur
->BeginCP
];
145 } while (pCur
->EndCP
!= RTUNICP_MAX
);
151 * Checks if a unicode code point is upper case.
153 * @returns true if it is.
154 * @returns false if it isn't.
155 * @param CodePoint The code point.
157 DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint
)
159 return (rtUniCpFlags(CodePoint
) & RTUNI_UPPER
) != 0;
164 * Checks if a unicode code point is lower case.
166 * @returns true if it is.
167 * @returns false if it isn't.
168 * @param CodePoint The code point.
170 DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint
)
172 return (rtUniCpFlags(CodePoint
) & RTUNI_LOWER
) != 0;
177 * Checks if a unicode code point is case foldable.
179 * @returns true if it is.
180 * @returns false if it isn't.
181 * @param CodePoint The code point.
183 DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint
)
186 return (rtUniCpFlags(CodePoint
) & (RTUNI_LOWER
| RTUNI_UPPER
)) != 0;
191 * Checks if a unicode code point is alphabetic.
193 * @returns true if it is.
194 * @returns false if it isn't.
195 * @param CodePoint The code point.
197 DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint
)
199 return (rtUniCpFlags(CodePoint
) & RTUNI_ALPHA
) != 0;
204 * Checks if a unicode code point is a decimal digit.
206 * @returns true if it is.
207 * @returns false if it isn't.
208 * @param CodePoint The code point.
210 DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint
)
212 return (rtUniCpFlags(CodePoint
) & RTUNI_DDIGIT
) != 0;
217 * Checks if a unicode code point is a hexadecimal digit.
219 * @returns true if it is.
220 * @returns false if it isn't.
221 * @param CodePoint The code point.
223 DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint
)
225 return (rtUniCpFlags(CodePoint
) & RTUNI_XDIGIT
) != 0;
230 * Checks if a unicode code point is white space.
232 * @returns true if it is.
233 * @returns false if it isn't.
234 * @param CodePoint The code point.
236 DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint
)
238 return (rtUniCpFlags(CodePoint
) & RTUNI_WSPACE
) != 0;
244 * Array of uppercase ranges.
247 extern RTDATADECL(const RTUNICASERANGE
) g_aRTUniUpperRanges
[];
250 * Array of lowercase ranges.
253 extern RTDATADECL(const RTUNICASERANGE
) g_aRTUniLowerRanges
[];
257 * Folds a unicode code point using the specified range array.
259 * @returns FOlded code point.
260 * @param CodePoint The unicode code point to fold.
261 * @param pCur The case folding range to use.
263 DECLINLINE(RTUNICP
) rtUniCpFold(RTUNICP CodePoint
, PCRTUNICASERANGE pCur
)
267 if (pCur
->EndCP
> CodePoint
)
269 if (pCur
->BeginCP
<= CodePoint
)
270 CodePoint
= pCur
->paFoldedCPs
[CodePoint
- pCur
->BeginCP
];
274 } while (pCur
->EndCP
!= RTUNICP_MAX
);
280 * Folds a unicode code point to upper case.
282 * @returns Folded code point.
283 * @param CodePoint The unicode code point to fold.
285 DECLINLINE(RTUNICP
) RTUniCpToUpper(RTUNICP CodePoint
)
287 return rtUniCpFold(CodePoint
, &g_aRTUniUpperRanges
[0]);
292 * Folds a unicode code point to lower case.
294 * @returns Folded code point.
295 * @param CodePoint The unicode code point to fold.
297 DECLINLINE(RTUNICP
) RTUniCpToLower(RTUNICP CodePoint
)
299 return rtUniCpFold(CodePoint
, &g_aRTUniLowerRanges
[0]);
303 #else /* RTUNI_USE_WCTYPE */
307 * Checks if a unicode code point is upper case.
309 * @returns true if it is.
310 * @returns false if it isn't.
311 * @param CodePoint The code point.
313 DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint
)
315 return !!iswupper(CodePoint
);
320 * Checks if a unicode code point is lower case.
322 * @returns true if it is.
323 * @returns false if it isn't.
324 * @param CodePoint The code point.
326 DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint
)
328 return !!iswlower(CodePoint
);
333 * Checks if a unicode code point is case foldable.
335 * @returns true if it is.
336 * @returns false if it isn't.
337 * @param CodePoint The code point.
339 DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint
)
342 return iswupper(CodePoint
) || iswlower(CodePoint
);
347 * Checks if a unicode code point is alphabetic.
349 * @returns true if it is.
350 * @returns false if it isn't.
351 * @param CodePoint The code point.
353 DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint
)
355 return !!iswalpha(CodePoint
);
360 * Checks if a unicode code point is a decimal digit.
362 * @returns true if it is.
363 * @returns false if it isn't.
364 * @param CodePoint The code point.
366 DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint
)
368 return !!iswdigit(CodePoint
);
373 * Checks if a unicode code point is a hexadecimal digit.
375 * @returns true if it is.
376 * @returns false if it isn't.
377 * @param CodePoint The code point.
379 DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint
)
381 return !!iswxdigit(CodePoint
);
386 * Checks if a unicode code point is white space.
388 * @returns true if it is.
389 * @returns false if it isn't.
390 * @param CodePoint The code point.
392 DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint
)
394 return !!iswspace(CodePoint
);
399 * Folds a unicode code point to upper case.
401 * @returns Folded code point.
402 * @param CodePoint The unicode code point to fold.
404 DECLINLINE(RTUNICP
) RTUniCpToUpper(RTUNICP CodePoint
)
406 return towupper(CodePoint
);
411 * Folds a unicode code point to lower case.
413 * @returns Folded code point.
414 * @param CodePoint The unicode code point to fold.
416 DECLINLINE(RTUNICP
) RTUniCpToLower(RTUNICP CodePoint
)
418 return towlower(CodePoint
);
422 #endif /* RTUNI_USE_WCTYPE */
426 * Frees a unicode string.
428 * @param pusz The string to free.
430 RTDECL(void) RTUniFree(PRTUNICP pusz
);
434 * Checks if a code point valid.
436 * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
437 * except surrogates will be considered valid code points by this function.
439 * @returns true if in range, false if not.
440 * @param CodePoint The unicode code point to validate.
442 DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint
)
444 return CodePoint
<= 0x00d7ff
445 || ( CodePoint
<= 0x10ffff
446 && CodePoint
>= 0x00e000);
451 * Checks if the given code point is in the BMP range.
453 * Surrogates are not considered in the BMP range by this function.
455 * @returns true if in BMP, false if not.
456 * @param CodePoint The unicode code point to consider.
458 DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint
)
460 return CodePoint
<= 0xd7ff
461 || ( CodePoint
<= 0xffff
462 && CodePoint
>= 0xe000);
467 * Folds a unicode code point to lower case.
469 * @returns Folded code point.
470 * @param CodePoint The unicode code point to fold.
472 DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint
)
474 if (CodePoint
< 0x80)
477 + (CodePoint
>= 0x00000800)
478 + (CodePoint
>= 0x00010000)
479 + (CodePoint
>= 0x00200000)
480 + (CodePoint
>= 0x04000000)
481 + (CodePoint
>= 0x80000000) /* illegal */;
490 #endif /* !IPRT_INCLUDED_uni_h */