2 * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3 * Distributed under the terms of the MIT License.
6 * Axel Dörfler, axeld@pinc-software.de
7 * Siarzhuk Zharski, zharik@gmx.li
12 #include <UnicodeChar.h>
14 #include <unicode/uchar.h>
15 #include <unicode/utf8.h>
18 BUnicodeChar::BUnicodeChar()
23 // Returns the general category value for the code point.
25 BUnicodeChar::Type(uint32 c
)
32 // Determines whether the specified code point is a letter character.
33 // True for general categories "L" (letters).
35 BUnicodeChar::IsAlpha(uint32 c
)
42 // Determines whether the specified code point is an alphanumeric character.
44 // True for characters with general categories
45 // "L" (letters) and "Nd" (decimal digit numbers).
47 BUnicodeChar::IsAlNum(uint32 c
)
54 // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE).
56 BUnicodeChar::IsLower(uint32 c
)
59 return u_isULowercase(c
);
63 // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE).
65 BUnicodeChar::IsUpper(uint32 c
)
68 return u_isUUppercase(c
);
72 // Determines whether the specified code point is a titlecase letter.
73 // True for general category "Lt" (titlecase letter).
75 BUnicodeChar::IsTitle(uint32 c
)
82 // Determines whether the specified code point is a digit character.
83 // True for characters with general category "Nd" (decimal digit numbers).
84 // Beginning with Unicode 4, this is the same as
85 // testing for the Numeric_Type of Decimal.
87 BUnicodeChar::IsDigit(uint32 c
)
94 // Determines whether the specified code point is a hexadecimal digit.
95 // This is equivalent to u_digit(c, 16)>=0.
96 // True for characters with general category "Nd" (decimal digit numbers)
97 // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
98 // (That is, for letters with code points
99 // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
101 BUnicodeChar::IsHexDigit(uint32 c
)
104 return u_isxdigit(c
);
108 // Determines whether the specified code point is "defined",
109 // which usually means that it is assigned a character.
110 // True for general categories other than "Cn" (other, not assigned),
111 // i.e., true for all code points mentioned in UnicodeData.txt.
113 BUnicodeChar::IsDefined(uint32 c
)
116 return u_isdefined(c
);
120 // Determines whether the specified code point is a base character.
121 // True for general categories "L" (letters), "N" (numbers),
122 // "Mc" (spacing combining marks), and "Me" (enclosing marks).
124 BUnicodeChar::IsBase(uint32 c
)
131 // Determines whether the specified code point is a control character
132 // (as defined by this function).
133 // A control character is one of the following:
134 // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
135 // - U_CONTROL_CHAR (Cc)
136 // - U_FORMAT_CHAR (Cf)
137 // - U_LINE_SEPARATOR (Zl)
138 // - U_PARAGRAPH_SEPARATOR (Zp)
140 BUnicodeChar::IsControl(uint32 c
)
147 // Determines whether the specified code point is a punctuation character.
148 // True for characters with general categories "P" (punctuation).
150 BUnicodeChar::IsPunctuation(uint32 c
)
157 // Determines if the specified code point is a space character according to Java.
158 // True for characters with general categories "Z" (separators),
159 // which does not include control codes (e.g., TAB or Line Feed).
161 BUnicodeChar::IsSpace(uint32 c
)
164 return u_isJavaSpaceChar(c
);
168 // Determines if the specified code point is a whitespace character.
169 // A character is considered to be a whitespace character if and only
170 // if it satisfies one of the following criteria:
171 // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
172 // but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
173 // or U+202F Narrow NBSP).
174 // - It is U+0009 HORIZONTAL TABULATION.
175 // - It is U+000A LINE FEED.
176 // - It is U+000B VERTICAL TABULATION.
177 // - It is U+000C FORM FEED.
178 // - It is U+000D CARRIAGE RETURN.
179 // - It is U+001C FILE SEPARATOR.
180 // - It is U+001D GROUP SEPARATOR.
181 // - It is U+001E RECORD SEPARATOR.
182 // - It is U+001F UNIT SEPARATOR.
184 BUnicodeChar::IsWhitespace(uint32 c
)
187 return u_isWhitespace(c
);
191 // Determines whether the specified code point is a printable character.
192 // True for general categories other than "C" (controls).
194 BUnicodeChar::IsPrintable(uint32 c
)
204 BUnicodeChar::ToLower(uint32 c
)
212 BUnicodeChar::ToUpper(uint32 c
)
220 BUnicodeChar::ToTitle(uint32 c
)
228 BUnicodeChar::DigitValue(uint32 c
)
231 return u_digit(c
, 10);
235 unicode_east_asian_width
236 BUnicodeChar::EastAsianWidth(uint32 c
)
238 return (unicode_east_asian_width
)u_getIntPropertyValue(c
,
239 UCHAR_EAST_ASIAN_WIDTH
);
244 BUnicodeChar::ToUTF8(uint32 c
, char** out
)
247 U8_APPEND_UNSAFE(*out
, i
, c
);
253 BUnicodeChar::FromUTF8(const char** in
)
257 U8_NEXT_UNSAFE(*in
, i
, c
);
265 BUnicodeChar::UTF8StringLength(const char* string
)
277 BUnicodeChar::UTF8StringLength(const char* string
, size_t maxLength
)
280 while (len
< maxLength
&& *string
) {