6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Unicode regular expressions on top of the ICU Library.
18 #define BOOST_REGEX_SOURCE
20 #include <boost/regex/config.hpp>
22 #define BOOST_REGEX_ICU_INSTANTIATE
23 #include <boost/regex/icu.hpp>
26 #pragma warning(disable:981 2259 383)
// Computes a collation sort key for the characters in [p1, p2) using the ICU
// Collator pointed to by pcoll, and returns the key bytes as a string_type.
//
// p1, p2 : begin / end of the input character range.
// pcoll  : collator whose getSortKey() is used to generate the key.
33 icu_regex_traits_implementation::string_type
icu_regex_traits_implementation::do_transform(const char_type
* p1
, const char_type
* p2
, const U_NAMESPACE_QUALIFIER Collator
* pcoll
) const
35 // TODO: make this thread safe.
// Iterator adaptor type over the input range that yields ::UChar values.
// NOTE(review): presumably this re-encodes UTF-32 input as UTF-16 for ICU - confirm.
36 typedef u32_to_u16_iterator
<const char_type
*, ::UChar
> itt
;
// When templated iterator constructors are available, the ::UChar buffer t is
// built directly from the iterator pair (i, j); otherwise t starts out empty.
// NOTE(review): i and j are presumably itt(p1) / itt(p2) - confirm.
38 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
39 std::vector
< ::UChar
> t(i
, j
);
41 std::vector
< ::UChar
> t
;
// First attempt: write the sort key into a fixed 100-byte stack buffer.
45 ::uint8_t result
[100];
// Sort key for the contents of t ...
48 len
= pcoll
->getSortKey(&*t
.begin(), static_cast< ::int32_t>(t
.size()), result
, sizeof(result
));
// ... or for a null / zero-length source.
// NOTE(review): presumably taken when t is empty, so that &*t.begin() is never evaluated - confirm.
50 len
= pcoll
->getSortKey(static_cast<UChar
const*>(0), static_cast< ::int32_t>(0), result
, sizeof(result
));
// The reported key length exceeds the stack buffer: retry with a heap buffer.
51 if(std::size_t(len
) > sizeof(result
))
// Heap buffer of len+1 bytes, owned by scoped_array.
53 scoped_array
< ::uint8_t> presult(new ::uint8_t[len
+1]);
// Same two getSortKey variants as above, now writing into presult.
55 len
= pcoll
->getSortKey(&*t
.begin(), static_cast< ::int32_t>(t
.size()), presult
.get(), len
+1);
57 len
= pcoll
->getSortKey(static_cast<UChar
const*>(0), static_cast< ::int32_t>(0), presult
.get(), len
+1);
// Tests whether the last key byte is zero and the key is longer than one byte.
58 if((0 == presult
[len
-1]) && (len
> 1))
// Return the key bytes [presult, presult+len) as a string_type, either via the
// iterator-range constructor or by appending one byte at a time to sresult.
60 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
61 return string_type(presult
.get(), presult
.get()+len
);
64 ::uint8_t const* ia
= presult
.get();
65 ::uint8_t const* ib
= presult
.get()+len
;
67 sresult
.push_back(*ia
++);
// Stack-buffer variant of the trailing-zero test.
// NOTE(review): result[len-1] is read before (len > 1) is evaluated; if
// getSortKey can report a length of 0 this indexes result[-1]. Consider
// testing len first - confirm whether len == 0 is reachable here.
71 if((0 == result
[len
-1]) && (len
> 1))
// Return the key bytes [result, result+len) as a string_type, as above.
73 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
74 return string_type(result
, result
+len
);
77 ::uint8_t const* ia
= result
;
78 ::uint8_t const* ib
= result
+len
;
80 sresult
.push_back(*ia
++);
// Takes a pointer p to a char_type sequence and returns a size_type.
// NOTE(review): presumably counts the characters before a terminating zero - confirm against the body.
87 icu_regex_traits::size_type
icu_regex_traits::length(const char_type
* p
)
99 // define our bitmasks:
// Definitions of the icu_regex_traits mask_* constants. Each one is a
// char_class_type with exactly one bit set, positioned by the matching
// offset_* value (the offset_* values are declared outside this file section).
101 const icu_regex_traits::char_class_type
icu_regex_traits::mask_blank
= icu_regex_traits::char_class_type(1) << offset_blank
;
102 const icu_regex_traits::char_class_type
icu_regex_traits::mask_space
= icu_regex_traits::char_class_type(1) << offset_space
;
103 const icu_regex_traits::char_class_type
icu_regex_traits::mask_xdigit
= icu_regex_traits::char_class_type(1) << offset_xdigit
;
104 const icu_regex_traits::char_class_type
icu_regex_traits::mask_underscore
= icu_regex_traits::char_class_type(1) << offset_underscore
;
105 const icu_regex_traits::char_class_type
icu_regex_traits::mask_unicode
= icu_regex_traits::char_class_type(1) << offset_unicode
;
106 const icu_regex_traits::char_class_type
icu_regex_traits::mask_any
= icu_regex_traits::char_class_type(1) << offset_any
;
107 const icu_regex_traits::char_class_type
icu_regex_traits::mask_ascii
= icu_regex_traits::char_class_type(1) << offset_ascii
;
108 const icu_regex_traits::char_class_type
icu_regex_traits::mask_horizontal
= icu_regex_traits::char_class_type(1) << offset_horizontal
;
109 const icu_regex_traits::char_class_type
icu_regex_traits::mask_vertical
= icu_regex_traits::char_class_type(1) << offset_vertical
;
// Maps the property name held in [p1, p2) to its character class mask.
//
// The name is searched for in range_data with std::lower_bound; when an entry
// compares equal, the icu_class_map element at the same index is returned.
//
// p1, p2 : begin / end of the property name, as ::UChar32 code points.
111 icu_regex_traits::char_class_type
icu_regex_traits::lookup_icu_mask(const ::UChar32
* p1
, const ::UChar32
* p2
)
// All property names stored back to back with no separators between them;
// the individual names are delimited by the pointer pairs in range_data.
113 static const ::UChar32 prop_name_table
[] = {
114 /* any */ 'a', 'n', 'y',
115 /* ascii */ 'a', 's', 'c', 'i', 'i',
116 /* assigned */ 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
120 /* closepunctuation */ 'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
123 /* connectorpunctuation */ 'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
124 /* control */ 'c', 'o', 'n', 't', 'r', 'o', 'l',
126 /* currencysymbol */ 'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l',
127 /* dashpunctuation */ 'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
128 /* decimaldigitnumber */ 'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r',
129 /* enclosingmark */ 'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
130 /* finalpunctuation */ 'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
131 /* format */ 'f', 'o', 'r', 'm', 'a', 't',
132 /* initialpunctuation */ 'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
134 /* letter */ 'l', 'e', 't', 't', 'e', 'r',
135 /* letternumber */ 'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
136 /* lineseparator */ 'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
140 /* lowercaseletter */ 'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
144 /* mark */ 'm', 'a', 'r', 'k',
145 /* mathsymbol */ 'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l',
149 /* modifierletter */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
150 /* modifiersymbol */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
155 /* nonspacingmark */ 'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
156 /* notassigned */ 'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
157 /* number */ 'n', 'u', 'm', 'b', 'e', 'r',
158 /* openpunctuation */ 'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
159 /* other */ 'o', 't', 'h', 'e', 'r',
160 /* otherletter */ 'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
161 /* othernumber */ 'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
162 /* otherpunctuation */ 'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
163 /* othersymbol */ 'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
165 /* paragraphseparator */ 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
172 /* privateuse */ 'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e',
174 /* punctuation */ 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
177 /* separator */ 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
181 /* spaceseparator */ 's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
182 /* spacingcombiningmark */ 's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
183 /* surrogate */ 's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e',
184 /* symbol */ 's', 'y', 'm', 'b', 'o', 'l',
185 /* titlecase */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e',
186 /* titlecaseletter */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
187 /* uppercaseletter */ 'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
// One [begin, end) pointer pair into prop_name_table per property name; the
// trailing comment on each entry gives the name the pair delimits, and the
// width of each pair equals the length of that name.
// The table is searched with std::lower_bound below, so the entries have to
// stay ordered consistently with character_pointer_range's comparison.
194 static const re_detail::character_pointer_range
< ::UChar32
> range_data
[] = {
195 { prop_name_table
+0, prop_name_table
+3, }, // any
196 { prop_name_table
+3, prop_name_table
+8, }, // ascii
197 { prop_name_table
+8, prop_name_table
+16, }, // assigned
198 { prop_name_table
+16, prop_name_table
+18, }, // c*
199 { prop_name_table
+18, prop_name_table
+20, }, // cc
200 { prop_name_table
+20, prop_name_table
+22, }, // cf
201 { prop_name_table
+22, prop_name_table
+38, }, // closepunctuation
202 { prop_name_table
+38, prop_name_table
+40, }, // cn
203 { prop_name_table
+40, prop_name_table
+42, }, // co
204 { prop_name_table
+42, prop_name_table
+62, }, // connectorpunctuation
205 { prop_name_table
+62, prop_name_table
+69, }, // control
206 { prop_name_table
+69, prop_name_table
+71, }, // cs
207 { prop_name_table
+71, prop_name_table
+85, }, // currencysymbol
208 { prop_name_table
+85, prop_name_table
+100, }, // dashpunctuation
209 { prop_name_table
+100, prop_name_table
+118, }, // decimaldigitnumber
210 { prop_name_table
+118, prop_name_table
+131, }, // enclosingmark
211 { prop_name_table
+131, prop_name_table
+147, }, // finalpunctuation
212 { prop_name_table
+147, prop_name_table
+153, }, // format
213 { prop_name_table
+153, prop_name_table
+171, }, // initialpunctuation
214 { prop_name_table
+171, prop_name_table
+173, }, // l*
215 { prop_name_table
+173, prop_name_table
+179, }, // letter
216 { prop_name_table
+179, prop_name_table
+191, }, // letternumber
217 { prop_name_table
+191, prop_name_table
+204, }, // lineseparator
218 { prop_name_table
+204, prop_name_table
+206, }, // ll
219 { prop_name_table
+206, prop_name_table
+208, }, // lm
220 { prop_name_table
+208, prop_name_table
+210, }, // lo
221 { prop_name_table
+210, prop_name_table
+225, }, // lowercaseletter
222 { prop_name_table
+225, prop_name_table
+227, }, // lt
223 { prop_name_table
+227, prop_name_table
+229, }, // lu
224 { prop_name_table
+229, prop_name_table
+231, }, // m*
225 { prop_name_table
+231, prop_name_table
+235, }, // mark
226 { prop_name_table
+235, prop_name_table
+245, }, // mathsymbol
227 { prop_name_table
+245, prop_name_table
+247, }, // mc
228 { prop_name_table
+247, prop_name_table
+249, }, // me
229 { prop_name_table
+249, prop_name_table
+251, }, // mn
230 { prop_name_table
+251, prop_name_table
+265, }, // modifierletter
231 { prop_name_table
+265, prop_name_table
+279, }, // modifiersymbol
232 { prop_name_table
+279, prop_name_table
+281, }, // n*
233 { prop_name_table
+281, prop_name_table
+283, }, // nd
234 { prop_name_table
+283, prop_name_table
+285, }, // nl
235 { prop_name_table
+285, prop_name_table
+287, }, // no
236 { prop_name_table
+287, prop_name_table
+301, }, // nonspacingmark
237 { prop_name_table
+301, prop_name_table
+312, }, // notassigned
238 { prop_name_table
+312, prop_name_table
+318, }, // number
239 { prop_name_table
+318, prop_name_table
+333, }, // openpunctuation
240 { prop_name_table
+333, prop_name_table
+338, }, // other
241 { prop_name_table
+338, prop_name_table
+349, }, // otherletter
242 { prop_name_table
+349, prop_name_table
+360, }, // othernumber
243 { prop_name_table
+360, prop_name_table
+376, }, // otherpunctuation
244 { prop_name_table
+376, prop_name_table
+387, }, // othersymbol
245 { prop_name_table
+387, prop_name_table
+389, }, // p*
246 { prop_name_table
+389, prop_name_table
+407, }, // paragraphseparator
247 { prop_name_table
+407, prop_name_table
+409, }, // pc
248 { prop_name_table
+409, prop_name_table
+411, }, // pd
249 { prop_name_table
+411, prop_name_table
+413, }, // pe
250 { prop_name_table
+413, prop_name_table
+415, }, // pf
251 { prop_name_table
+415, prop_name_table
+417, }, // pi
252 { prop_name_table
+417, prop_name_table
+419, }, // po
253 { prop_name_table
+419, prop_name_table
+429, }, // privateuse
254 { prop_name_table
+429, prop_name_table
+431, }, // ps
255 { prop_name_table
+431, prop_name_table
+442, }, // punctuation
256 { prop_name_table
+442, prop_name_table
+444, }, // s*
257 { prop_name_table
+444, prop_name_table
+446, }, // sc
258 { prop_name_table
+446, prop_name_table
+455, }, // separator
259 { prop_name_table
+455, prop_name_table
+457, }, // sk
260 { prop_name_table
+457, prop_name_table
+459, }, // sm
261 { prop_name_table
+459, prop_name_table
+461, }, // so
262 { prop_name_table
+461, prop_name_table
+475, }, // spaceseparator
263 { prop_name_table
+475, prop_name_table
+495, }, // spacingcombiningmark
264 { prop_name_table
+495, prop_name_table
+504, }, // surrogate
265 { prop_name_table
+504, prop_name_table
+510, }, // symbol
266 { prop_name_table
+510, prop_name_table
+519, }, // titlecase
267 { prop_name_table
+519, prop_name_table
+534, }, // titlecaseletter
268 { prop_name_table
+534, prop_name_table
+549, }, // uppercaseletter
269 { prop_name_table
+549, prop_name_table
+551, }, // z*
270 { prop_name_table
+551, prop_name_table
+553, }, // zl
271 { prop_name_table
+553, prop_name_table
+555, }, // zp
272 { prop_name_table
+555, prop_name_table
+557, }, // zs
// Class mask for each property name. This table is indexed with the position
// of the matching range_data entry, so the two tables must list the names in
// the same order.
275 static const icu_regex_traits::char_class_type icu_class_map
[] = {
276 icu_regex_traits::mask_any
, // any
277 icu_regex_traits::mask_ascii
, // ascii
278 (0x3FFFFFFFu
) & ~(U_GC_CN_MASK
), // assigned
282 U_GC_PE_MASK
, // closepunctuation
285 U_GC_PC_MASK
, // connectorpunctuation
286 U_GC_CC_MASK
, // control
288 U_GC_SC_MASK
, // currencysymbol
289 U_GC_PD_MASK
, // dashpunctuation
290 U_GC_ND_MASK
, // decimaldigitnumber
291 U_GC_ME_MASK
, // enclosingmark
292 U_GC_PF_MASK
, // finalpunctuation
293 U_GC_CF_MASK
, // format
294 U_GC_PI_MASK
, // initialpunctuation
296 U_GC_L_MASK
, // letter
297 U_GC_NL_MASK
, // letternumber
298 U_GC_ZL_MASK
, // lineseparator
302 U_GC_LL_MASK
, // lowercaseletter
307 U_GC_SM_MASK
, // mathsymbol
311 U_GC_LM_MASK
, // modifierletter
312 U_GC_SK_MASK
, // modifiersymbol
317 U_GC_MN_MASK
, // nonspacingmark
318 U_GC_CN_MASK
, // notassigned
319 U_GC_N_MASK
, // number
320 U_GC_PS_MASK
, // openpunctuation
321 U_GC_C_MASK
, // other
322 U_GC_LO_MASK
, // otherletter
323 U_GC_NO_MASK
, // othernumber
324 U_GC_PO_MASK
, // otherpunctuation
325 U_GC_SO_MASK
, // othersymbol
327 U_GC_ZP_MASK
, // paragraphseparator
334 U_GC_CO_MASK
, // privateuse
336 U_GC_P_MASK
, // punctuation
339 U_GC_Z_MASK
, // separator
343 U_GC_ZS_MASK
, // spaceseparator
344 U_GC_MC_MASK
, // spacingcombiningmark
345 U_GC_CS_MASK
, // surrogate
346 U_GC_S_MASK
, // symbol
347 U_GC_LT_MASK
, // titlecase
348 U_GC_LT_MASK
, // titlecaseletter
349 U_GC_LU_MASK
, // uppercaseletter
// Begin / one-past-the-end pointers over range_data; the element count is
// derived from the array size.
357 static const re_detail::character_pointer_range
< ::UChar32
>* ranges_begin
= range_data
;
358 static const re_detail::character_pointer_range
< ::UChar32
>* ranges_end
= range_data
+ (sizeof(range_data
)/sizeof(range_data
[0]));
// Wrap the caller's name in the same range type so that it can be compared
// against the table entries.
360 re_detail::character_pointer_range
< ::UChar32
> t
= { p1
, p2
, };
// Binary search for the first entry that does not order before t.
361 const re_detail::character_pointer_range
< ::UChar32
>* p
= std::lower_bound(ranges_begin
, ranges_end
, t
);
// lower_bound only yields an insertion point, so an exact match still has to
// be confirmed before the index is used to read icu_class_map.
362 if((p
!= ranges_end
) && (t
== *p
))
363 return icu_class_map
[p
- ranges_begin
];
// Resolves the character class name in [p1, p2) to a char_class_type mask.
//
// Two lookups are made on the name as given: get_default_class_id() and
// lookup_icu_mask(). A copy of the name is then normalised (lower-cased, with
// whitespace, '-' and '_' removed) and both lookups are repeated on the copy.
//
// p1, p2 : begin / end of the class name.
367 icu_regex_traits::char_class_type
icu_regex_traits::lookup_classname(const char_type
* p1
, const char_type
* p2
) const
// Masks for the default class names.
// NOTE(review): the assertion at the end of this function bounds idx+1 against
// this table, so it is presumably indexed with idx+1 - confirm.
369 static const char_class_type masks
[] =
372 U_GC_L_MASK
| U_GC_ND_MASK
,
375 U_GC_CC_MASK
| U_GC_CF_MASK
| U_GC_ZL_MASK
| U_GC_ZP_MASK
,
// Everything in the low 30 bits except the CC, CF, CS, CN and Z categories.
378 (0x3FFFFFFFu
) & ~(U_GC_CC_MASK
| U_GC_CF_MASK
| U_GC_CS_MASK
| U_GC_CN_MASK
| U_GC_Z_MASK
),
384 char_class_type(U_GC_Z_MASK
) | mask_space
,
385 char_class_type(U_GC_Z_MASK
) | mask_space
,
390 char_class_type(U_GC_L_MASK
| U_GC_ND_MASK
| U_GC_MN_MASK
) | mask_underscore
,
391 char_class_type(U_GC_L_MASK
| U_GC_ND_MASK
| U_GC_MN_MASK
) | mask_underscore
,
392 char_class_type(U_GC_ND_MASK
) | mask_xdigit
,
// Look the name up among the default class names ...
395 int idx
= ::boost::re_detail::get_default_class_id(p1
, p2
);
// ... and among the ICU property names.
398 char_class_type result
= lookup_icu_mask(p1
, p2
);
// Work on a copy of the name so that it can be normalised in place.
404 string_type
s(p1
, p2
);
405 string_type::size_type i
= 0;
// Lower-case the current character.
// NOTE(review): static_cast<char> narrows the u_tolower() result to char
// before it is stored back into s; s is passed to lookup_icu_mask(), which
// takes ::UChar32 pointers, so characters outside the range of char look like
// they would be truncated here - confirm.
408 s
[i
] = static_cast<char>((::u_tolower
)(s
[i
]));
// Whitespace, '-' and '_' are dropped from the name.
409 if(::u_isspace(s
[i
]) || (s
[i
] == '-') || (s
[i
] == '_'))
410 s
.erase(s
.begin()+i
, s
.begin()+i
+1);
413 s
[i
] = static_cast<char>((::u_tolower
)(s
[i
]));
// Retry both lookups on the normalised name.
// NOTE(review): &*s.begin() is evaluated on s after characters were erased;
// confirm that s cannot be empty at this point.
418 idx
= ::boost::re_detail::get_default_class_id(&*s
.begin(), &*s
.begin() + s
.size());
422 result
= lookup_icu_mask(&*s
.begin(), &*s
.begin() + s
.size());
// idx+1 must be a valid index into masks.
426 BOOST_ASSERT(std::size_t(idx
+1) < sizeof(masks
) / sizeof(masks
[0]));
// Resolves the collating element name in [p1, p2) to a string_type.
//
// A name containing no character above 0x7f is copied into a std::string and
// looked up as a character name via ICU's u_charFromName() (plain Unicode
// names first, then extended names) and via lookup_default_collate_name().
// If nothing was produced and the input is exactly one character long, that
// character itself becomes the result.
//
// p1, p2 : begin / end of the name.
430 icu_regex_traits::string_type
icu_regex_traits::lookup_collatename(const char_type
* p1
, const char_type
* p2
) const
// True when no character in [p1, p2) is greater than 0x7f.
// NOTE(review): std::bind2nd was deprecated in C++11 and removed in C++17;
// this needs replacing if the file is ever built as C++17 or later.
433 if(std::find_if(p1
, p2
, std::bind2nd(std::greater
< ::UChar32
>(), 0x7f)) == p2
)
// Copy the name into a narrow string, either through the iterator-range
// constructor or starting from p3 = p1 when that constructor is unavailable.
435 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
436 std::string
s(p1
, p2
);
439 const char_type
* p3
= p1
;
// Look the name up as a Unicode character name.
444 UErrorCode err
= U_ZERO_ERROR
;
445 UChar32 c
= ::u_charFromName(U_UNICODE_CHAR_NAME
, s
.c_str(), &err
);
451 // Try Unicode-extended name:
453 c
= ::u_charFromName(U_EXTENDED_CHAR_NAME
, s
.c_str(), &err
);
// Consult the library's default collating-name table, then copy the outcome
// into result (by range assign, or one element at a time).
460 s
= ::boost::re_detail::lookup_default_collate_name(s
);
461 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
462 result
.assign(s
.begin(), s
.end());
465 std::string::const_iterator si
, sj
;
469 result
.push_back(*si
++);
// A single character with no other match stands for itself.
472 if(result
.empty() && (p2
-p1
== 1))
473 result
.push_back(*p1
);
// Tests whether character c belongs to the character class described by f.
//
// c : the character to classify.
// f : class mask; it may carry the traits-specific mask_* bits tested below.
477 bool icu_regex_traits::isctype(char_type c
, char_class_type f
) const
479 // check for standard categories first:
// m has the single bit numbered u_charType(c) set.
480 char_class_type m
= char_class_type(1u << u_charType(c
));
483 // now check for special cases:
// Each condition below pairs one mask_* bit in f with the test it stands for.
484 if(((f
& mask_blank
) != 0) && u_isblank(c
))
486 if(((f
& mask_space
) != 0) && u_isspace(c
))
// Hexadecimal digit: u_digit(c, 16) is non-negative.
488 if(((f
& mask_xdigit
) != 0) && (u_digit(c
, 16) >= 0))
// "unicode": any character at or above 0x100.
490 if(((f
& mask_unicode
) != 0) && (c
>= 0x100))
492 if(((f
& mask_underscore
) != 0) && (c
== '_'))
// "any": every character up to 0x10FFFF.
494 if(((f
& mask_any
) != 0) && (c
<= 0x10FFFF))
// "ascii": every character up to 0x7F.
496 if(((f
& mask_ascii
) != 0) && (c
<= 0x7F))
// Vertical space: a separator as defined by re_detail::is_separator, '\v', or
// a character whose category bit is U_GC_ZL_MASK or U_GC_ZP_MASK.
498 if(((f
& mask_vertical
) != 0) && (::boost::re_detail::is_separator(c
) || (c
== static_cast<char_type
>('\v')) || (m
== U_GC_ZL_MASK
) || (m
== U_GC_ZP_MASK
)))
// Horizontal space: whitespace that is neither a separator nor '\v'.
500 if(((f
& mask_horizontal
) != 0) && !::boost::re_detail::is_separator(c
) && u_isspace(c
) && (c
!= static_cast<char_type
>('\v')))
507 #endif // BOOST_HAS_ICU