external_libraries/boost/libs/regex/src/icu.cpp

   1 /*
   2  *
   3  * Copyright (c) 2004
   4  * John Maddock
   5  *
   6  * Use, modification and distribution are subject to the
   7  * Boost Software License, Version 1.0. (See accompanying file
   8  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   9  *
  10  */
  11
  12  /*
  13   *   LOCATION:    see http://www.boost.org for most recent version.
  14   *   FILE         icu.cpp
  15   *   VERSION      see <boost/version.hpp>
  16   *   DESCRIPTION: Unicode regular expressions on top of the ICU Library.
  17   */
  18 #define BOOST_REGEX_SOURCE
  19
  20 #include <boost/regex/config.hpp>
  21 #ifdef BOOST_HAS_ICU
  22 #define BOOST_REGEX_ICU_INSTANTIATE
  23 #include <boost/regex/icu.hpp>
  24
  25 #ifdef BOOST_INTEL
  26 #pragma warning(disable:981 2259 383)
  27 #endif
  28
  29 namespace boost{
  30
  31 namespace re_detail{
  32
  33 icu_regex_traits_implementation::string_type icu_regex_traits_implementation::do_transform(const char_type* p1, const char_type* p2, const U_NAMESPACE_QUALIFIER Collator* pcoll) const
  34 {
  35    // TODO make thread safe!!!! :
  36    typedef u32_to_u16_iterator<const char_type*, ::UChar> itt;
  37    itt i(p1), j(p2);
  38 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
  39    std::vector< ::UChar> t(i, j);
  40 #else
  41    std::vector< ::UChar> t;
  42    while(i != j)
  43       t.push_back(*i++);
  44 #endif
  45    ::uint8_t result[100];
  46    ::int32_t len;
  47    if(t.size())
  48       len = pcoll->getSortKey(&*t.begin(), static_cast< ::int32_t>(t.size()), result, sizeof(result));
  49    else
  50       len = pcoll->getSortKey(static_cast<UChar const*>(0), static_cast< ::int32_t>(0), result, sizeof(result));
  51    if(std::size_t(len) > sizeof(result))
  52    {
  53       scoped_array< ::uint8_t> presult(new ::uint8_t[len+1]);
  54       if(t.size())
  55          len = pcoll->getSortKey(&*t.begin(), static_cast< ::int32_t>(t.size()), presult.get(), len+1);
  56       else
  57          len = pcoll->getSortKey(static_cast<UChar const*>(0), static_cast< ::int32_t>(0), presult.get(), len+1);
  58       if((0 == presult[len-1]) && (len > 1))
  59          --len;
  60 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
  61       return string_type(presult.get(), presult.get()+len);
  62 #else
  63       string_type sresult;
  64       ::uint8_t const* ia = presult.get();
  65       ::uint8_t const* ib = presult.get()+len;
  66       while(ia != ib)
  67          sresult.push_back(*ia++);
  68       return sresult;
  69 #endif
  70    }
  71    if((0 == result[len-1]) && (len > 1))
  72       --len;
  73 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
  74    return string_type(result, result+len);
  75 #else
  76    string_type sresult;
  77    ::uint8_t const* ia = result;
  78    ::uint8_t const* ib = result+len;
  79    while(ia != ib)
  80       sresult.push_back(*ia++);
  81    return sresult;
  82 #endif
  83 }
  84
  85 }
  86
  87 icu_regex_traits::size_type icu_regex_traits::length(const char_type* p)
  88 {
  89    size_type result = 0;
  90    while(*p)
  91    {
  92       ++p;
  93       ++result;
  94    }
  95    return result;
  96 }
  97
//
// define our bitmasks:
//
// Each mask is a single bit of char_class_type, positioned by the matching
// offset_* constant (declared elsewhere).  isctype() tests these bits
// separately from the general-category bit it derives from u_charType().
//
const icu_regex_traits::char_class_type icu_regex_traits::mask_blank = icu_regex_traits::char_class_type(1) << offset_blank;
const icu_regex_traits::char_class_type icu_regex_traits::mask_space = icu_regex_traits::char_class_type(1) << offset_space;
const icu_regex_traits::char_class_type icu_regex_traits::mask_xdigit = icu_regex_traits::char_class_type(1) << offset_xdigit;
const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_regex_traits::char_class_type(1) << offset_underscore;
const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(1) << offset_unicode;
const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(1) << offset_any;
const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(1) << offset_ascii;
const icu_regex_traits::char_class_type icu_regex_traits::mask_horizontal = icu_regex_traits::char_class_type(1) << offset_horizontal;
const icu_regex_traits::char_class_type icu_regex_traits::mask_vertical = icu_regex_traits::char_class_type(1) << offset_vertical;
 110
//
// Looks up the property name given by the character range [p1, p2) and
// returns the corresponding class mask, or 0 if the name is not in the table.
//
// The lookup is driven by three tables that must be kept in step:
//   prop_name_table - the characters of every name, concatenated with no
//                     separators or terminators.
//   range_data      - one [begin, end) pair of pointers into prop_name_table
//                     per name.
//   icu_class_map   - the mask for each name, at the same index as the
//                     name's entry in range_data.
// The name is located with std::lower_bound, so range_data has to stay
// ordered according to the operator< of character_pointer_range (defined
// elsewhere).  The comparison is made on the characters exactly as passed in;
// this function does no case folding or other normalisation of its own.
//
icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2)
{
   // All names back to back; the comment on each row is the name it spells.
   static const ::UChar32 prop_name_table[] = {
      /* any */  'a', 'n', 'y',
      /* ascii */  'a', 's', 'c', 'i', 'i',
      /* assigned */  'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
      /* c* */  'c', '*',
      /* cc */  'c', 'c',
      /* cf */  'c', 'f',
      /* closepunctuation */  'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* cn */  'c', 'n',
      /* co */  'c', 'o',
      /* connectorpunctuation */  'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* control */  'c', 'o', 'n', 't', 'r', 'o', 'l',
      /* cs */  'c', 's',
      /* currencysymbol */  'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l',
      /* dashpunctuation */  'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* decimaldigitnumber */  'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r',
      /* enclosingmark */  'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* finalpunctuation */  'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* format */  'f', 'o', 'r', 'm', 'a', 't',
      /* initialpunctuation */  'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* l* */  'l', '*',
      /* letter */  'l', 'e', 't', 't', 'e', 'r',
      /* letternumber */  'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
      /* lineseparator */  'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* ll */  'l', 'l',
      /* lm */  'l', 'm',
      /* lo */  'l', 'o',
      /* lowercaseletter */  'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* lt */  'l', 't',
      /* lu */  'l', 'u',
      /* m* */  'm', '*',
      /* mark */  'm', 'a', 'r', 'k',
      /* mathsymbol */  'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l',
      /* mc */  'm', 'c',
      /* me */  'm', 'e',
      /* mn */  'm', 'n',
      /* modifierletter */  'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
      /* modifiersymbol */  'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
      /* n* */  'n', '*',
      /* nd */  'n', 'd',
      /* nl */  'n', 'l',
      /* no */  'n', 'o',
      /* nonspacingmark */  'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* notassigned */  'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
      /* number */  'n', 'u', 'm', 'b', 'e', 'r',
      /* openpunctuation */  'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* other */  'o', 't', 'h', 'e', 'r',
      /* otherletter */  'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
      /* othernumber */  'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
      /* otherpunctuation */  'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* othersymbol */  'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
      /* p* */  'p', '*',
      /* paragraphseparator */  'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* pc */  'p', 'c',
      /* pd */  'p', 'd',
      /* pe */  'p', 'e',
      /* pf */  'p', 'f',
      /* pi */  'p', 'i',
      /* po */  'p', 'o',
      /* privateuse */  'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e',
      /* ps */  'p', 's',
      /* punctuation */  'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* s* */  's', '*',
      /* sc */  's', 'c',
      /* separator */  's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* sk */  's', 'k',
      /* sm */  's', 'm',
      /* so */  's', 'o',
      /* spaceseparator */  's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* spacingcombiningmark */  's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* surrogate */  's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e',
      /* symbol */  's', 'y', 'm', 'b', 'o', 'l',
      /* titlecase */  't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e',
      /* titlecaseletter */  't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* uppercaseletter */  'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* z* */  'z', '*',
      /* zl */  'z', 'l',
      /* zp */  'z', 'p',
      /* zs */  'z', 's',
   };

   // [begin, end) of each name inside prop_name_table; each entry begins
   // where the previous one ended.  This is the array that gets searched.
   static const re_detail::character_pointer_range< ::UChar32> range_data[] = {
      { prop_name_table+0, prop_name_table+3, }, // any
      { prop_name_table+3, prop_name_table+8, }, // ascii
      { prop_name_table+8, prop_name_table+16, }, // assigned
      { prop_name_table+16, prop_name_table+18, }, // c*
      { prop_name_table+18, prop_name_table+20, }, // cc
      { prop_name_table+20, prop_name_table+22, }, // cf
      { prop_name_table+22, prop_name_table+38, }, // closepunctuation
      { prop_name_table+38, prop_name_table+40, }, // cn
      { prop_name_table+40, prop_name_table+42, }, // co
      { prop_name_table+42, prop_name_table+62, }, // connectorpunctuation
      { prop_name_table+62, prop_name_table+69, }, // control
      { prop_name_table+69, prop_name_table+71, }, // cs
      { prop_name_table+71, prop_name_table+85, }, // currencysymbol
      { prop_name_table+85, prop_name_table+100, }, // dashpunctuation
      { prop_name_table+100, prop_name_table+118, }, // decimaldigitnumber
      { prop_name_table+118, prop_name_table+131, }, // enclosingmark
      { prop_name_table+131, prop_name_table+147, }, // finalpunctuation
      { prop_name_table+147, prop_name_table+153, }, // format
      { prop_name_table+153, prop_name_table+171, }, // initialpunctuation
      { prop_name_table+171, prop_name_table+173, }, // l*
      { prop_name_table+173, prop_name_table+179, }, // letter
      { prop_name_table+179, prop_name_table+191, }, // letternumber
      { prop_name_table+191, prop_name_table+204, }, // lineseparator
      { prop_name_table+204, prop_name_table+206, }, // ll
      { prop_name_table+206, prop_name_table+208, }, // lm
      { prop_name_table+208, prop_name_table+210, }, // lo
      { prop_name_table+210, prop_name_table+225, }, // lowercaseletter
      { prop_name_table+225, prop_name_table+227, }, // lt
      { prop_name_table+227, prop_name_table+229, }, // lu
      { prop_name_table+229, prop_name_table+231, }, // m*
      { prop_name_table+231, prop_name_table+235, }, // mark
      { prop_name_table+235, prop_name_table+245, }, // mathsymbol
      { prop_name_table+245, prop_name_table+247, }, // mc
      { prop_name_table+247, prop_name_table+249, }, // me
      { prop_name_table+249, prop_name_table+251, }, // mn
      { prop_name_table+251, prop_name_table+265, }, // modifierletter
      { prop_name_table+265, prop_name_table+279, }, // modifiersymbol
      { prop_name_table+279, prop_name_table+281, }, // n*
      { prop_name_table+281, prop_name_table+283, }, // nd
      { prop_name_table+283, prop_name_table+285, }, // nl
      { prop_name_table+285, prop_name_table+287, }, // no
      { prop_name_table+287, prop_name_table+301, }, // nonspacingmark
      { prop_name_table+301, prop_name_table+312, }, // notassigned
      { prop_name_table+312, prop_name_table+318, }, // number
      { prop_name_table+318, prop_name_table+333, }, // openpunctuation
      { prop_name_table+333, prop_name_table+338, }, // other
      { prop_name_table+338, prop_name_table+349, }, // otherletter
      { prop_name_table+349, prop_name_table+360, }, // othernumber
      { prop_name_table+360, prop_name_table+376, }, // otherpunctuation
      { prop_name_table+376, prop_name_table+387, }, // othersymbol
      { prop_name_table+387, prop_name_table+389, }, // p*
      { prop_name_table+389, prop_name_table+407, }, // paragraphseparator
      { prop_name_table+407, prop_name_table+409, }, // pc
      { prop_name_table+409, prop_name_table+411, }, // pd
      { prop_name_table+411, prop_name_table+413, }, // pe
      { prop_name_table+413, prop_name_table+415, }, // pf
      { prop_name_table+415, prop_name_table+417, }, // pi
      { prop_name_table+417, prop_name_table+419, }, // po
      { prop_name_table+419, prop_name_table+429, }, // privateuse
      { prop_name_table+429, prop_name_table+431, }, // ps
      { prop_name_table+431, prop_name_table+442, }, // punctuation
      { prop_name_table+442, prop_name_table+444, }, // s*
      { prop_name_table+444, prop_name_table+446, }, // sc
      { prop_name_table+446, prop_name_table+455, }, // separator
      { prop_name_table+455, prop_name_table+457, }, // sk
      { prop_name_table+457, prop_name_table+459, }, // sm
      { prop_name_table+459, prop_name_table+461, }, // so
      { prop_name_table+461, prop_name_table+475, }, // spaceseparator
      { prop_name_table+475, prop_name_table+495, }, // spacingcombiningmark
      { prop_name_table+495, prop_name_table+504, }, // surrogate
      { prop_name_table+504, prop_name_table+510, }, // symbol
      { prop_name_table+510, prop_name_table+519, }, // titlecase
      { prop_name_table+519, prop_name_table+534, }, // titlecaseletter
      { prop_name_table+534, prop_name_table+549, }, // uppercaseletter
      { prop_name_table+549, prop_name_table+551, }, // z*
      { prop_name_table+551, prop_name_table+553, }, // zl
      { prop_name_table+553, prop_name_table+555, }, // zp
      { prop_name_table+555, prop_name_table+557, }, // zs
   };

   // Result mask for each name, indexed identically to range_data.  Long and
   // short spellings of a name share a mask (e.g. "control" and "cc").
   static const icu_regex_traits::char_class_type icu_class_map[] = {
      icu_regex_traits::mask_any, // any
      icu_regex_traits::mask_ascii, // ascii
      (0x3FFFFFFFu) & ~(U_GC_CN_MASK), // assigned
      U_GC_C_MASK, // c*
      U_GC_CC_MASK, // cc
      U_GC_CF_MASK, // cf
      U_GC_PE_MASK, // closepunctuation
      U_GC_CN_MASK, // cn
      U_GC_CO_MASK, // co
      U_GC_PC_MASK, // connectorpunctuation
      U_GC_CC_MASK, // control
      U_GC_CS_MASK, // cs
      U_GC_SC_MASK, // currencysymbol
      U_GC_PD_MASK, // dashpunctuation
      U_GC_ND_MASK, // decimaldigitnumber
      U_GC_ME_MASK, // enclosingmark
      U_GC_PF_MASK, // finalpunctuation
      U_GC_CF_MASK, // format
      U_GC_PI_MASK, // initialpunctuation
      U_GC_L_MASK, // l*
      U_GC_L_MASK, // letter
      U_GC_NL_MASK, // letternumber
      U_GC_ZL_MASK, // lineseparator
      U_GC_LL_MASK, // ll
      U_GC_LM_MASK, // lm
      U_GC_LO_MASK, // lo
      U_GC_LL_MASK, // lowercaseletter
      U_GC_LT_MASK, // lt
      U_GC_LU_MASK, // lu
      U_GC_M_MASK, // m*
      U_GC_M_MASK, // mark
      U_GC_SM_MASK, // mathsymbol
      U_GC_MC_MASK, // mc
      U_GC_ME_MASK, // me
      U_GC_MN_MASK, // mn
      U_GC_LM_MASK, // modifierletter
      U_GC_SK_MASK, // modifiersymbol
      U_GC_N_MASK, // n*
      U_GC_ND_MASK, // nd
      U_GC_NL_MASK, // nl
      U_GC_NO_MASK, // no
      U_GC_MN_MASK, // nonspacingmark
      U_GC_CN_MASK, // notassigned
      U_GC_N_MASK, // number
      U_GC_PS_MASK, // openpunctuation
      U_GC_C_MASK, // other
      U_GC_LO_MASK, // otherletter
      U_GC_NO_MASK, // othernumber
      U_GC_PO_MASK, // otherpunctuation
      U_GC_SO_MASK, // othersymbol
      U_GC_P_MASK, // p*
      U_GC_ZP_MASK, // paragraphseparator
      U_GC_PC_MASK, // pc
      U_GC_PD_MASK, // pd
      U_GC_PE_MASK, // pe
      U_GC_PF_MASK, // pf
      U_GC_PI_MASK, // pi
      U_GC_PO_MASK, // po
      U_GC_CO_MASK, // privateuse
      U_GC_PS_MASK, // ps
      U_GC_P_MASK, // punctuation
      U_GC_S_MASK, // s*
      U_GC_SC_MASK, // sc
      U_GC_Z_MASK, // separator
      U_GC_SK_MASK, // sk
      U_GC_SM_MASK, // sm
      U_GC_SO_MASK, // so
      U_GC_ZS_MASK, // spaceseparator
      U_GC_MC_MASK, // spacingcombiningmark
      U_GC_CS_MASK, // surrogate
      U_GC_S_MASK, // symbol
      U_GC_LT_MASK, // titlecase
      U_GC_LT_MASK, // titlecaseletter
      U_GC_LU_MASK, // uppercaseletter
      U_GC_Z_MASK, // z*
      U_GC_ZL_MASK, // zl
      U_GC_ZP_MASK, // zp
      U_GC_ZS_MASK, // zs
   };


   static const re_detail::character_pointer_range< ::UChar32>* ranges_begin = range_data;
   static const re_detail::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data)/sizeof(range_data[0]));

   // Wrap the caller's range so it can be compared against the table entries:
   re_detail::character_pointer_range< ::UChar32> t = { p1, p2, };
   const re_detail::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t);
   // lower_bound only finds the insertion point, so confirm an exact match
   // before using the position as an index into icu_class_map:
   if((p != ranges_end) && (t == *p))
      return icu_class_map[p - ranges_begin];
   return 0;
}
 366
 367 icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_type* p1, const char_type* p2) const
 368 {
 369    static const char_class_type masks[] =
 370    {
 371       0,
 372       U_GC_L_MASK | U_GC_ND_MASK,
 373       U_GC_L_MASK,
 374       mask_blank,
 375       U_GC_CC_MASK | U_GC_CF_MASK | U_GC_ZL_MASK | U_GC_ZP_MASK,
 376       U_GC_ND_MASK,
 377       U_GC_ND_MASK,
 378       (0x3FFFFFFFu) & ~(U_GC_CC_MASK | U_GC_CF_MASK | U_GC_CS_MASK | U_GC_CN_MASK | U_GC_Z_MASK),
 379       mask_horizontal,
 380       U_GC_LL_MASK,
 381       U_GC_LL_MASK,
 382       ~(U_GC_C_MASK),
 383       U_GC_P_MASK,
 384       char_class_type(U_GC_Z_MASK) | mask_space,
 385       char_class_type(U_GC_Z_MASK) | mask_space,
 386       U_GC_LU_MASK,
 387       mask_unicode,
 388       U_GC_LU_MASK,
 389       mask_vertical,
 390       char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore,
 391       char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore,
 392       char_class_type(U_GC_ND_MASK) | mask_xdigit,
 393    };
 394
 395    int idx = ::boost::re_detail::get_default_class_id(p1, p2);
 396    if(idx >= 0)
 397       return masks[idx+1];
 398    char_class_type result = lookup_icu_mask(p1, p2);
 399    if(result != 0)
 400       return result;
 401
 402    if(idx < 0)
 403    {
 404       string_type s(p1, p2);
 405       string_type::size_type i = 0;
 406       while(i < s.size())
 407       {
 408          s[i] = static_cast<char>((::u_tolower)(s[i]));
 409          if(::u_isspace(s[i]) || (s[i] == '-') || (s[i] == '_'))
 410             s.erase(s.begin()+i, s.begin()+i+1);
 411          else
 412          {
 413             s[i] = static_cast<char>((::u_tolower)(s[i]));
 414             ++i;
 415          }
 416       }
 417       if(s.size())
 418          idx = ::boost::re_detail::get_default_class_id(&*s.begin(), &*s.begin() + s.size());
 419       if(idx >= 0)
 420          return masks[idx+1];
 421       if(s.size())
 422          result = lookup_icu_mask(&*s.begin(), &*s.begin() + s.size());
 423       if(result != 0)
 424          return result;
 425    }
 426    BOOST_ASSERT(std::size_t(idx+1) < sizeof(masks) / sizeof(masks[0]));
 427    return masks[idx+1];
 428 }
 429
 430 icu_regex_traits::string_type icu_regex_traits::lookup_collatename(const char_type* p1, const char_type* p2) const
 431 {
 432    string_type result;
 433    if(std::find_if(p1, p2, std::bind2nd(std::greater< ::UChar32>(), 0x7f)) == p2)
 434    {
 435 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
 436       std::string s(p1, p2);
 437 #else
 438       std::string s;
 439       const char_type* p3 = p1;
 440       while(p3 != p2)
 441          s.append(1, *p3++);
 442 #endif
 443       // Try Unicode name:
 444       UErrorCode err = U_ZERO_ERROR;
 445       UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err);
 446       if(U_SUCCESS(err))
 447       {
 448          result.push_back(c);
 449          return result;
 450       }
 451       // Try Unicode-extended name:
 452       err = U_ZERO_ERROR;
 453       c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err);
 454       if(U_SUCCESS(err))
 455       {
 456          result.push_back(c);
 457          return result;
 458       }
 459       // try POSIX name:
 460       s = ::boost::re_detail::lookup_default_collate_name(s);
 461 #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
 462       result.assign(s.begin(), s.end());
 463 #else
 464       result.clear();
 465       std::string::const_iterator si, sj;
 466       si = s.begin();
 467       sj = s.end();
 468       while(si != sj)
 469          result.push_back(*si++);
 470 #endif
 471    }
 472    if(result.empty() && (p2-p1 == 1))
 473       result.push_back(*p1);
 474    return result;
 475 }
 476
 477 bool icu_regex_traits::isctype(char_type c, char_class_type f) const
 478 {
 479    // check for standard catagories first:
 480    char_class_type m = char_class_type(1u << u_charType(c));
 481    if((m & f) != 0)
 482       return true;
 483    // now check for special cases:
 484    if(((f & mask_blank) != 0) && u_isblank(c))
 485       return true;
 486    if(((f & mask_space) != 0) && u_isspace(c))
 487       return true;
 488    if(((f & mask_xdigit) != 0) && (u_digit(c, 16) >= 0))
 489       return true;
 490    if(((f & mask_unicode) != 0) && (c >= 0x100))
 491       return true;
 492    if(((f & mask_underscore) != 0) && (c == '_'))
 493       return true;
 494    if(((f & mask_any) != 0) && (c <= 0x10FFFF))
 495       return true;
 496    if(((f & mask_ascii) != 0) && (c <= 0x7F))
 497       return true;
 498    if(((f & mask_vertical) != 0) && (::boost::re_detail::is_separator(c) || (c == static_cast<char_type>('\v')) || (m == U_GC_ZL_MASK) || (m == U_GC_ZP_MASK)))
 499       return true;
 500    if(((f & mask_horizontal) != 0) && !::boost::re_detail::is_separator(c) && u_isspace(c) && (c != static_cast<char_type>('\v')))
 501       return true;
 502    return false;
 503 }
 504
 505 }
 506
 507 #endif // BOOST_HAS_ICU