6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE basic_regex_creator.cpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares template class basic_regex_creator which fills in
17 * the data members of a regex_data object.
20 #ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
21 #define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
25 #pragma warning(disable: 4103)
27 #ifdef BOOST_HAS_ABI_HEADERS
28 # include BOOST_ABI_PREFIX
35 # pragma warning(push)
36 # pragma warning(disable: 4800)
43 template <class charT
>
44 struct digraph
: public std::pair
<charT
, charT
>
46 digraph() : std::pair
<charT
, charT
>(0, 0){}
47 digraph(charT c1
) : std::pair
<charT
, charT
>(c1
, 0){}
48 digraph(charT c1
, charT c2
) : std::pair
<charT
, charT
>(c1
, c2
)
50 #if !BOOST_WORKAROUND(BOOST_MSVC, < 1300)
51 digraph(const digraph
<charT
>& d
) : std::pair
<charT
, charT
>(d
.first
, d
.second
){}
54 digraph(const Seq
& s
) : std::pair
<charT
, charT
>()
56 BOOST_ASSERT(s
.size() <= 2);
57 BOOST_ASSERT(s
.size());
59 this->second
= (s
.size() > 1) ? s
[1] : 0;
63 template <class charT
, class traits
>
67 typedef digraph
<charT
> digraph_type
;
68 typedef typename
traits::string_type string_type
;
69 typedef typename
traits::char_class_type mask_type
;
74 m_has_digraphs
= false;
76 m_negated_classes
= 0;
80 void add_single(const digraph_type
& s
)
82 m_singles
.insert(m_singles
.end(), s
);
84 m_has_digraphs
= true;
87 void add_range(const digraph_type
& first
, const digraph_type
& end
)
89 m_ranges
.insert(m_ranges
.end(), first
);
90 m_ranges
.insert(m_ranges
.end(), end
);
93 m_has_digraphs
= true;
98 m_has_digraphs
= true;
103 void add_class(mask_type m
)
108 void add_negated_class(mask_type m
)
110 m_negated_classes
|= m
;
113 void add_equivalent(const digraph_type
& s
)
115 m_equivalents
.insert(m_equivalents
.end(), s
);
118 m_has_digraphs
= true;
130 // accessor functions:
132 bool has_digraphs()const
134 return m_has_digraphs
;
136 bool is_negated()const
140 typedef typename
std::vector
<digraph_type
>::const_iterator list_iterator
;
141 list_iterator
singles_begin()const
143 return m_singles
.begin();
145 list_iterator
singles_end()const
147 return m_singles
.end();
149 list_iterator
ranges_begin()const
151 return m_ranges
.begin();
153 list_iterator
ranges_end()const
155 return m_ranges
.end();
157 list_iterator
equivalents_begin()const
159 return m_equivalents
.begin();
161 list_iterator
equivalents_end()const
163 return m_equivalents
.end();
165 mask_type
classes()const
169 mask_type
negated_classes()const
171 return m_negated_classes
;
178 std::vector
<digraph_type
> m_singles
; // a list of single characters to match
179 std::vector
<digraph_type
> m_ranges
; // a list of end points of our ranges
180 bool m_negate
; // true if the set is to be negated
181 bool m_has_digraphs
; // true if we have digraphs present
182 mask_type m_classes
; // character classes to match
183 mask_type m_negated_classes
; // negated character classes to match
184 bool m_empty
; // whether we've added anything yet
185 std::vector
<digraph_type
> m_equivalents
; // a list of equivalence classes
188 template <class charT
, class traits
>
189 class basic_regex_creator
192 basic_regex_creator(regex_data
<charT
, traits
>* data
);
193 std::ptrdiff_t getoffset(void* addr
)
195 return getoffset(addr
, m_pdata
->m_data
.data());
197 std::ptrdiff_t getoffset(const void* addr
, const void* base
)
199 return static_cast<const char*>(addr
) - static_cast<const char*>(base
);
201 re_syntax_base
* getaddress(std::ptrdiff_t off
)
203 return getaddress(off
, m_pdata
->m_data
.data());
205 re_syntax_base
* getaddress(std::ptrdiff_t off
, void* base
)
207 return static_cast<re_syntax_base
*>(static_cast<void*>(static_cast<char*>(base
) + off
));
209 void init(unsigned l_flags
)
211 m_pdata
->m_flags
= l_flags
;
212 m_icase
= l_flags
& regex_constants::icase
;
214 regbase::flag_type
flags()
216 return m_pdata
->m_flags
;
218 void flags(regbase::flag_type f
)
220 m_pdata
->m_flags
= f
;
221 if(m_icase
!= static_cast<bool>(f
& regbase::icase
))
223 m_icase
= static_cast<bool>(f
& regbase::icase
);
226 re_syntax_base
* append_state(syntax_element_type t
, std::size_t s
= sizeof(re_syntax_base
));
227 re_syntax_base
* insert_state(std::ptrdiff_t pos
, syntax_element_type t
, std::size_t s
= sizeof(re_syntax_base
));
228 re_literal
* append_literal(charT c
);
229 re_syntax_base
* append_set(const basic_char_set
<charT
, traits
>& char_set
);
230 re_syntax_base
* append_set(const basic_char_set
<charT
, traits
>& char_set
, mpl::false_
*);
231 re_syntax_base
* append_set(const basic_char_set
<charT
, traits
>& char_set
, mpl::true_
*);
232 void finalize(const charT
* p1
, const charT
* p2
);
234 regex_data
<charT
, traits
>* m_pdata
; // pointer to the basic_regex_data struct we are filling in
235 const ::boost::regex_traits_wrapper
<traits
>&
236 m_traits
; // convenience reference to traits class
237 re_syntax_base
* m_last_state
; // the last state we added
238 bool m_icase
; // true for case insensitive matches
239 unsigned m_repeater_id
; // the state_id of the next repeater
240 bool m_has_backrefs
; // true if there are actually any backrefs
241 unsigned m_backrefs
; // bitmask of permitted backrefs
242 boost::uintmax_t m_bad_repeats
; // bitmask of repeats we can't deduce a startmap for;
243 typename
traits::char_class_type m_word_mask
; // mask used to determine if a character is a word character
244 typename
traits::char_class_type m_mask_space
; // mask used to determine if a character is a word character
245 typename
traits::char_class_type m_lower_mask
; // mask used to determine if a character is a lowercase character
246 typename
traits::char_class_type m_upper_mask
; // mask used to determine if a character is an uppercase character
247 typename
traits::char_class_type m_alpha_mask
; // mask used to determine if a character is an alphabetic character
249 basic_regex_creator
& operator=(const basic_regex_creator
&);
250 basic_regex_creator(const basic_regex_creator
&);
252 void fixup_pointers(re_syntax_base
* state
);
253 void create_startmaps(re_syntax_base
* state
);
254 int calculate_backstep(re_syntax_base
* state
);
255 void create_startmap(re_syntax_base
* state
, unsigned char* l_map
, unsigned int* pnull
, unsigned char mask
);
256 unsigned get_restart_type(re_syntax_base
* state
);
257 void set_all_masks(unsigned char* bits
, unsigned char);
258 bool is_bad_repeat(re_syntax_base
* pt
);
259 void set_bad_repeat(re_syntax_base
* pt
);
260 syntax_element_type
get_repeat_type(re_syntax_base
* state
);
261 void probe_leading_repeat(re_syntax_base
* state
);
264 template <class charT
, class traits
>
265 basic_regex_creator
<charT
, traits
>::basic_regex_creator(regex_data
<charT
, traits
>* data
)
266 : m_pdata(data
), m_traits(*(data
->m_ptraits
)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0)
268 m_pdata
->m_data
.clear();
269 m_pdata
->m_status
= ::boost::regex_constants::error_ok
;
270 static const charT w
= 'w';
271 static const charT s
= 's';
272 static const charT l
[5] = { 'l', 'o', 'w', 'e', 'r', };
273 static const charT u
[5] = { 'u', 'p', 'p', 'e', 'r', };
274 static const charT a
[5] = { 'a', 'l', 'p', 'h', 'a', };
275 m_word_mask
= m_traits
.lookup_classname(&w
, &w
+1);
276 m_mask_space
= m_traits
.lookup_classname(&s
, &s
+1);
277 m_lower_mask
= m_traits
.lookup_classname(l
, l
+ 5);
278 m_upper_mask
= m_traits
.lookup_classname(u
, u
+ 5);
279 m_alpha_mask
= m_traits
.lookup_classname(a
, a
+ 5);
280 m_pdata
->m_word_mask
= m_word_mask
;
281 BOOST_ASSERT(m_word_mask
!= 0);
282 BOOST_ASSERT(m_mask_space
!= 0);
283 BOOST_ASSERT(m_lower_mask
!= 0);
284 BOOST_ASSERT(m_upper_mask
!= 0);
285 BOOST_ASSERT(m_alpha_mask
!= 0);
288 template <class charT
, class traits
>
289 re_syntax_base
* basic_regex_creator
<charT
, traits
>::append_state(syntax_element_type t
, std::size_t s
)
291 // if the state is a backref then make a note of it:
292 if(t
== syntax_element_backref
)
293 this->m_has_backrefs
= true;
294 // append a new state, start by aligning our last one:
295 m_pdata
->m_data
.align();
296 // set the offset to the next state in our last one:
298 m_last_state
->next
.i
= m_pdata
->m_data
.size() - getoffset(m_last_state
);
299 // now actually extent our data:
300 m_last_state
= static_cast<re_syntax_base
*>(m_pdata
->m_data
.extend(s
));
301 // fill in boilerplate options in the new state:
302 m_last_state
->next
.i
= 0;
303 m_last_state
->type
= t
;
307 template <class charT
, class traits
>
308 re_syntax_base
* basic_regex_creator
<charT
, traits
>::insert_state(std::ptrdiff_t pos
, syntax_element_type t
, std::size_t s
)
310 // append a new state, start by aligning our last one:
311 m_pdata
->m_data
.align();
312 // set the offset to the next state in our last one:
314 m_last_state
->next
.i
= m_pdata
->m_data
.size() - getoffset(m_last_state
);
315 // remember the last state position:
316 std::ptrdiff_t off
= getoffset(m_last_state
) + s
;
317 // now actually insert our data:
318 re_syntax_base
* new_state
= static_cast<re_syntax_base
*>(m_pdata
->m_data
.insert(pos
, s
));
319 // fill in boilerplate options in the new state:
320 new_state
->next
.i
= s
;
322 m_last_state
= getaddress(off
);
326 template <class charT
, class traits
>
327 re_literal
* basic_regex_creator
<charT
, traits
>::append_literal(charT c
)
330 // start by seeing if we have an existing re_literal we can extend:
331 if((0 == m_last_state
) || (m_last_state
->type
!= syntax_element_literal
))
333 // no existing re_literal, create a new one:
334 result
= static_cast<re_literal
*>(append_state(syntax_element_literal
, sizeof(re_literal
) + sizeof(charT
)));
336 *static_cast<charT
*>(static_cast<void*>(result
+1)) = m_traits
.translate(c
, m_icase
);
340 // we have an existing re_literal, extend it:
341 std::ptrdiff_t off
= getoffset(m_last_state
);
342 m_pdata
->m_data
.extend(sizeof(charT
));
343 m_last_state
= result
= static_cast<re_literal
*>(getaddress(off
));
344 charT
* characters
= static_cast<charT
*>(static_cast<void*>(result
+1));
345 characters
[result
->length
] = m_traits
.translate(c
, m_icase
);
351 template <class charT
, class traits
>
352 inline re_syntax_base
* basic_regex_creator
<charT
, traits
>::append_set(
353 const basic_char_set
<charT
, traits
>& char_set
)
355 typedef mpl::bool_
< (sizeof(charT
) == 1) > truth_type
;
356 return char_set
.has_digraphs()
357 ? append_set(char_set
, static_cast<mpl::false_
*>(0))
358 : append_set(char_set
, static_cast<truth_type
*>(0));
361 template <class charT
, class traits
>
362 re_syntax_base
* basic_regex_creator
<charT
, traits
>::append_set(
363 const basic_char_set
<charT
, traits
>& char_set
, mpl::false_
*)
365 typedef typename
traits::string_type string_type
;
366 typedef typename basic_char_set
<charT
, traits
>::list_iterator item_iterator
;
367 typedef typename
traits::char_class_type mask_type
;
369 re_set_long
<mask_type
>* result
= static_cast<re_set_long
<mask_type
>*>(append_state(syntax_element_long_set
, sizeof(re_set_long
<mask_type
>)));
371 // fill in the basics:
373 result
->csingles
= static_cast<unsigned int>(::boost::re_detail::distance(char_set
.singles_begin(), char_set
.singles_end()));
374 result
->cranges
= static_cast<unsigned int>(::boost::re_detail::distance(char_set
.ranges_begin(), char_set
.ranges_end())) / 2;
375 result
->cequivalents
= static_cast<unsigned int>(::boost::re_detail::distance(char_set
.equivalents_begin(), char_set
.equivalents_end()));
376 result
->cclasses
= char_set
.classes();
377 result
->cnclasses
= char_set
.negated_classes();
378 if(flags() & regbase::icase
)
380 // adjust classes as needed:
381 if(((result
->cclasses
& m_lower_mask
) == m_lower_mask
) || ((result
->cclasses
& m_upper_mask
) == m_upper_mask
))
382 result
->cclasses
|= m_alpha_mask
;
383 if(((result
->cnclasses
& m_lower_mask
) == m_lower_mask
) || ((result
->cnclasses
& m_upper_mask
) == m_upper_mask
))
384 result
->cnclasses
|= m_alpha_mask
;
387 result
->isnot
= char_set
.is_negated();
388 result
->singleton
= !char_set
.has_digraphs();
390 // remember where the state is for later:
392 std::ptrdiff_t offset
= getoffset(result
);
394 // now extend with all the singles:
396 item_iterator first
, last
;
397 first
= char_set
.singles_begin();
398 last
= char_set
.singles_end();
401 charT
* p
= static_cast<charT
*>(this->m_pdata
->m_data
.extend(sizeof(charT
) * (first
->second
? 3 : 2)));
402 p
[0] = m_traits
.translate(first
->first
, m_icase
);
405 p
[1] = m_traits
.translate(first
->second
, m_icase
);
413 // now extend with all the ranges:
415 first
= char_set
.ranges_begin();
416 last
= char_set
.ranges_end();
419 // first grab the endpoints of the range:
420 digraph
<charT
> c1
= *first
;
421 c1
.first
= this->m_traits
.translate(c1
.first
, this->m_icase
);
422 c1
.second
= this->m_traits
.translate(c1
.second
, this->m_icase
);
424 digraph
<charT
> c2
= *first
;
425 c2
.first
= this->m_traits
.translate(c2
.first
, this->m_icase
);
426 c2
.second
= this->m_traits
.translate(c2
.second
, this->m_icase
);
429 // different actions now depending upon whether collation is turned on:
430 if(flags() & regex_constants::collate
)
432 // we need to transform our range into sort keys:
433 #if BOOST_WORKAROUND(__GNUC__, < 3)
434 string_type
in(3, charT(0));
437 s1
= this->m_traits
.transform(in
.c_str(), (in
[1] ? in
.c_str()+2 : in
.c_str()+1));
440 s2
= this->m_traits
.transform(in
.c_str(), (in
[1] ? in
.c_str()+2 : in
.c_str()+1));
442 charT a1
[3] = { c1
.first
, c1
.second
, charT(0), };
443 charT a2
[3] = { c2
.first
, c2
.second
, charT(0), };
444 s1
= this->m_traits
.transform(a1
, (a1
[1] ? a1
+2 : a1
+1));
445 s2
= this->m_traits
.transform(a2
, (a2
[1] ? a2
+2 : a2
+1));
448 s1
= string_type(1, charT(0));
450 s2
= string_type(1, charT(0));
456 s1
.insert(s1
.end(), c1
.first
);
457 s1
.insert(s1
.end(), c1
.second
);
460 s1
= string_type(1, c1
.first
);
463 s2
.insert(s2
.end(), c2
.first
);
464 s2
.insert(s2
.end(), c2
.second
);
467 s2
.insert(s2
.end(), c2
.first
);
474 charT
* p
= static_cast<charT
*>(this->m_pdata
->m_data
.extend(sizeof(charT
) * (s1
.size() + s2
.size() + 2) ) );
475 re_detail::copy(s1
.begin(), s1
.end(), p
);
476 p
[s1
.size()] = charT(0);
478 re_detail::copy(s2
.begin(), s2
.end(), p
);
479 p
[s2
.size()] = charT(0);
482 // now process the equivalence classes:
484 first
= char_set
.equivalents_begin();
485 last
= char_set
.equivalents_end();
491 #if BOOST_WORKAROUND(__GNUC__, < 3)
492 string_type
in(3, charT(0));
493 in
[0] = first
->first
;
494 in
[1] = first
->second
;
495 s
= m_traits
.transform_primary(in
.c_str(), in
.c_str()+2);
497 charT cs
[3] = { first
->first
, first
->second
, charT(0), };
498 s
= m_traits
.transform_primary(cs
, cs
+2);
502 s
= m_traits
.transform_primary(&first
->first
, &first
->first
+1);
504 return 0; // invalid or unsupported equivalence class
505 charT
* p
= static_cast<charT
*>(this->m_pdata
->m_data
.extend(sizeof(charT
) * (s
.size()+1) ) );
506 re_detail::copy(s
.begin(), s
.end(), p
);
507 p
[s
.size()] = charT(0);
511 // finally reset the address of our last state:
513 m_last_state
= result
= static_cast<re_set_long
<mask_type
>*>(getaddress(offset
));
520 inline bool char_less(T t1
, T t2
)
525 inline bool char_less
<char>(char t1
, char t2
)
527 return static_cast<unsigned char>(t1
) < static_cast<unsigned char>(t2
);
530 inline bool char_less
<signed char>(signed char t1
, signed char t2
)
532 return static_cast<unsigned char>(t1
) < static_cast<unsigned char>(t2
);
536 template <class charT
, class traits
>
537 re_syntax_base
* basic_regex_creator
<charT
, traits
>::append_set(
538 const basic_char_set
<charT
, traits
>& char_set
, mpl::true_
*)
540 typedef typename
traits::string_type string_type
;
541 typedef typename basic_char_set
<charT
, traits
>::list_iterator item_iterator
;
543 re_set
* result
= static_cast<re_set
*>(append_state(syntax_element_set
, sizeof(re_set
)));
544 bool negate
= char_set
.is_negated();
545 std::memset(result
->_map
, 0, sizeof(result
->_map
));
547 // handle singles first:
549 item_iterator first
, last
;
550 first
= char_set
.singles_begin();
551 last
= char_set
.singles_end();
554 for(unsigned int i
= 0; i
< (1 << CHAR_BIT
); ++i
)
556 if(this->m_traits
.translate(static_cast<charT
>(i
), this->m_icase
)
557 == this->m_traits
.translate(first
->first
, this->m_icase
))
558 result
->_map
[i
] = true;
563 // OK now handle ranges:
565 first
= char_set
.ranges_begin();
566 last
= char_set
.ranges_end();
569 // first grab the endpoints of the range:
570 charT c1
= this->m_traits
.translate(first
->first
, this->m_icase
);
572 charT c2
= this->m_traits
.translate(first
->first
, this->m_icase
);
574 // different actions now depending upon whether collation is turned on:
575 if(flags() & regex_constants::collate
)
577 // we need to transform our range into sort keys:
578 charT c3
[2] = { c1
, charT(0), };
579 string_type s1
= this->m_traits
.transform(c3
, c3
+1);
581 string_type s2
= this->m_traits
.transform(c3
, c3
+1);
587 BOOST_ASSERT(c3
[1] == charT(0));
588 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
590 c3
[0] = static_cast<charT
>(i
);
591 string_type s3
= this->m_traits
.transform(c3
, c3
+1);
592 if((s1
<= s3
) && (s3
<= s2
))
593 result
->_map
[i
] = true;
598 if(char_less
<charT
>(c2
, c1
))
603 // everything in range matches:
604 std::memset(result
->_map
+ static_cast<unsigned char>(c1
), true, 1 + static_cast<unsigned char>(c2
) - static_cast<unsigned char>(c1
));
608 // and now the classes:
610 typedef typename
traits::char_class_type mask_type
;
611 mask_type m
= char_set
.classes();
612 if(flags() & regbase::icase
)
614 // adjust m as needed:
615 if(((m
& m_lower_mask
) == m_lower_mask
) || ((m
& m_upper_mask
) == m_upper_mask
))
620 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
622 if(this->m_traits
.isctype(static_cast<charT
>(i
), m
))
623 result
->_map
[i
] = true;
627 // and now the negated classes:
629 m
= char_set
.negated_classes();
630 if(flags() & regbase::icase
)
632 // adjust m as needed:
633 if(((m
& m_lower_mask
) == m_lower_mask
) || ((m
& m_upper_mask
) == m_upper_mask
))
638 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
640 if(0 == this->m_traits
.isctype(static_cast<charT
>(i
), m
))
641 result
->_map
[i
] = true;
645 // now process the equivalence classes:
647 first
= char_set
.equivalents_begin();
648 last
= char_set
.equivalents_end();
652 BOOST_ASSERT(static_cast<charT
>(0) == first
->second
);
653 s
= m_traits
.transform_primary(&first
->first
, &first
->first
+1);
655 return 0; // invalid or unsupported equivalence class
656 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
658 charT c
[2] = { (static_cast<charT
>(i
)), charT(0), };
659 string_type s2
= this->m_traits
.transform_primary(c
, c
+1);
661 result
->_map
[i
] = true;
667 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
669 result
->_map
[i
] = !(result
->_map
[i
]);
675 template <class charT
, class traits
>
676 void basic_regex_creator
<charT
, traits
>::finalize(const charT
* p1
, const charT
* p2
)
678 // we've added all the states we need, now finish things off.
679 // start by adding a terminating state:
680 append_state(syntax_element_match
);
681 // extend storage to store original expression:
682 std::ptrdiff_t len
= p2
- p1
;
683 m_pdata
->m_expression_len
= len
;
684 charT
* ps
= static_cast<charT
*>(m_pdata
->m_data
.extend(sizeof(charT
) * (1 + (p2
- p1
))));
685 m_pdata
->m_expression
= ps
;
686 re_detail::copy(p1
, p2
, ps
);
688 // fill in our other data...
689 // successful parsing implies a zero status:
690 m_pdata
->m_status
= 0;
691 // get the first state of the machine:
692 m_pdata
->m_first_state
= static_cast<re_syntax_base
*>(m_pdata
->m_data
.data());
693 // fixup pointers in the machine:
694 fixup_pointers(m_pdata
->m_first_state
);
695 // create nested startmaps:
696 create_startmaps(m_pdata
->m_first_state
);
697 // create main startmap:
698 std::memset(m_pdata
->m_startmap
, 0, sizeof(m_pdata
->m_startmap
));
699 m_pdata
->m_can_be_null
= 0;
702 create_startmap(m_pdata
->m_first_state
, m_pdata
->m_startmap
, &(m_pdata
->m_can_be_null
), mask_all
);
703 // get the restart type:
704 m_pdata
->m_restart_type
= get_restart_type(m_pdata
->m_first_state
);
705 // optimise a leading repeat if there is one:
706 probe_leading_repeat(m_pdata
->m_first_state
);
709 template <class charT
, class traits
>
710 void basic_regex_creator
<charT
, traits
>::fixup_pointers(re_syntax_base
* state
)
716 case syntax_element_rep
:
717 case syntax_element_dot_rep
:
718 case syntax_element_char_rep
:
719 case syntax_element_short_set_rep
:
720 case syntax_element_long_set_rep
:
721 // set the state_id of this repeat:
722 static_cast<re_repeat
*>(state
)->state_id
= m_repeater_id
++;
724 case syntax_element_alt
:
725 std::memset(static_cast<re_alt
*>(state
)->_map
, 0, sizeof(static_cast<re_alt
*>(state
)->_map
));
726 static_cast<re_alt
*>(state
)->can_be_null
= 0;
728 case syntax_element_jump
:
729 static_cast<re_jump
*>(state
)->alt
.p
= getaddress(static_cast<re_jump
*>(state
)->alt
.i
, state
);
730 // fall through again:
733 state
->next
.p
= getaddress(state
->next
.i
, state
);
737 state
= state
->next
.p
;
741 template <class charT
, class traits
>
742 void basic_regex_creator
<charT
, traits
>::create_startmaps(re_syntax_base
* state
)
744 // non-recursive implementation:
745 // create the last map in the machine first, so that earlier maps
746 // can make use of the result...
748 // This was originally a recursive implementation, but that caused stack
749 // overflows with complex expressions on small stacks (think COM+).
751 // start by saving the case setting:
752 bool l_icase
= m_icase
;
753 std::vector
<std::pair
<bool, re_syntax_base
*> > v
;
759 case syntax_element_toggle_case
:
760 // we need to track case changes here:
761 m_icase
= static_cast<re_case
*>(state
)->icase
;
762 state
= state
->next
.p
;
764 case syntax_element_alt
:
765 case syntax_element_rep
:
766 case syntax_element_dot_rep
:
767 case syntax_element_char_rep
:
768 case syntax_element_short_set_rep
:
769 case syntax_element_long_set_rep
:
770 // just push the state onto our stack for now:
771 v
.push_back(std::pair
<bool, re_syntax_base
*>(m_icase
, state
));
772 state
= state
->next
.p
;
774 case syntax_element_backstep
:
775 // we need to calculate how big the backstep is:
776 static_cast<re_brace
*>(state
)->index
777 = this->calculate_backstep(state
->next
.p
);
778 if(static_cast<re_brace
*>(state
)->index
< 0)
781 if(0 == this->m_pdata
->m_status
) // update the error code if not already set
782 this->m_pdata
->m_status
= boost::regex_constants::error_bad_pattern
;
784 // clear the expression, we should be empty:
786 this->m_pdata
->m_expression
= 0;
787 this->m_pdata
->m_expression_len
= 0;
789 // and throw if required:
791 if(0 == (this->flags() & regex_constants::no_except
))
793 std::string message
= this->m_pdata
->m_ptraits
->error_string(boost::regex_constants::error_bad_pattern
);
794 boost::regex_error
e(message
, boost::regex_constants::error_bad_pattern
, 0);
800 state
= state
->next
.p
;
803 // now work through our list, building all the maps as we go:
806 const std::pair
<bool, re_syntax_base
*>& p
= v
.back();
813 create_startmap(state
->next
.p
, static_cast<re_alt
*>(state
)->_map
, &static_cast<re_alt
*>(state
)->can_be_null
, mask_take
);
815 create_startmap(static_cast<re_alt
*>(state
)->alt
.p
, static_cast<re_alt
*>(state
)->_map
, &static_cast<re_alt
*>(state
)->can_be_null
, mask_skip
);
816 // adjust the type of the state to allow for faster matching:
817 state
->type
= this->get_repeat_type(state
);
819 // restore case sensitivity:
823 template <class charT
, class traits
>
824 int basic_regex_creator
<charT
, traits
>::calculate_backstep(re_syntax_base
* state
)
826 typedef typename
traits::char_class_type mask_type
;
832 case syntax_element_startmark
:
833 if((static_cast<re_brace
*>(state
)->index
== -1)
834 || (static_cast<re_brace
*>(state
)->index
== -2))
836 state
= static_cast<re_jump
*>(state
->next
.p
)->alt
.p
->next
.p
;
839 else if(static_cast<re_brace
*>(state
)->index
== -3)
841 state
= state
->next
.p
->next
.p
;
845 case syntax_element_endmark
:
846 if((static_cast<re_brace
*>(state
)->index
== -1)
847 || (static_cast<re_brace
*>(state
)->index
== -2))
850 case syntax_element_literal
:
851 result
+= static_cast<re_literal
*>(state
)->length
;
853 case syntax_element_wild
:
854 case syntax_element_set
:
857 case syntax_element_dot_rep
:
858 case syntax_element_char_rep
:
859 case syntax_element_short_set_rep
:
860 case syntax_element_backref
:
861 case syntax_element_rep
:
862 case syntax_element_combining
:
863 case syntax_element_long_set_rep
:
864 case syntax_element_backstep
:
866 re_repeat
* rep
= static_cast<re_repeat
*>(state
);
867 // adjust the type of the state to allow for faster matching:
868 state
->type
= this->get_repeat_type(state
);
869 if((state
->type
== syntax_element_dot_rep
)
870 || (state
->type
== syntax_element_char_rep
)
871 || (state
->type
== syntax_element_short_set_rep
))
873 if(rep
->max
!= rep
->min
)
875 result
+= static_cast<int>(rep
->min
);
879 else if((state
->type
== syntax_element_long_set_rep
))
881 BOOST_ASSERT(rep
->next
.p
->type
== syntax_element_long_set
);
882 if(static_cast<re_set_long
<mask_type
>*>(rep
->next
.p
)->singleton
== 0)
884 if(rep
->max
!= rep
->min
)
886 result
+= static_cast<int>(rep
->min
);
892 case syntax_element_long_set
:
893 if(static_cast<re_set_long
<mask_type
>*>(state
)->singleton
== 0)
897 case syntax_element_jump
:
898 state
= static_cast<re_jump
*>(state
)->alt
.p
;
903 state
= state
->next
.p
;
908 template <class charT
, class traits
>
909 void basic_regex_creator
<charT
, traits
>::create_startmap(re_syntax_base
* state
, unsigned char* l_map
, unsigned int* pnull
, unsigned char mask
)
911 int not_last_jump
= 1;
913 // track case sensitivity:
914 bool l_icase
= m_icase
;
920 case syntax_element_toggle_case
:
921 l_icase
= static_cast<re_case
*>(state
)->icase
;
922 state
= state
->next
.p
;
924 case syntax_element_literal
:
926 // don't set anything in *pnull, set each element in l_map
927 // that could match the first character in the literal:
930 l_map
[0] |= mask_init
;
931 charT first_char
= *static_cast<charT
*>(static_cast<void*>(static_cast<re_literal
*>(state
) + 1));
932 for(unsigned int i
= 0; i
< (1u << CHAR_BIT
); ++i
)
934 if(m_traits
.translate(static_cast<charT
>(i
), l_icase
) == first_char
)
940 case syntax_element_end_line
:
942 // next character must be a line separator (if there is one):
945 l_map
[0] |= mask_init
;
951 // now figure out if we can match a NULL string at this point:
953 create_startmap(state
->next
.p
, 0, pnull
, mask
);
956 case syntax_element_backref
:
957 // can be null, and any character can match:
961 case syntax_element_wild
:
963 // can't be null, any character can match:
964 set_all_masks(l_map
, mask
);
967 case syntax_element_match
:
969 // must be null, any character can match:
970 set_all_masks(l_map
, mask
);
975 case syntax_element_word_start
:
977 // recurse, then AND with all the word characters:
978 create_startmap(state
->next
.p
, l_map
, pnull
, mask
);
981 l_map
[0] |= mask_init
;
982 for(unsigned int i
= 0; i
< (1u << CHAR_BIT
); ++i
)
984 if(!m_traits
.isctype(static_cast<charT
>(i
), m_word_mask
))
985 l_map
[i
] &= static_cast<unsigned char>(~mask
);
990 case syntax_element_word_end
:
992 // recurse, then AND with all the word characters:
993 create_startmap(state
->next
.p
, l_map
, pnull
, mask
);
996 l_map
[0] |= mask_init
;
997 for(unsigned int i
= 0; i
< (1u << CHAR_BIT
); ++i
)
999 if(m_traits
.isctype(static_cast<charT
>(i
), m_word_mask
))
1000 l_map
[i
] &= static_cast<unsigned char>(~mask
);
1005 case syntax_element_buffer_end
:
1007 // we *must be null* :
1012 case syntax_element_long_set
:
1015 typedef typename
traits::char_class_type mask_type
;
1016 if(static_cast<re_set_long
<mask_type
>*>(state
)->singleton
)
1018 l_map
[0] |= mask_init
;
1019 for(unsigned int i
= 0; i
< (1u << CHAR_BIT
); ++i
)
1021 charT c
= static_cast<charT
>(i
);
1022 if(&c
!= re_is_set_member(&c
, &c
+ 1, static_cast<re_set_long
<mask_type
>*>(state
), *m_pdata
, m_icase
))
1027 set_all_masks(l_map
, mask
);
1030 case syntax_element_set
:
1033 l_map
[0] |= mask_init
;
1034 for(unsigned int i
= 0; i
< (1u << CHAR_BIT
); ++i
)
1036 if(static_cast<re_set
*>(state
)->_map
[
1037 static_cast<unsigned char>(m_traits
.translate(static_cast<charT
>(i
), l_icase
))])
1042 case syntax_element_jump
:
1044 state
= static_cast<re_alt
*>(state
)->alt
.p
;
1047 case syntax_element_alt
:
1048 case syntax_element_rep
:
1049 case syntax_element_dot_rep
:
1050 case syntax_element_char_rep
:
1051 case syntax_element_short_set_rep
:
1052 case syntax_element_long_set_rep
:
1054 re_alt
* rep
= static_cast<re_alt
*>(state
);
1055 if(rep
->_map
[0] & mask_init
)
1059 // copy previous results:
1060 l_map
[0] |= mask_init
;
1061 for(unsigned int i
= 0; i
<= UCHAR_MAX
; ++i
)
1063 if(rep
->_map
[i
] & mask_any
)
1069 if(rep
->can_be_null
& mask_any
)
1075 // we haven't created a startmap for this alternative yet
1076 // so take the union of the two options:
1077 if(is_bad_repeat(state
))
1079 set_all_masks(l_map
, mask
);
1084 set_bad_repeat(state
);
1085 create_startmap(state
->next
.p
, l_map
, pnull
, mask
);
1086 if((state
->type
== syntax_element_alt
)
1087 || (static_cast<re_repeat
*>(state
)->min
== 0)
1088 || (not_last_jump
== 0))
1089 create_startmap(rep
->alt
.p
, l_map
, pnull
, mask
);
1093 case syntax_element_soft_buffer_end
:
1094 // match newline or null:
1097 l_map
[0] |= mask_init
;
1098 l_map
['\n'] |= mask
;
1099 l_map
['\r'] |= mask
;
1104 case syntax_element_endmark
:
1105 // need to handle independent subs as a special case:
1106 if(static_cast<re_brace
*>(state
)->index
< 0)
1108 // can be null, any character can match:
1109 set_all_masks(l_map
, mask
);
1116 state
= state
->next
.p
;
1120 case syntax_element_startmark
:
1121 // need to handle independent subs as a special case:
1122 if(static_cast<re_brace
*>(state
)->index
== -3)
1124 state
= state
->next
.p
->next
.p
;
1127 // otherwise fall through:
1129 state
= state
->next
.p
;
1135 template <class charT
, class traits
>
1136 unsigned basic_regex_creator
<charT
, traits
>::get_restart_type(re_syntax_base
* state
)
1139 // find out how the machine starts, so we can optimise the search:
1145 case syntax_element_startmark
:
1146 case syntax_element_endmark
:
1147 state
= state
->next
.p
;
1149 case syntax_element_start_line
:
1150 return regbase::restart_line
;
1151 case syntax_element_word_start
:
1152 return regbase::restart_word
;
1153 case syntax_element_buffer_start
:
1154 return regbase::restart_buf
;
1155 case syntax_element_restart_continue
:
1156 return regbase::restart_continue
;
1162 return regbase::restart_any
;
1165 template <class charT
, class traits
>
1166 void basic_regex_creator
<charT
, traits
>::set_all_masks(unsigned char* bits
, unsigned char mask
)
1169 // set mask in all of bits elements,
1170 // if bits[0] has mask_init not set then we can
1171 // optimise this to a call to memset:
1176 (std::memset
)(bits
, mask
, 1u << CHAR_BIT
);
1179 for(unsigned i
= 0; i
< (1u << CHAR_BIT
); ++i
)
1182 bits
[0] |= mask_init
;
1186 template <class charT
, class traits
>
1187 bool basic_regex_creator
<charT
, traits
>::is_bad_repeat(re_syntax_base
* pt
)
1191 case syntax_element_rep
:
1192 case syntax_element_dot_rep
:
1193 case syntax_element_char_rep
:
1194 case syntax_element_short_set_rep
:
1195 case syntax_element_long_set_rep
:
1197 unsigned state_id
= static_cast<re_repeat
*>(pt
)->state_id
;
1198 if(state_id
> sizeof(m_bad_repeats
) * CHAR_BIT
)
1199 return true; // run out of bits, assume we can't traverse this one.
1200 static const boost::uintmax_t one
= 1uL;
1201 return m_bad_repeats
& (one
<< state_id
);
1208 template <class charT
, class traits
>
1209 void basic_regex_creator
<charT
, traits
>::set_bad_repeat(re_syntax_base
* pt
)
1213 case syntax_element_rep
:
1214 case syntax_element_dot_rep
:
1215 case syntax_element_char_rep
:
1216 case syntax_element_short_set_rep
:
1217 case syntax_element_long_set_rep
:
1219 unsigned state_id
= static_cast<re_repeat
*>(pt
)->state_id
;
1220 static const boost::uintmax_t one
= 1uL;
1221 if(state_id
<= sizeof(m_bad_repeats
) * CHAR_BIT
)
1222 m_bad_repeats
|= (one
<< state_id
);
1229 template <class charT
, class traits
>
1230 syntax_element_type basic_regex_creator
<charT
, traits
>::get_repeat_type(re_syntax_base
* state
)
1232 typedef typename
traits::char_class_type mask_type
;
1233 if(state
->type
== syntax_element_rep
)
1235 // check to see if we are repeating a single state:
1236 if(state
->next
.p
->next
.p
->next
.p
== static_cast<re_alt
*>(state
)->alt
.p
)
1238 switch(state
->next
.p
->type
)
1240 case re_detail::syntax_element_wild
:
1241 return re_detail::syntax_element_dot_rep
;
1242 case re_detail::syntax_element_literal
:
1243 return re_detail::syntax_element_char_rep
;
1244 case re_detail::syntax_element_set
:
1245 return re_detail::syntax_element_short_set_rep
;
1246 case re_detail::syntax_element_long_set
:
1247 if(static_cast<re_detail::re_set_long
<mask_type
>*>(state
->next
.p
)->singleton
)
1248 return re_detail::syntax_element_long_set_rep
;
1258 template <class charT
, class traits
>
1259 void basic_regex_creator
<charT
, traits
>::probe_leading_repeat(re_syntax_base
* state
)
1261 // enumerate our states, and see if we have a leading repeat
1262 // for which failed search restarts can be optimised;
1267 case syntax_element_startmark
:
1268 if(static_cast<re_brace
*>(state
)->index
>= 0)
1270 state
= state
->next
.p
;
1273 if((static_cast<re_brace
*>(state
)->index
== -1)
1274 || (static_cast<re_brace
*>(state
)->index
== -2))
1276 // skip past the zero width assertion:
1277 state
= static_cast<const re_jump
*>(state
->next
.p
)->alt
.p
->next
.p
;
1280 if(static_cast<re_brace
*>(state
)->index
== -3)
1282 // Have to skip the leading jump state:
1283 state
= state
->next
.p
->next
.p
;
1287 case syntax_element_endmark
:
1288 case syntax_element_start_line
:
1289 case syntax_element_end_line
:
1290 case syntax_element_word_boundary
:
1291 case syntax_element_within_word
:
1292 case syntax_element_word_start
:
1293 case syntax_element_word_end
:
1294 case syntax_element_buffer_start
:
1295 case syntax_element_buffer_end
:
1296 case syntax_element_restart_continue
:
1297 state
= state
->next
.p
;
1299 case syntax_element_dot_rep
:
1300 case syntax_element_char_rep
:
1301 case syntax_element_short_set_rep
:
1302 case syntax_element_long_set_rep
:
1303 if(this->m_has_backrefs
== 0)
1304 static_cast<re_repeat
*>(state
)->leading
= true;
1313 } // namespace re_detail
1315 } // namespace boost
1318 # pragma warning(pop)
1322 #pragma warning(push)
1323 #pragma warning(disable: 4103)
1325 #ifdef BOOST_HAS_ABI_HEADERS
1326 # include BOOST_ABI_SUFFIX
1329 #pragma warning(pop)