Line data Source code
1 : /*
2 : *
3 : * Copyright (c) 2002
4 : * John Maddock
5 : *
6 : * Use, modification and distribution are subject to the
7 : * Boost Software License, Version 1.0. (See accompanying file
8 : * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 : *
10 : */
11 :
12 : #ifndef BOOST_REGEX_MATCHER_HPP
13 : #define BOOST_REGEX_MATCHER_HPP
14 :
15 : #include <boost/regex/v4/iterator_category.hpp>
16 :
17 : #ifdef BOOST_MSVC
18 : #pragma warning(push)
19 : #pragma warning(disable: 4103)
20 : #endif
21 : #ifdef BOOST_HAS_ABI_HEADERS
22 : # include BOOST_ABI_PREFIX
23 : #endif
24 : #ifdef BOOST_MSVC
25 : #pragma warning(pop)
26 : #endif
27 :
28 : #ifdef BOOST_MSVC
29 : # pragma warning(push)
30 : #if BOOST_MSVC < 1910
31 : #pragma warning(disable:4800)
32 : #endif
33 : #endif
34 :
35 : namespace boost{
36 : namespace BOOST_REGEX_DETAIL_NS{
37 :
38 : //
39 : // error checking API:
40 : //
41 : BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
42 : //
43 : // function can_start:
44 : //
45 : template <class charT>
46 : inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
47 : {
48 : return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask));
49 : }
50 0 : inline bool can_start(char c, const unsigned char* map, unsigned char mask)
51 : {
52 0 : return map[(unsigned char)c] & mask;
53 : }
54 : inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
55 : {
56 : return map[(unsigned char)c] & mask;
57 : }
58 : inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
59 : {
60 : return map[c] & mask;
61 : }
62 : inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
63 : {
64 : return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask);
65 : }
66 : #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
67 : #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
68 : inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
69 : {
70 : return ((c >= static_cast<wchar_t>(1u << CHAR_BIT)) ? true : map[c] & mask);
71 : }
72 : #endif
73 : #endif
74 : #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
75 : inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
76 : {
77 : return (((c >= static_cast<unsigned int>(1u << CHAR_BIT)) ? true : map[c] & mask));
78 : }
79 : #endif
80 :
81 :
82 : //
83 : // Unfortunately Rogue Waves standard library appears to have a bug
84 : // in std::basic_string::compare that results in eroneous answers
85 : // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
86 : // 0x020101) the test case was:
87 : // {39135,0} < {0xff,0}
88 : // which succeeds when it should not.
89 : //
90 : #ifndef _RWSTD_VER
91 : template <class C, class T, class A>
92 0 : inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
93 : {
94 0 : if(0 == *p)
95 : {
96 0 : if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
97 : return 0;
98 : }
99 0 : return s.compare(p);
100 : }
101 : #else
102 : template <class C, class T, class A>
103 : inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
104 : {
105 : if(0 == *p)
106 : {
107 : if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
108 : return 0;
109 : }
110 : return s.compare(p);
111 : }
112 : inline int string_compare(const std::string& s, const char* p)
113 : { return std::strcmp(s.c_str(), p); }
114 : # ifndef BOOST_NO_WREGEX
115 : inline int string_compare(const std::wstring& s, const wchar_t* p)
116 : { return std::wcscmp(s.c_str(), p); }
117 : #endif
118 : #endif
119 : template <class Seq, class C>
120 : inline int string_compare(const Seq& s, const C* p)
121 : {
122 : std::size_t i = 0;
123 : while((i < s.size()) && (p[i] == s[i]))
124 : {
125 : ++i;
126 : }
127 : return (i == s.size()) ? -(int)p[i] : (int)s[i] - (int)p[i];
128 : }
129 : # define STR_COMP(s,p) string_compare(s,p)
130 :
131 : template<class charT>
132 : inline const charT* re_skip_past_null(const charT* p)
133 : {
134 0 : while (*p != static_cast<charT>(0)) ++p;
135 0 : return ++p;
136 : }
137 :
138 : template <class iterator, class charT, class traits_type, class char_classT>
139 0 : iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
140 : iterator last,
141 : const re_set_long<char_classT>* set_,
142 : const regex_data<charT, traits_type>& e, bool icase)
143 : {
144 0 : const charT* p = reinterpret_cast<const charT*>(set_+1);
145 0 : iterator ptr;
146 : unsigned int i;
147 : //bool icase = e.m_flags & regex_constants::icase;
148 :
149 0 : if(next == last) return next;
150 :
151 : typedef typename traits_type::string_type traits_string_type;
152 0 : const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
153 :
154 : // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
155 : // referenced
156 : (void)traits_inst;
157 :
158 : // try and match a single character, could be a multi-character
159 : // collating element...
160 0 : for(i = 0; i < set_->csingles; ++i)
161 : {
162 0 : ptr = next;
163 0 : if(*p == static_cast<charT>(0))
164 : {
165 : // treat null string as special case:
166 0 : if(traits_inst.translate(*ptr, icase))
167 : {
168 0 : ++p;
169 0 : continue;
170 : }
171 0 : return set_->isnot ? next : (ptr == next) ? ++next : ptr;
172 : }
173 : else
174 : {
175 0 : while(*p && (ptr != last))
176 : {
177 0 : if(traits_inst.translate(*ptr, icase) != *p)
178 : break;
179 0 : ++p;
180 0 : ++ptr;
181 : }
182 :
183 0 : if(*p == static_cast<charT>(0)) // if null we've matched
184 0 : return set_->isnot ? next : (ptr == next) ? ++next : ptr;
185 :
186 0 : p = re_skip_past_null(p); // skip null
187 : }
188 : }
189 :
190 0 : charT col = traits_inst.translate(*next, icase);
191 :
192 :
193 0 : if(set_->cranges || set_->cequivalents)
194 : {
195 0 : traits_string_type s1;
196 : //
197 : // try and match a range, NB only a single character can match
198 0 : if(set_->cranges)
199 : {
200 0 : if((e.m_flags & regex_constants::collate) == 0)
201 0 : s1.assign(1, col);
202 : else
203 : {
204 0 : charT a[2] = { col, charT(0), };
205 0 : s1 = traits_inst.transform(a, a + 1);
206 : }
207 0 : for(i = 0; i < set_->cranges; ++i)
208 : {
209 0 : if(STR_COMP(s1, p) >= 0)
210 : {
211 0 : do{ ++p; }while(*p);
212 0 : ++p;
213 0 : if(STR_COMP(s1, p) <= 0)
214 0 : return set_->isnot ? next : ++next;
215 : }
216 : else
217 : {
218 : // skip first string
219 0 : do{ ++p; }while(*p);
220 0 : ++p;
221 : }
222 : // skip second string
223 0 : do{ ++p; }while(*p);
224 0 : ++p;
225 : }
226 : }
227 : //
228 : // try and match an equivalence class, NB only a single character can match
229 0 : if(set_->cequivalents)
230 : {
231 0 : charT a[2] = { col, charT(0), };
232 0 : s1 = traits_inst.transform_primary(a, a +1);
233 0 : for(i = 0; i < set_->cequivalents; ++i)
234 : {
235 0 : if(STR_COMP(s1, p) == 0)
236 0 : return set_->isnot ? next : ++next;
237 : // skip string
238 0 : do{ ++p; }while(*p);
239 0 : ++p;
240 : }
241 : }
242 : }
243 0 : if(traits_inst.isctype(col, set_->cclasses) == true)
244 0 : return set_->isnot ? next : ++next;
245 0 : if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
246 0 : return set_->isnot ? next : ++next;
247 0 : return set_->isnot ? ++next : next;
248 : }
249 :
250 : template <class BidiIterator>
251 : class repeater_count
252 : {
253 : repeater_count** stack;
254 : repeater_count* next;
255 : int state_id;
256 : std::size_t count; // the number of iterations so far
257 : BidiIterator start_pos; // where the last repeat started
258 :
259 0 : repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
260 : {
261 0 : while(p && (p->state_id != n))
262 : {
263 0 : if(-2 - current_recursion_id == p->state_id)
264 : return 0;
265 0 : p = p->next;
266 0 : if(p && (p->state_id < 0))
267 : {
268 0 : p = unwind_until(p->state_id, p, current_recursion_id);
269 0 : if(!p)
270 : return p;
271 0 : p = p->next;
272 : }
273 : }
274 : return p;
275 : }
276 : public:
277 0 : repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}
278 :
279 0 : repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
280 0 : : start_pos(start)
281 : {
282 0 : state_id = i;
283 0 : stack = s;
284 0 : next = *stack;
285 0 : *stack = this;
286 0 : if((state_id > next->state_id) && (next->state_id >= 0))
287 0 : count = 0;
288 : else
289 : {
290 0 : repeater_count* p = next;
291 0 : p = unwind_until(state_id, p, current_recursion_id);
292 0 : if(p)
293 : {
294 0 : count = p->count;
295 0 : start_pos = p->start_pos;
296 : }
297 : else
298 0 : count = 0;
299 : }
300 0 : }
301 0 : ~repeater_count()
302 : {
303 0 : if(next)
304 0 : *stack = next;
305 0 : }
306 0 : std::size_t get_count() { return count; }
307 0 : int get_id() { return state_id; }
308 0 : std::size_t operator++() { return ++count; }
309 0 : bool check_null_repeat(const BidiIterator& pos, std::size_t max)
310 : {
311 : // this is called when we are about to start a new repeat,
312 : // if the last one was NULL move our count to max,
313 : // otherwise save the current position.
314 0 : bool result = (count == 0) ? false : (pos == start_pos);
315 : if(result)
316 0 : count = max;
317 : else
318 0 : start_pos = pos;
319 : return result;
320 : }
321 : };
322 :
323 : struct saved_state;
324 :
325 : enum saved_state_type
326 : {
327 : saved_type_end = 0,
328 : saved_type_paren = 1,
329 : saved_type_recurse = 2,
330 : saved_type_assertion = 3,
331 : saved_state_alt = 4,
332 : saved_state_repeater_count = 5,
333 : saved_state_extra_block = 6,
334 : saved_state_greedy_single_repeat = 7,
335 : saved_state_rep_slow_dot = 8,
336 : saved_state_rep_fast_dot = 9,
337 : saved_state_rep_char = 10,
338 : saved_state_rep_short_set = 11,
339 : saved_state_rep_long_set = 12,
340 : saved_state_non_greedy_long_repeat = 13,
341 : saved_state_count = 14
342 : };
343 :
344 : template <class Results>
345 0 : struct recursion_info
346 : {
347 : typedef typename Results::value_type value_type;
348 : typedef typename value_type::iterator iterator;
349 : int idx;
350 : const re_syntax_base* preturn_address;
351 : Results results;
352 : repeater_count<iterator>* repeater_stack;
353 : iterator location_of_start;
354 : };
355 :
356 : #ifdef BOOST_MSVC
357 : #pragma warning(push)
358 : #pragma warning(disable : 4251)
359 : #if BOOST_MSVC < 1700
360 : # pragma warning(disable : 4231)
361 : #endif
362 : # if BOOST_MSVC < 1600
363 : # pragma warning(disable : 4660)
364 : # endif
365 : #endif
366 :
367 : template <class BidiIterator, class Allocator, class traits>
368 : class perl_matcher
369 : {
370 : public:
371 : typedef typename traits::char_type char_type;
372 : typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
373 : typedef bool (self_type::*matcher_proc_type)(void);
374 : typedef std::size_t traits_size_type;
375 : typedef typename is_byte<char_type>::width_type width_type;
376 : typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
377 : typedef match_results<BidiIterator, Allocator> results_type;
378 :
379 0 : perl_matcher(BidiIterator first, BidiIterator end,
380 : match_results<BidiIterator, Allocator>& what,
381 : const basic_regex<char_type, traits>& e,
382 : match_flag_type f,
383 : BidiIterator l_base)
384 : : m_result(what), base(first), last(end),
385 : position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
386 0 : m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
387 : #ifdef BOOST_REGEX_NON_RECURSIVE
388 0 : , m_recursions(0)
389 : #endif
390 : {
391 0 : construct_init(e, f);
392 0 : }
393 :
394 : bool match();
395 : bool find();
396 :
397 : void setf(match_flag_type f)
398 : { m_match_flags |= f; }
399 : void unsetf(match_flag_type f)
400 : { m_match_flags &= ~f; }
401 :
402 : private:
403 : void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
404 :
405 : bool find_imp();
406 : bool match_imp();
407 : #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
408 : typedef bool (perl_matcher::*protected_proc_type)();
409 : bool protected_call(protected_proc_type);
410 : #endif
411 : void estimate_max_state_count(std::random_access_iterator_tag*);
412 : void estimate_max_state_count(void*);
413 : bool match_prefix();
414 : bool match_all_states();
415 :
416 : // match procs, stored in s_match_vtable:
417 : bool match_startmark();
418 : bool match_endmark();
419 : bool match_literal();
420 : bool match_start_line();
421 : bool match_end_line();
422 : bool match_wild();
423 : bool match_match();
424 : bool match_word_boundary();
425 : bool match_within_word();
426 : bool match_word_start();
427 : bool match_word_end();
428 : bool match_buffer_start();
429 : bool match_buffer_end();
430 : bool match_backref();
431 : bool match_long_set();
432 : bool match_set();
433 : bool match_jump();
434 : bool match_alt();
435 : bool match_rep();
436 : bool match_combining();
437 : bool match_soft_buffer_end();
438 : bool match_restart_continue();
439 : bool match_long_set_repeat();
440 : bool match_set_repeat();
441 : bool match_char_repeat();
442 : bool match_dot_repeat_fast();
443 : bool match_dot_repeat_slow();
444 0 : bool match_dot_repeat_dispatch()
445 : {
446 0 : return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
447 : }
448 : bool match_backstep();
449 : bool match_assert_backref();
450 : bool match_toggle_case();
451 : #ifdef BOOST_REGEX_RECURSIVE
452 : bool backtrack_till_match(std::size_t count);
453 : #endif
454 : bool match_recursion();
455 : bool match_fail();
456 : bool match_accept();
457 : bool match_commit();
458 : bool match_then();
459 : bool skip_until_paren(int index, bool match = true);
460 :
461 : // find procs stored in s_find_vtable:
462 : bool find_restart_any();
463 : bool find_restart_word();
464 : bool find_restart_line();
465 : bool find_restart_buf();
466 : bool find_restart_lit();
467 :
468 : private:
469 : // final result structure to be filled in:
470 : match_results<BidiIterator, Allocator>& m_result;
471 : // temporary result for POSIX matches:
472 : scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
473 : // pointer to actual result structure to fill in:
474 : match_results<BidiIterator, Allocator>* m_presult;
475 : // start of sequence being searched:
476 : BidiIterator base;
477 : // end of sequence being searched:
478 : BidiIterator last;
479 : // current character being examined:
480 : BidiIterator position;
481 : // where to restart next search after failed match attempt:
482 : BidiIterator restart;
483 : // where the current search started from, acts as base for $` during grep:
484 : BidiIterator search_base;
485 : // how far we can go back when matching lookbehind:
486 : BidiIterator backstop;
487 : // the expression being examined:
488 : const basic_regex<char_type, traits>& re;
489 : // the expression's traits class:
490 : const ::boost::regex_traits_wrapper<traits>& traits_inst;
491 : // the next state in the machine being matched:
492 : const re_syntax_base* pstate;
493 : // matching flags in use:
494 : match_flag_type m_match_flags;
495 : // how many states we have examined so far:
496 : std::ptrdiff_t state_count;
497 : // max number of states to examine before giving up:
498 : std::ptrdiff_t max_state_count;
499 : // whether we should ignore case or not:
500 : bool icase;
501 : // set to true when (position == last), indicates that we may have a partial match:
502 : bool m_has_partial_match;
503 : // set to true whenever we get a match:
504 : bool m_has_found_match;
505 : // set to true whenever we're inside an independent sub-expression:
506 : bool m_independent;
507 : // the current repeat being examined:
508 : repeater_count<BidiIterator>* next_count;
509 : // the first repeat being examined (top of linked list):
510 : repeater_count<BidiIterator> rep_obj;
511 : // the mask to pass when matching word boundaries:
512 : typename traits::char_class_type m_word_mask;
513 : // the bitmask to use when determining whether a match_any matches a newline or not:
514 : unsigned char match_any_mask;
515 : // recursion information:
516 : std::vector<recursion_info<results_type> > recursion_stack;
517 : #ifdef BOOST_REGEX_RECURSIVE
518 : // Set to false by a (*COMMIT):
519 : bool m_can_backtrack;
520 : bool m_have_accept;
521 : bool m_have_then;
522 : #endif
523 : #ifdef BOOST_REGEX_NON_RECURSIVE
524 : //
525 : // additional members for non-recursive version:
526 : //
527 : typedef bool (self_type::*unwind_proc_type)(bool);
528 :
529 : void extend_stack();
530 : bool unwind(bool);
531 : bool unwind_end(bool);
532 : bool unwind_paren(bool);
533 : bool unwind_recursion_stopper(bool);
534 : bool unwind_assertion(bool);
535 : bool unwind_alt(bool);
536 : bool unwind_repeater_counter(bool);
537 : bool unwind_extra_block(bool);
538 : bool unwind_greedy_single_repeat(bool);
539 : bool unwind_slow_dot_repeat(bool);
540 : bool unwind_fast_dot_repeat(bool);
541 : bool unwind_char_repeat(bool);
542 : bool unwind_short_set_repeat(bool);
543 : bool unwind_long_set_repeat(bool);
544 : bool unwind_non_greedy_repeat(bool);
545 : bool unwind_recursion(bool);
546 : bool unwind_recursion_pop(bool);
547 : bool unwind_commit(bool);
548 : bool unwind_then(bool);
549 : bool unwind_case(bool);
550 : void destroy_single_repeat();
551 : void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
552 : void push_recursion_stopper();
553 : void push_assertion(const re_syntax_base* ps, bool positive);
554 : void push_alt(const re_syntax_base* ps);
555 : void push_repeater_count(int i, repeater_count<BidiIterator>** s);
556 : void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
557 : void push_non_greedy_repeat(const re_syntax_base* ps);
558 : void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2);
559 : void push_recursion_pop();
560 : void push_case_change(bool);
561 :
562 : // pointer to base of stack:
563 : saved_state* m_stack_base;
564 : // pointer to current stack position:
565 : saved_state* m_backup_state;
566 : // how many memory blocks have we used up?:
567 : unsigned used_block_count;
568 : // determines what value to return when unwinding from recursion,
569 : // allows for mixed recursive/non-recursive algorithm:
570 : bool m_recursive_result;
571 : // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
572 : bool m_unwound_lookahead;
573 : // We have unwound to an alternative, used by THEN:
574 : bool m_unwound_alt;
575 : // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
576 : //bool m_unwind_commit;
577 : // Recursion limit:
578 : unsigned m_recursions;
579 : #endif
580 :
581 : // these operations aren't allowed, so are declared private,
582 : // bodies are provided to keep explicit-instantiation requests happy:
583 : perl_matcher& operator=(const perl_matcher&)
584 : {
585 : return *this;
586 : }
587 : perl_matcher(const perl_matcher& that)
588 : : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
589 : };
590 :
591 : #ifdef BOOST_MSVC
592 : #pragma warning(pop)
593 : #endif
594 :
595 : } // namespace BOOST_REGEX_DETAIL_NS
596 :
597 : #ifdef BOOST_MSVC
598 : #pragma warning(push)
599 : #pragma warning(disable: 4103)
600 : #endif
601 : #ifdef BOOST_HAS_ABI_HEADERS
602 : # include BOOST_ABI_SUFFIX
603 : #endif
604 : #ifdef BOOST_MSVC
605 : #pragma warning(pop)
606 : #endif
607 :
608 : } // namespace boost
609 :
610 : #ifdef BOOST_MSVC
611 : # pragma warning(pop)
612 : #endif
613 :
614 : //
615 : // include the implementation of perl_matcher:
616 : //
617 : #ifdef BOOST_REGEX_RECURSIVE
618 : #include <boost/regex/v4/perl_matcher_recursive.hpp>
619 : #else
620 : #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
621 : #endif
622 : // this one has to be last:
623 : #include <boost/regex/v4/perl_matcher_common.hpp>
624 :
625 : #endif
626 :
|