Line data Source code
1 : // class template regex -*- C++ -*-
2 :
3 : // Copyright (C) 2013-2019 Free Software Foundation, Inc.
4 : //
5 : // This file is part of the GNU ISO C++ Library. This library is free
6 : // software; you can redistribute it and/or modify it under the
7 : // terms of the GNU General Public License as published by the
8 : // Free Software Foundation; either version 3, or (at your option)
9 : // any later version.
10 :
11 : // This library is distributed in the hope that it will be useful,
12 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : // GNU General Public License for more details.
15 :
16 : // Under Section 7 of GPL version 3, you are granted additional
17 : // permissions described in the GCC Runtime Library Exception, version
18 : // 3.1, as published by the Free Software Foundation.
19 :
20 : // You should have received a copy of the GNU General Public License and
21 : // a copy of the GCC Runtime Library Exception along with this program;
22 : // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 : // <http://www.gnu.org/licenses/>.
24 :
25 : /**
26 : * @file bits/regex_scanner.tcc
27 : * This is an internal header file, included by other library headers.
28 : * Do not attempt to use it directly. @headername{regex}
29 : */
30 :
31 : // FIXME make comments doxygen format.
32 :
33 : // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 : // and awk
35 : // 1) grep is basic except '\n' is treated as '|'
36 : // 2) egrep is extended except '\n' is treated as '|'
37 : // 3) awk is extended except special escaping rules, and there's no
38 : // back-reference.
39 : //
40 : // References:
41 : //
42 : // ECMAScript: ECMA-262 15.10
43 : //
44 : // basic, extended:
45 : // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 : //
47 : // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 :
49 : namespace std _GLIBCXX_VISIBILITY(default)
50 : {
51 : _GLIBCXX_BEGIN_NAMESPACE_VERSION
52 :
53 : namespace __detail
54 : {
55 : template<typename _CharT>
56 955 : _Scanner<_CharT>::
57 : _Scanner(typename _Scanner::_IterT __begin,
58 : typename _Scanner::_IterT __end,
59 : _FlagT __flags, std::locale __loc)
60 : : _ScannerBase(__flags),
61 : _M_current(__begin), _M_end(__end),
62 955 : _M_ctype(std::use_facet<_CtypeT>(__loc)),
63 955 : _M_eat_escape(_M_is_ecma()
64 : ? &_Scanner::_M_eat_escape_ecma
65 955 : : &_Scanner::_M_eat_escape_posix)
66 955 : { _M_advance(); }
67 :
68 : template<typename _CharT>
69 : void
70 21010 : _Scanner<_CharT>::
71 : _M_advance()
72 : {
73 21010 : if (_M_current == _M_end)
74 : {
75 1910 : _M_token = _S_token_eof;
76 1910 : return;
77 : }
78 :
79 19100 : if (_M_state == _S_state_normal)
80 4775 : _M_scan_normal();
81 14325 : else if (_M_state == _S_state_in_bracket)
82 14325 : _M_scan_in_bracket();
83 0 : else if (_M_state == _S_state_in_brace)
84 0 : _M_scan_in_brace();
85 : else
86 : {
87 : __glibcxx_assert(false);
88 : }
89 : }
90 :
91 : // Differences between styles:
92 : // 1) "\(", "\)", "\{" in basic. It's not escaping.
93 : // 2) "(?:", "(?=", "(?!" in ECMAScript.
94 : template<typename _CharT>
95 : void
96 4775 : _Scanner<_CharT>::
97 : _M_scan_normal()
98 : {
99 4775 : auto __c = *_M_current++;
100 :
101 5045 : if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
102 : {
103 0 : _M_token = _S_token_ord_char;
104 0 : _M_value.assign(1, __c);
105 0 : return;
106 : }
107 4775 : if (__c == '\\')
108 : {
109 0 : if (_M_current == _M_end)
110 0 : __throw_regex_error(
111 : regex_constants::error_escape,
112 : "Unexpected end of regex when escaping.");
113 :
114 0 : if (!_M_is_basic()
115 0 : || (*_M_current != '('
116 : && *_M_current != ')'
117 0 : && *_M_current != '{'))
118 : {
119 0 : (this->*_M_eat_escape)();
120 0 : return;
121 : }
122 0 : __c = *_M_current++;
123 : }
124 4775 : if (__c == '(')
125 : {
126 0 : if (_M_is_ecma() && *_M_current == '?')
127 : {
128 0 : if (++_M_current == _M_end)
129 0 : __throw_regex_error(
130 : regex_constants::error_paren,
131 : "Unexpected end of regex when in an open parenthesis.");
132 :
133 0 : if (*_M_current == ':')
134 : {
135 0 : ++_M_current;
136 0 : _M_token = _S_token_subexpr_no_group_begin;
137 : }
138 0 : else if (*_M_current == '=')
139 : {
140 0 : ++_M_current;
141 0 : _M_token = _S_token_subexpr_lookahead_begin;
142 0 : _M_value.assign(1, 'p');
143 : }
144 0 : else if (*_M_current == '!')
145 : {
146 0 : ++_M_current;
147 0 : _M_token = _S_token_subexpr_lookahead_begin;
148 0 : _M_value.assign(1, 'n');
149 : }
150 : else
151 0 : __throw_regex_error(
152 : regex_constants::error_paren,
153 : "Invalid special open parenthesis.");
154 : }
155 0 : else if (_M_flags & regex_constants::nosubs)
156 0 : _M_token = _S_token_subexpr_no_group_begin;
157 : else
158 0 : _M_token = _S_token_subexpr_begin;
159 : }
160 4775 : else if (__c == ')')
161 0 : _M_token = _S_token_subexpr_end;
162 4775 : else if (__c == '[')
163 : {
164 4775 : _M_state = _S_state_in_bracket;
165 4775 : _M_at_bracket_start = true;
166 4775 : if (_M_current != _M_end && *_M_current == '^')
167 : {
168 0 : _M_token = _S_token_bracket_neg_begin;
169 0 : ++_M_current;
170 : }
171 : else
172 4775 : _M_token = _S_token_bracket_begin;
173 : }
174 0 : else if (__c == '{')
175 : {
176 0 : _M_state = _S_state_in_brace;
177 0 : _M_token = _S_token_interval_begin;
178 : }
179 0 : else if (__c != ']' && __c != '}')
180 : {
181 0 : auto __it = _M_token_tbl;
182 0 : auto __narrowc = _M_ctype.narrow(__c, '\0');
183 0 : for (; __it->first != '\0'; ++__it)
184 0 : if (__it->first == __narrowc)
185 : {
186 0 : _M_token = __it->second;
187 0 : return;
188 : }
189 : __glibcxx_assert(false);
190 : }
191 : else
192 : {
193 0 : _M_token = _S_token_ord_char;
194 0 : _M_value.assign(1, __c);
195 : }
196 : }
197 :
198 : // Differences between styles:
199 : // 1) different semantics of "[]" and "[^]".
200 : // 2) Escaping in bracket expr.
201 : template<typename _CharT>
202 : void
203 14325 : _Scanner<_CharT>::
204 : _M_scan_in_bracket()
205 : {
206 14325 : if (_M_current == _M_end)
207 0 : __throw_regex_error(
208 : regex_constants::error_brack,
209 : "Unexpected end of regex when in bracket expression.");
210 :
211 14325 : auto __c = *_M_current++;
212 :
213 14325 : if (__c == '-')
214 0 : _M_token = _S_token_bracket_dash;
215 14325 : else if (__c == '[')
216 : {
217 0 : if (_M_current == _M_end)
218 0 : __throw_regex_error(regex_constants::error_brack,
219 : "Unexpected character class open bracket.");
220 :
221 0 : if (*_M_current == '.')
222 : {
223 0 : _M_token = _S_token_collsymbol;
224 0 : _M_eat_class(*_M_current++);
225 : }
226 0 : else if (*_M_current == ':')
227 : {
228 0 : _M_token = _S_token_char_class_name;
229 0 : _M_eat_class(*_M_current++);
230 : }
231 0 : else if (*_M_current == '=')
232 : {
233 0 : _M_token = _S_token_equiv_class_name;
234 0 : _M_eat_class(*_M_current++);
235 : }
236 : else
237 : {
238 0 : _M_token = _S_token_ord_char;
239 0 : _M_value.assign(1, __c);
240 : }
241 : }
242 : // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
243 : // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
244 : // `*/empty_range.cc`.
245 14325 : else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
246 : {
247 4775 : _M_token = _S_token_bracket_end;
248 4775 : _M_state = _S_state_normal;
249 : }
250 : // ECMAScript and awk permits escaping in bracket.
251 9550 : else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
252 0 : (this->*_M_eat_escape)();
253 : else
254 : {
255 9550 : _M_token = _S_token_ord_char;
256 9550 : _M_value.assign(1, __c);
257 : }
258 14325 : _M_at_bracket_start = false;
259 14325 : }
260 :
261 : // Differences between styles:
262 : // 1) "\}" in basic style.
263 : template<typename _CharT>
264 : void
265 0 : _Scanner<_CharT>::
266 : _M_scan_in_brace()
267 : {
268 0 : if (_M_current == _M_end)
269 0 : __throw_regex_error(
270 : regex_constants::error_brace,
271 : "Unexpected end of regex when in brace expression.");
272 :
273 0 : auto __c = *_M_current++;
274 :
275 0 : if (_M_ctype.is(_CtypeT::digit, __c))
276 : {
277 0 : _M_token = _S_token_dup_count;
278 0 : _M_value.assign(1, __c);
279 0 : while (_M_current != _M_end
280 0 : && _M_ctype.is(_CtypeT::digit, *_M_current))
281 0 : _M_value += *_M_current++;
282 : }
283 0 : else if (__c == ',')
284 0 : _M_token = _S_token_comma;
285 : // basic use \}.
286 0 : else if (_M_is_basic())
287 : {
288 0 : if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
289 : {
290 0 : _M_state = _S_state_normal;
291 0 : _M_token = _S_token_interval_end;
292 0 : ++_M_current;
293 : }
294 : else
295 0 : __throw_regex_error(regex_constants::error_badbrace,
296 : "Unexpected character in brace expression.");
297 : }
298 0 : else if (__c == '}')
299 : {
300 0 : _M_state = _S_state_normal;
301 0 : _M_token = _S_token_interval_end;
302 : }
303 : else
304 0 : __throw_regex_error(regex_constants::error_badbrace,
305 : "Unexpected character in brace expression.");
306 0 : }
307 :
308 : template<typename _CharT>
309 : void
310 0 : _Scanner<_CharT>::
311 : _M_eat_escape_ecma()
312 : {
313 0 : if (_M_current == _M_end)
314 0 : __throw_regex_error(regex_constants::error_escape,
315 : "Unexpected end of regex when escaping.");
316 :
317 0 : auto __c = *_M_current++;
318 0 : auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
319 :
320 0 : if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
321 : {
322 0 : _M_token = _S_token_ord_char;
323 0 : _M_value.assign(1, *__pos);
324 : }
325 0 : else if (__c == 'b')
326 : {
327 0 : _M_token = _S_token_word_bound;
328 0 : _M_value.assign(1, 'p');
329 : }
330 0 : else if (__c == 'B')
331 : {
332 0 : _M_token = _S_token_word_bound;
333 0 : _M_value.assign(1, 'n');
334 : }
335 : // N3376 28.13
336 0 : else if (__c == 'd'
337 0 : || __c == 'D'
338 0 : || __c == 's'
339 0 : || __c == 'S'
340 0 : || __c == 'w'
341 0 : || __c == 'W')
342 : {
343 0 : _M_token = _S_token_quoted_class;
344 0 : _M_value.assign(1, __c);
345 : }
346 0 : else if (__c == 'c')
347 : {
348 0 : if (_M_current == _M_end)
349 0 : __throw_regex_error(
350 : regex_constants::error_escape,
351 : "Unexpected end of regex when reading control code.");
352 0 : _M_token = _S_token_ord_char;
353 0 : _M_value.assign(1, *_M_current++);
354 : }
355 0 : else if (__c == 'x' || __c == 'u')
356 : {
357 0 : _M_value.erase();
358 0 : for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
359 : {
360 0 : if (_M_current == _M_end
361 0 : || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
362 0 : __throw_regex_error(
363 : regex_constants::error_escape,
364 : "Unexpected end of regex when ascii character.");
365 0 : _M_value += *_M_current++;
366 : }
367 0 : _M_token = _S_token_hex_num;
368 : }
369 : // ECMAScript recognizes multi-digit back-references.
370 0 : else if (_M_ctype.is(_CtypeT::digit, __c))
371 : {
372 0 : _M_value.assign(1, __c);
373 0 : while (_M_current != _M_end
374 0 : && _M_ctype.is(_CtypeT::digit, *_M_current))
375 0 : _M_value += *_M_current++;
376 0 : _M_token = _S_token_backref;
377 : }
378 : else
379 : {
380 0 : _M_token = _S_token_ord_char;
381 0 : _M_value.assign(1, __c);
382 : }
383 0 : }
384 :
385 : // Differences between styles:
386 : // 1) Extended doesn't support backref, but basic does.
387 : template<typename _CharT>
388 : void
389 0 : _Scanner<_CharT>::
390 : _M_eat_escape_posix()
391 : {
392 0 : if (_M_current == _M_end)
393 0 : __throw_regex_error(regex_constants::error_escape,
394 : "Unexpected end of regex when escaping.");
395 :
396 0 : auto __c = *_M_current;
397 0 : auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
398 :
399 0 : if (__pos != nullptr && *__pos != '\0')
400 : {
401 0 : _M_token = _S_token_ord_char;
402 0 : _M_value.assign(1, __c);
403 : }
404 : // We MUST judge awk before handling backrefs. There's no backref in awk.
405 0 : else if (_M_is_awk())
406 : {
407 0 : _M_eat_escape_awk();
408 0 : return;
409 : }
410 0 : else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
411 : {
412 0 : _M_token = _S_token_backref;
413 0 : _M_value.assign(1, __c);
414 : }
415 : else
416 : {
417 : #ifdef __STRICT_ANSI__
418 : // POSIX says it is undefined to escape ordinary characters
419 0 : __throw_regex_error(regex_constants::error_escape,
420 : "Unexpected escape character.");
421 : #else
422 : _M_token = _S_token_ord_char;
423 : _M_value.assign(1, __c);
424 : #endif
425 : }
426 0 : ++_M_current;
427 : }
428 :
429 : template<typename _CharT>
430 : void
431 0 : _Scanner<_CharT>::
432 : _M_eat_escape_awk()
433 : {
434 0 : auto __c = *_M_current++;
435 0 : auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
436 :
437 0 : if (__pos != nullptr)
438 : {
439 0 : _M_token = _S_token_ord_char;
440 0 : _M_value.assign(1, *__pos);
441 : }
442 : // \ddd for oct representation
443 0 : else if (_M_ctype.is(_CtypeT::digit, __c)
444 0 : && __c != '8'
445 0 : && __c != '9')
446 : {
447 0 : _M_value.assign(1, __c);
448 0 : for (int __i = 0;
449 : __i < 2
450 0 : && _M_current != _M_end
451 0 : && _M_ctype.is(_CtypeT::digit, *_M_current)
452 0 : && *_M_current != '8'
453 0 : && *_M_current != '9';
454 : __i++)
455 0 : _M_value += *_M_current++;
456 0 : _M_token = _S_token_oct_num;
457 0 : return;
458 : }
459 : else
460 0 : __throw_regex_error(regex_constants::error_escape,
461 : "Unexpected escape character.");
462 : }
463 :
464 : // Eats a character class or throws an exception.
465 : // __ch could be ':', '.' or '=', _M_current is the char after ']' when
466 : // returning.
467 : template<typename _CharT>
468 : void
469 0 : _Scanner<_CharT>::
470 : _M_eat_class(char __ch)
471 : {
472 0 : for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
473 0 : _M_value += *_M_current++;
474 : if (_M_current == _M_end
475 0 : || *_M_current++ != __ch
476 0 : || _M_current == _M_end // skip __ch
477 0 : || *_M_current++ != ']') // skip ']'
478 : {
479 0 : if (__ch == ':')
480 0 : __throw_regex_error(regex_constants::error_ctype,
481 : "Unexpected end of character class.");
482 : else
483 0 : __throw_regex_error(regex_constants::error_collate,
484 : "Unexpected end of character class.");
485 : }
486 0 : }
487 :
#ifdef _GLIBCXX_DEBUG
// Writes a one-line, human readable description of the current token to
// ostr and returns ostr.  Tokens that carry a payload also print
// _M_value.  Only compiled when _GLIBCXX_DEBUG is defined.
template<typename _CharT>
  std::ostream&
  _Scanner<_CharT>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_token)
      {
      // Tokens without a payload.
      case _S_token_anychar:
        return ostr << "any-character\n";
      case _S_token_backref:
        return ostr << "backref\n";
      case _S_token_bracket_begin:
        return ostr << "bracket-begin\n";
      case _S_token_bracket_neg_begin:
        return ostr << "bracket-neg-begin\n";
      case _S_token_bracket_end:
        return ostr << "bracket-end\n";
      case _S_token_closure0:
        return ostr << "closure0\n";
      case _S_token_closure1:
        return ostr << "closure1\n";
      case _S_token_comma:
        return ostr << "comma\n";
      case _S_token_eof:
        return ostr << "EOF\n";
      case _S_token_interval_begin:
        return ostr << "interval begin\n";
      case _S_token_interval_end:
        return ostr << "interval end\n";
      case _S_token_line_begin:
        return ostr << "line begin\n";
      case _S_token_line_end:
        return ostr << "line end\n";
      case _S_token_opt:
        return ostr << "opt\n";
      case _S_token_or:
        return ostr << "or\n";
      case _S_token_subexpr_begin:
        return ostr << "subexpr begin\n";
      case _S_token_subexpr_no_group_begin:
        return ostr << "no grouping subexpr begin\n";
      case _S_token_subexpr_lookahead_begin:
        return ostr << "lookahead subexpr begin\n";
      case _S_token_subexpr_end:
        return ostr << "subexpr end\n";
      case _S_token_unknown:
        return ostr << "-- unknown token --\n";

      // Tokens that also print _M_value.
      case _S_token_char_class_name:
        return ostr << "char-class-name \"" << _M_value << "\"\n";
      case _S_token_collsymbol:
        return ostr << "collsymbol \"" << _M_value << "\"\n";
      case _S_token_equiv_class_name:
        return ostr << "equiv-class-name \"" << _M_value << "\"\n";
      case _S_token_ord_char:
        return ostr << "ordinary character: \"" << _M_value << "\"\n";
      case _S_token_dup_count:
        return ostr << "dup count: " << _M_value << "\n";
      case _S_token_oct_num:
        return ostr << "oct number " << _M_value << "\n";
      case _S_token_hex_num:
        return ostr << "hex number " << _M_value << "\n";
      case _S_token_quoted_class:
        return ostr << "quoted class " << "\\" << _M_value << "\n";

      default:
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return ostr;
  }
#endif
586 :
587 : } // namespace __detail
588 : _GLIBCXX_END_NAMESPACE_VERSION
589 : } // namespace
|