LCOV - code coverage report
Current view: top level - usr/include/c++/9/bits - regex_scanner.tcc (source / functions) Hit Total Coverage
Test: ROSE Lines: 37 211 17.5 %
Date: 2022-12-08 13:48:47 Functions: 4 9 44.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // class template regex -*- C++ -*-
       2             : 
       3             : // Copyright (C) 2013-2019 Free Software Foundation, Inc.
       4             : //
       5             : // This file is part of the GNU ISO C++ Library.  This library is free
       6             : // software; you can redistribute it and/or modify it under the
       7             : // terms of the GNU General Public License as published by the
       8             : // Free Software Foundation; either version 3, or (at your option)
       9             : // any later version.
      10             : 
      11             : // This library is distributed in the hope that it will be useful,
      12             : // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             : // GNU General Public License for more details.
      15             : 
      16             : // Under Section 7 of GPL version 3, you are granted additional
      17             : // permissions described in the GCC Runtime Library Exception, version
      18             : // 3.1, as published by the Free Software Foundation.
      19             : 
      20             : // You should have received a copy of the GNU General Public License and
      21             : // a copy of the GCC Runtime Library Exception along with this program;
      22             : // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23             : // <http://www.gnu.org/licenses/>.
      24             : 
      25             : /**
      26             :  *  @file bits/regex_scanner.tcc
      27             :  *  This is an internal header file, included by other library headers.
      28             :  *  Do not attempt to use it directly. @headername{regex}
      29             :  */
      30             : 
      31             : // FIXME make comments doxygen format.
      32             : 
      33             : // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
      34             : // and awk
      35             : // 1) grep is basic except '\n' is treated as '|'
      36             : // 2) egrep is extended except '\n' is treated as '|'
      37             : // 3) awk is extended except special escaping rules, and there's no
      38             : //    back-reference.
      39             : //
      40             : // References:
      41             : //
      42             : // ECMAScript: ECMA-262 15.10
      43             : //
      44             : // basic, extended:
      45             : // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
      46             : //
      47             : // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
      48             : 
      49             : namespace std _GLIBCXX_VISIBILITY(default)
      50             : {
      51             : _GLIBCXX_BEGIN_NAMESPACE_VERSION
      52             : 
      53             : namespace __detail
      54             : {
      55             :   template<typename _CharT>
      56         955 :     _Scanner<_CharT>::
      57             :     _Scanner(typename _Scanner::_IterT __begin,
      58             :              typename _Scanner::_IterT __end,
      59             :              _FlagT __flags, std::locale __loc)
      60             :     : _ScannerBase(__flags),
      61             :       _M_current(__begin), _M_end(__end),
      62         955 :       _M_ctype(std::use_facet<_CtypeT>(__loc)),
      63         955 :       _M_eat_escape(_M_is_ecma()
      64             :                     ? &_Scanner::_M_eat_escape_ecma
      65         955 :                     : &_Scanner::_M_eat_escape_posix)
      66         955 :     { _M_advance(); }
      67             : 
      68             :   template<typename _CharT>
      69             :     void
      70       21010 :     _Scanner<_CharT>::
      71             :     _M_advance()
      72             :     {
      73       21010 :       if (_M_current == _M_end)
      74             :         {
      75        1910 :           _M_token = _S_token_eof;
      76        1910 :           return;
      77             :         }
      78             : 
      79       19100 :       if (_M_state == _S_state_normal)
      80        4775 :         _M_scan_normal();
      81       14325 :       else if (_M_state == _S_state_in_bracket)
      82       14325 :         _M_scan_in_bracket();
      83           0 :       else if (_M_state == _S_state_in_brace)
      84           0 :         _M_scan_in_brace();
      85             :       else
      86             :         {
      87             :           __glibcxx_assert(false);
      88             :         }
      89             :     }
      90             : 
      91             :   // Differences between styles:
      92             :   // 1) "\(", "\)", "\{" in basic. It's not escaping.
      93             :   // 2) "(?:", "(?=", "(?!" in ECMAScript.
      94             :   template<typename _CharT>
      95             :     void
      96        4775 :     _Scanner<_CharT>::
      97             :     _M_scan_normal()
      98             :     {
      99        4775 :       auto __c = *_M_current++;
     100             : 
     101        5045 :       if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
     102             :         {
     103           0 :           _M_token = _S_token_ord_char;
     104           0 :           _M_value.assign(1, __c);
     105           0 :           return;
     106             :         }
     107        4775 :       if (__c == '\\')
     108             :         {
     109           0 :           if (_M_current == _M_end)
     110           0 :             __throw_regex_error(
     111             :               regex_constants::error_escape,
     112             :               "Unexpected end of regex when escaping.");
     113             : 
     114           0 :           if (!_M_is_basic()
     115           0 :               || (*_M_current != '('
     116             :                   && *_M_current != ')'
     117           0 :                   && *_M_current != '{'))
     118             :             {
     119           0 :               (this->*_M_eat_escape)();
     120           0 :               return;
     121             :             }
     122           0 :           __c = *_M_current++;
     123             :         }
     124        4775 :       if (__c == '(')
     125             :         {
     126           0 :           if (_M_is_ecma() && *_M_current == '?')
     127             :             {
     128           0 :               if (++_M_current == _M_end)
     129           0 :                 __throw_regex_error(
     130             :                   regex_constants::error_paren,
     131             :                   "Unexpected end of regex when in an open parenthesis.");
     132             : 
     133           0 :               if (*_M_current == ':')
     134             :                 {
     135           0 :                   ++_M_current;
     136           0 :                   _M_token = _S_token_subexpr_no_group_begin;
     137             :                 }
     138           0 :               else if (*_M_current == '=')
     139             :                 {
     140           0 :                   ++_M_current;
     141           0 :                   _M_token = _S_token_subexpr_lookahead_begin;
     142           0 :                   _M_value.assign(1, 'p');
     143             :                 }
     144           0 :               else if (*_M_current == '!')
     145             :                 {
     146           0 :                   ++_M_current;
     147           0 :                   _M_token = _S_token_subexpr_lookahead_begin;
     148           0 :                   _M_value.assign(1, 'n');
     149             :                 }
     150             :               else
     151           0 :                 __throw_regex_error(
     152             :                   regex_constants::error_paren,
     153             :                   "Invalid special open parenthesis.");
     154             :             }
     155           0 :           else if (_M_flags & regex_constants::nosubs)
     156           0 :             _M_token = _S_token_subexpr_no_group_begin;
     157             :           else
     158           0 :             _M_token = _S_token_subexpr_begin;
     159             :         }
     160        4775 :       else if (__c == ')')
     161           0 :         _M_token = _S_token_subexpr_end;
     162        4775 :       else if (__c == '[')
     163             :         {
     164        4775 :           _M_state = _S_state_in_bracket;
     165        4775 :           _M_at_bracket_start = true;
     166        4775 :           if (_M_current != _M_end && *_M_current == '^')
     167             :             {
     168           0 :               _M_token = _S_token_bracket_neg_begin;
     169           0 :               ++_M_current;
     170             :             }
     171             :           else
     172        4775 :             _M_token = _S_token_bracket_begin;
     173             :         }
     174           0 :       else if (__c == '{')
     175             :         {
     176           0 :           _M_state = _S_state_in_brace;
     177           0 :           _M_token = _S_token_interval_begin;
     178             :         }
     179           0 :       else if (__c != ']' && __c != '}')
     180             :         {
     181           0 :           auto __it = _M_token_tbl;
     182           0 :           auto __narrowc = _M_ctype.narrow(__c, '\0');
     183           0 :           for (; __it->first != '\0'; ++__it)
     184           0 :             if (__it->first == __narrowc)
     185             :               {
     186           0 :                 _M_token = __it->second;
     187           0 :                 return;
     188             :               }
     189             :           __glibcxx_assert(false);
     190             :         }
     191             :       else
     192             :         {
     193           0 :           _M_token = _S_token_ord_char;
     194           0 :           _M_value.assign(1, __c);
     195             :         }
     196             :     }
     197             : 
     198             :   // Differences between styles:
     199             :   // 1) different semantics of "[]" and "[^]".
     200             :   // 2) Escaping in bracket expr.
     201             :   template<typename _CharT>
     202             :     void
     203       14325 :     _Scanner<_CharT>::
     204             :     _M_scan_in_bracket()
     205             :     {
     206       14325 :       if (_M_current == _M_end)
     207           0 :         __throw_regex_error(
     208             :           regex_constants::error_brack,
     209             :           "Unexpected end of regex when in bracket expression.");
     210             : 
     211       14325 :       auto __c = *_M_current++;
     212             : 
     213       14325 :       if (__c == '-')
     214           0 :         _M_token = _S_token_bracket_dash;
     215       14325 :       else if (__c == '[')
     216             :         {
     217           0 :           if (_M_current == _M_end)
     218           0 :             __throw_regex_error(regex_constants::error_brack,
     219             :                                 "Unexpected character class open bracket.");
     220             : 
     221           0 :           if (*_M_current == '.')
     222             :             {
     223           0 :               _M_token = _S_token_collsymbol;
     224           0 :               _M_eat_class(*_M_current++);
     225             :             }
     226           0 :           else if (*_M_current == ':')
     227             :             {
     228           0 :               _M_token = _S_token_char_class_name;
     229           0 :               _M_eat_class(*_M_current++);
     230             :             }
     231           0 :           else if (*_M_current == '=')
     232             :             {
     233           0 :               _M_token = _S_token_equiv_class_name;
     234           0 :               _M_eat_class(*_M_current++);
     235             :             }
     236             :           else
     237             :             {
     238           0 :               _M_token = _S_token_ord_char;
     239           0 :               _M_value.assign(1, __c);
     240             :             }
     241             :         }
     242             :       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
     243             :       // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
     244             :       // `*/empty_range.cc`.
     245       14325 :       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
     246             :         {
     247        4775 :           _M_token = _S_token_bracket_end;
     248        4775 :           _M_state = _S_state_normal;
     249             :         }
     250             :       // ECMAScript and awk permits escaping in bracket.
     251        9550 :       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
     252           0 :         (this->*_M_eat_escape)();
     253             :       else
     254             :         {
     255        9550 :           _M_token = _S_token_ord_char;
     256        9550 :           _M_value.assign(1, __c);
     257             :         }
     258       14325 :       _M_at_bracket_start = false;
     259       14325 :     }
     260             : 
     261             :   // Differences between styles:
     262             :   // 1) "\}" in basic style.
     263             :   template<typename _CharT>
     264             :     void
     265           0 :     _Scanner<_CharT>::
     266             :     _M_scan_in_brace()
     267             :     {
     268           0 :       if (_M_current == _M_end)
     269           0 :         __throw_regex_error(
     270             :           regex_constants::error_brace,
     271             :           "Unexpected end of regex when in brace expression.");
     272             : 
     273           0 :       auto __c = *_M_current++;
     274             : 
     275           0 :       if (_M_ctype.is(_CtypeT::digit, __c))
     276             :         {
     277           0 :           _M_token = _S_token_dup_count;
     278           0 :           _M_value.assign(1, __c);
     279           0 :           while (_M_current != _M_end
     280           0 :                  && _M_ctype.is(_CtypeT::digit, *_M_current))
     281           0 :             _M_value += *_M_current++;
     282             :         }
     283           0 :       else if (__c == ',')
     284           0 :         _M_token = _S_token_comma;
     285             :       // basic use \}.
     286           0 :       else if (_M_is_basic())
     287             :         {
     288           0 :           if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
     289             :             {
     290           0 :               _M_state = _S_state_normal;
     291           0 :               _M_token = _S_token_interval_end;
     292           0 :               ++_M_current;
     293             :             }
     294             :           else
     295           0 :             __throw_regex_error(regex_constants::error_badbrace,
     296             :                                 "Unexpected character in brace expression.");
     297             :         }
     298           0 :       else if (__c == '}')
     299             :         {
     300           0 :           _M_state = _S_state_normal;
     301           0 :           _M_token = _S_token_interval_end;
     302             :         }
     303             :       else
     304           0 :         __throw_regex_error(regex_constants::error_badbrace,
     305             :                             "Unexpected character in brace expression.");
     306           0 :     }
     307             : 
     308             :   template<typename _CharT>
     309             :     void
     310           0 :     _Scanner<_CharT>::
     311             :     _M_eat_escape_ecma()
     312             :     {
     313           0 :       if (_M_current == _M_end)
     314           0 :         __throw_regex_error(regex_constants::error_escape,
     315             :                             "Unexpected end of regex when escaping.");
     316             : 
     317           0 :       auto __c = *_M_current++;
     318           0 :       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
     319             : 
     320           0 :       if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
     321             :         {
     322           0 :           _M_token = _S_token_ord_char;
     323           0 :           _M_value.assign(1, *__pos);
     324             :         }
     325           0 :       else if (__c == 'b')
     326             :         {
     327           0 :           _M_token = _S_token_word_bound;
     328           0 :           _M_value.assign(1, 'p');
     329             :         }
     330           0 :       else if (__c == 'B')
     331             :         {
     332           0 :           _M_token = _S_token_word_bound;
     333           0 :           _M_value.assign(1, 'n');
     334             :         }
     335             :       // N3376 28.13
     336           0 :       else if (__c == 'd'
     337           0 :                || __c == 'D'
     338           0 :                || __c == 's'
     339           0 :                || __c == 'S'
     340           0 :                || __c == 'w'
     341           0 :                || __c == 'W')
     342             :         {
     343           0 :           _M_token = _S_token_quoted_class;
     344           0 :           _M_value.assign(1, __c);
     345             :         }
     346           0 :       else if (__c == 'c')
     347             :         {
     348           0 :           if (_M_current == _M_end)
     349           0 :             __throw_regex_error(
     350             :               regex_constants::error_escape,
     351             :               "Unexpected end of regex when reading control code.");
     352           0 :           _M_token = _S_token_ord_char;
     353           0 :           _M_value.assign(1, *_M_current++);
     354             :         }
     355           0 :       else if (__c == 'x' || __c == 'u')
     356             :         {
     357           0 :           _M_value.erase();
     358           0 :           for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
     359             :             {
     360           0 :               if (_M_current == _M_end
     361           0 :                   || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
     362           0 :                 __throw_regex_error(
     363             :                   regex_constants::error_escape,
     364             :                   "Unexpected end of regex when ascii character.");
     365           0 :               _M_value += *_M_current++;
     366             :             }
     367           0 :           _M_token = _S_token_hex_num;
     368             :         }
     369             :       // ECMAScript recognizes multi-digit back-references.
     370           0 :       else if (_M_ctype.is(_CtypeT::digit, __c))
     371             :         {
     372           0 :           _M_value.assign(1, __c);
     373           0 :           while (_M_current != _M_end
     374           0 :                  && _M_ctype.is(_CtypeT::digit, *_M_current))
     375           0 :             _M_value += *_M_current++;
     376           0 :           _M_token = _S_token_backref;
     377             :         }
     378             :       else
     379             :         {
     380           0 :           _M_token = _S_token_ord_char;
     381           0 :           _M_value.assign(1, __c);
     382             :         }
     383           0 :     }
     384             : 
     385             :   // Differences between styles:
     386             :   // 1) Extended doesn't support backref, but basic does.
     387             :   template<typename _CharT>
     388             :     void
     389           0 :     _Scanner<_CharT>::
     390             :     _M_eat_escape_posix()
     391             :     {
     392           0 :       if (_M_current == _M_end)
     393           0 :         __throw_regex_error(regex_constants::error_escape,
     394             :                             "Unexpected end of regex when escaping.");
     395             : 
     396           0 :       auto __c = *_M_current;
     397           0 :       auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
     398             : 
     399           0 :       if (__pos != nullptr && *__pos != '\0')
     400             :         {
     401           0 :           _M_token = _S_token_ord_char;
     402           0 :           _M_value.assign(1, __c);
     403             :         }
     404             :       // We MUST judge awk before handling backrefs. There's no backref in awk.
     405           0 :       else if (_M_is_awk())
     406             :         {
     407           0 :           _M_eat_escape_awk();
     408           0 :           return;
     409             :         }
     410           0 :       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
     411             :         {
     412           0 :           _M_token = _S_token_backref;
     413           0 :           _M_value.assign(1, __c);
     414             :         }
     415             :       else
     416             :         {
     417             : #ifdef __STRICT_ANSI__
     418             :           // POSIX says it is undefined to escape ordinary characters
     419           0 :           __throw_regex_error(regex_constants::error_escape,
     420             :                               "Unexpected escape character.");
     421             : #else
     422             :           _M_token = _S_token_ord_char;
     423             :           _M_value.assign(1, __c);
     424             : #endif
     425             :         }
     426           0 :       ++_M_current;
     427             :     }
     428             : 
     429             :   template<typename _CharT>
     430             :     void
     431           0 :     _Scanner<_CharT>::
     432             :     _M_eat_escape_awk()
     433             :     {
     434           0 :       auto __c = *_M_current++;
     435           0 :       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
     436             : 
     437           0 :       if (__pos != nullptr)
     438             :         {
     439           0 :           _M_token = _S_token_ord_char;
     440           0 :           _M_value.assign(1, *__pos);
     441             :         }
     442             :       // \ddd for oct representation
     443           0 :       else if (_M_ctype.is(_CtypeT::digit, __c)
     444           0 :                && __c != '8'
     445           0 :                && __c != '9')
     446             :         {
     447           0 :           _M_value.assign(1,  __c);
     448           0 :           for (int __i = 0;
     449             :                __i < 2
     450           0 :                && _M_current != _M_end
     451           0 :                && _M_ctype.is(_CtypeT::digit, *_M_current)
     452           0 :                && *_M_current != '8'
     453           0 :                && *_M_current != '9';
     454             :                __i++)
     455           0 :             _M_value += *_M_current++;
     456           0 :           _M_token = _S_token_oct_num;
     457           0 :           return;
     458             :         }
     459             :       else
     460           0 :         __throw_regex_error(regex_constants::error_escape,
     461             :                             "Unexpected escape character.");
     462             :     }
     463             : 
     464             :   // Eats a character class or throws an exception.
     465             :   // __ch could be ':', '.' or '=', _M_current is the char after ']' when
     466             :   // returning.
     467             :   template<typename _CharT>
     468             :     void
     469           0 :     _Scanner<_CharT>::
     470             :     _M_eat_class(char __ch)
     471             :     {
     472           0 :       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
     473           0 :         _M_value += *_M_current++;
     474             :       if (_M_current == _M_end
     475           0 :           || *_M_current++ != __ch
     476           0 :           || _M_current == _M_end // skip __ch
     477           0 :           || *_M_current++ != ']') // skip ']'
     478             :         {
     479           0 :           if (__ch == ':')
     480           0 :             __throw_regex_error(regex_constants::error_ctype,
     481             :                                 "Unexpected end of character class.");
     482             :           else
     483           0 :             __throw_regex_error(regex_constants::error_collate,
     484             :                                 "Unexpected end of character class.");
     485             :         }
     486           0 :     }
     487             : 
     488             : #ifdef _GLIBCXX_DEBUG
     489             :   template<typename _CharT>
     490             :     std::ostream&
     491             :     _Scanner<_CharT>::
     492             :     _M_print(std::ostream& ostr)
     493             :     {
     494             :       switch (_M_token)
     495             :       {
     496             :       case _S_token_anychar:
     497             :         ostr << "any-character\n";
     498             :         break;
     499             :       case _S_token_backref:
     500             :         ostr << "backref\n";
     501             :         break;
     502             :       case _S_token_bracket_begin:
     503             :         ostr << "bracket-begin\n";
     504             :         break;
     505             :       case _S_token_bracket_neg_begin:
     506             :         ostr << "bracket-neg-begin\n";
     507             :         break;
     508             :       case _S_token_bracket_end:
     509             :         ostr << "bracket-end\n";
     510             :         break;
     511             :       case _S_token_char_class_name:
     512             :         ostr << "char-class-name \"" << _M_value << "\"\n";
     513             :         break;
     514             :       case _S_token_closure0:
     515             :         ostr << "closure0\n";
     516             :         break;
     517             :       case _S_token_closure1:
     518             :         ostr << "closure1\n";
     519             :         break;
     520             :       case _S_token_collsymbol:
     521             :         ostr << "collsymbol \"" << _M_value << "\"\n";
     522             :         break;
     523             :       case _S_token_comma:
     524             :         ostr << "comma\n";
     525             :         break;
     526             :       case _S_token_dup_count:
     527             :         ostr << "dup count: " << _M_value << "\n";
     528             :         break;
     529             :       case _S_token_eof:
     530             :         ostr << "EOF\n";
     531             :         break;
     532             :       case _S_token_equiv_class_name:
     533             :         ostr << "equiv-class-name \"" << _M_value << "\"\n";
     534             :         break;
     535             :       case _S_token_interval_begin:
     536             :         ostr << "interval begin\n";
     537             :         break;
     538             :       case _S_token_interval_end:
     539             :         ostr << "interval end\n";
     540             :         break;
     541             :       case _S_token_line_begin:
     542             :         ostr << "line begin\n";
     543             :         break;
     544             :       case _S_token_line_end:
     545             :         ostr << "line end\n";
     546             :         break;
     547             :       case _S_token_opt:
     548             :         ostr << "opt\n";
     549             :         break;
     550             :       case _S_token_or:
     551             :         ostr << "or\n";
     552             :         break;
     553             :       case _S_token_ord_char:
     554             :         ostr << "ordinary character: \"" << _M_value << "\"\n";
     555             :         break;
     556             :       case _S_token_subexpr_begin:
     557             :         ostr << "subexpr begin\n";
     558             :         break;
     559             :       case _S_token_subexpr_no_group_begin:
     560             :         ostr << "no grouping subexpr begin\n";
     561             :         break;
     562             :       case _S_token_subexpr_lookahead_begin:
     563             :         ostr << "lookahead subexpr begin\n";
     564             :         break;
     565             :       case _S_token_subexpr_end:
     566             :         ostr << "subexpr end\n";
     567             :         break;
     568             :       case _S_token_unknown:
     569             :         ostr << "-- unknown token --\n";
     570             :         break;
     571             :       case _S_token_oct_num:
     572             :         ostr << "oct number " << _M_value << "\n";
     573             :         break;
     574             :       case _S_token_hex_num:
     575             :         ostr << "hex number " << _M_value << "\n";
     576             :         break;
     577             :       case _S_token_quoted_class:
     578             :         ostr << "quoted class " << "\\" << _M_value << "\n";
     579             :         break;
     580             :       default:
     581             :         _GLIBCXX_DEBUG_ASSERT(false);
     582             :       }
     583             :       return ostr;
     584             :     }
     585             : #endif
     586             : 
     587             : } // namespace __detail
     588             : _GLIBCXX_END_NAMESPACE_VERSION
     589             : } // namespace

Generated by: LCOV version 1.14