diff options
author | timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-08-27 02:49:22 +0000 |
---|---|---|
committer | timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-08-27 02:49:22 +0000 |
commit | 24ef3585c8da774f02d9163d1603c5901ed9455a (patch) | |
tree | 2152998d3b362b24cc7f901c04a1249d8fe40c68 /libstdc++-v3/include | |
parent | 2517803207400a68ebb8e234e2a80075d24fa654 (diff) | |
download | ppe42-gcc-24ef3585c8da774f02d9163d1603c5901ed9455a.tar.gz ppe42-gcc-24ef3585c8da774f02d9163d1603c5901ed9455a.zip |
2013-08-26 Tim Shen <timshen91@gmail.com>
* include/Makefile.am: Add regex_scanner.{h,tcc}.
* include/Makefile.in: Regenerate.
* include/bits/regex.h (regex_search): Handle the `__first == __last`
situation correctly.
* include/bits/regex_compiler.h: Move _Scanner...
* include/bits/regex_scanner.h: ...to here. New.
* include/bits/regex_compiler.tcc: Move _Scanner...
* include/bits/regex_scanner.tcc: ...to here, too. New.
* include/bits/regex_executor.tcc: Use value instead of reference for
submatch.
* include/std/regex: Add regex_scanner.h.
* testsuite/28_regex/algorithms/regex_match/awk/cstring_01.cc: New.
* testsuite/28_regex/algorithms/regex_match/basic/empty_range.cc: New.
* testsuite/28_regex/algorithms/regex_match/ecma/cstring_hex.cc: New.
* testsuite/28_regex/algorithms/regex_match/ecma/empty_range.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc: New.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@202015 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libstdc++-v3/include')
-rw-r--r-- | libstdc++-v3/include/Makefile.am | 2 | ||||
-rw-r--r-- | libstdc++-v3/include/Makefile.in | 2 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex.h | 12 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_compiler.h | 295 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_compiler.tcc | 693 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.tcc | 2 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_scanner.h | 194 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_scanner.tcc | 609 | ||||
-rw-r--r-- | libstdc++-v3/include/std/regex | 1 |
9 files changed, 1020 insertions, 790 deletions
diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am index 5971af3edc1..0bceb5776a5 100644 --- a/libstdc++-v3/include/Makefile.am +++ b/libstdc++-v3/include/Makefile.am @@ -128,6 +128,8 @@ bits_headers = \ ${bits_srcdir}/regex.h \ ${bits_srcdir}/regex_constants.h \ ${bits_srcdir}/regex_error.h \ + ${bits_srcdir}/regex_scanner.h \ + ${bits_srcdir}/regex_scanner.tcc \ ${bits_srcdir}/regex_automaton.h \ ${bits_srcdir}/regex_automaton.tcc \ ${bits_srcdir}/regex_compiler.h \ diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in index aa8ef43b224..b1606397013 100644 --- a/libstdc++-v3/include/Makefile.in +++ b/libstdc++-v3/include/Makefile.in @@ -395,6 +395,8 @@ bits_headers = \ ${bits_srcdir}/regex.h \ ${bits_srcdir}/regex_constants.h \ ${bits_srcdir}/regex_error.h \ + ${bits_srcdir}/regex_scanner.h \ + ${bits_srcdir}/regex_scanner.tcc \ ${bits_srcdir}/regex_automaton.h \ ${bits_srcdir}/regex_automaton.tcc \ ${bits_srcdir}/regex_compiler.h \ diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 555dfc6e1fb..48388198ce0 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -740,11 +740,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * @throws regex_error if @p [__first, __last) is not a valid regular * expression. */ - template<typename _InputIterator> - basic_regex(_InputIterator __first, _InputIterator __last, + template<typename _FwdIter> + basic_regex(_FwdIter __first, _FwdIter __last, flag_type __f = ECMAScript) : _M_flags(__f), - _M_automaton(__detail::_Compiler<_InputIterator, _Ch_type, _Rx_traits> + _M_automaton(__detail::_Compiler<_FwdIter, _Ch_type, _Rx_traits> (__first, __last, _M_traits, _M_flags)._M_get_nfa()) { } @@ -2371,7 +2371,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; - for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo? 
+ auto __cur = __first; + // Continue when __cur == __last + do { __detail::__get_executor(__cur, __last, __m, __re, __flags) ->_M_search_from_first(); @@ -2391,10 +2393,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return true; } } + while (__cur++ != __last); return false; } - /** * Searches for a regular expression within a range. * @param __first [IN] The start of the string to search. diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 4ab36d28d2b..1d588b91df8 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -39,197 +39,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * @{ */ - /// Matches a character range (bracket expression) template<typename _CharT, typename _TraitsT> - struct _BracketMatcher - { - typedef typename _TraitsT::char_class_type _CharClassT; - typedef typename _TraitsT::string_type _StringT; - typedef regex_constants::syntax_option_type _FlagT; - - explicit - _BracketMatcher(bool __is_non_matching, - const _TraitsT& __t, - _FlagT __flags) - : _M_is_non_matching(__is_non_matching), _M_traits(__t), - _M_flags(__flags), _M_class_set(0) - { } - - bool - operator()(_CharT) const; - - void - _M_add_char(_CharT __c) - { - if (_M_flags & regex_constants::collate) - if (_M_is_icase()) - _M_char_set.push_back(_M_traits.translate_nocase(__c)); - else - _M_char_set.push_back(_M_traits.translate(__c)); - else - _M_char_set.push_back(__c); - } - - void - _M_add_collating_element(const _StringT& __s) - { - auto __st = _M_traits.lookup_collatename(&*__s.begin(), &*__s.end()); - if (__st.empty()) - __throw_regex_error(regex_constants::error_collate); - // TODO: digraph - _M_char_set.push_back(__st[0]); - } - - void - _M_add_equivalence_class(const _StringT& __s) - { - _M_add_character_class( - _M_traits.transform_primary(&*__s.begin(), &*__s.end())); - } - - void - _M_add_character_class(const _StringT& __s) - { - auto __st = _M_traits. 
- lookup_classname(&*__s.begin(), &*__s.end(), _M_is_icase()); - if (__st == 0) - __throw_regex_error(regex_constants::error_ctype); - _M_class_set |= __st; - } - - void - _M_make_range(_CharT __l, _CharT __r) - { _M_range_set.push_back(make_pair(_M_get_str(__l), _M_get_str(__r))); } - - bool - _M_is_icase() const - { return _M_flags & regex_constants::icase; } - - _StringT - _M_get_str(_CharT __c) const - { - auto __s = _StringT(1, - _M_is_icase() - ? _M_traits.translate_nocase(__c) - : _M_traits.translate(__c)); - return _M_traits.transform(__s.begin(), __s.end()); - } - - _TraitsT _M_traits; - _FlagT _M_flags; - bool _M_is_non_matching; - std::vector<_CharT> _M_char_set; - std::vector<pair<_StringT, _StringT>> _M_range_set; - _CharClassT _M_class_set; - }; - - /** - * @brief struct _Scanner. Scans an input range for regex tokens. - * - * The %_Scanner class interprets the regular expression pattern in - * the input range passed to its constructor as a sequence of parse - * tokens passed to the regular expression compiler. The sequence - * of tokens provided depends on the flag settings passed to the - * constructor: different regular expression grammars will interpret - * the same input pattern in syntactically different ways. - */ - template<typename _InputIter> - class _Scanner - { - public: - typedef unsigned int _StateT; - typedef typename std::iterator_traits<_InputIter>::value_type _CharT; - typedef std::basic_string<_CharT> _StringT; - typedef regex_constants::syntax_option_type _FlagT; - typedef const std::ctype<_CharT> _CtypeT; - - /// Token types returned from the scanner. 
- enum _TokenT - { - _S_token_anychar, - _S_token_backref, - _S_token_bracket_begin, - _S_token_bracket_inverse_begin, - _S_token_bracket_end, - _S_token_char_class_name, - _S_token_closure0, - _S_token_closure1, - _S_token_collelem_multi, - _S_token_collelem_single, - _S_token_collsymbol, - _S_token_comma, - _S_token_dash, - _S_token_dup_count, - _S_token_eof, - _S_token_equiv_class_name, - _S_token_interval_begin, - _S_token_interval_end, - _S_token_line_begin, - _S_token_line_end, - _S_token_opt, - _S_token_or, - _S_token_ord_char, - _S_token_subexpr_begin, - _S_token_subexpr_end, - _S_token_word_begin, - _S_token_word_end, - _S_token_unknown - }; - - _Scanner(_InputIter __begin, _InputIter __end, - _FlagT __flags, std::locale __loc) - : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), - _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(0) - { _M_advance(); } - - void - _M_advance(); - - _TokenT - _M_token() const - { return _M_curToken; } - - const _StringT& - _M_value() const - { return _M_curValue; } - -#ifdef _GLIBCXX_DEBUG - std::ostream& - _M_print(std::ostream&); -#endif - - private: - void - _M_eat_escape(); - - void - _M_scan_in_brace(); - - void - _M_scan_in_bracket(); - - void - _M_eat_charclass(); - - void - _M_eat_equivclass(); - - void - _M_eat_collsymbol(); - - static constexpr _StateT _S_state_in_brace = 1 << 0; - static constexpr _StateT _S_state_in_bracket = 1 << 1; - _InputIter _M_current; - _InputIter _M_end; - _FlagT _M_flags; - _CtypeT& _M_ctype; - _TokenT _M_curToken; - _StringT _M_curValue; - _StateT _M_state; - }; + struct _BracketMatcher; /// Builds an NFA from an input iterator interval. 
- template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> class _Compiler { public: @@ -237,7 +51,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef _NFA<_CharT, _TraitsT> _RegexT; typedef regex_constants::syntax_option_type _FlagT; - _Compiler(_InputIter __b, _InputIter __e, + _Compiler(_FwdIter __b, _FwdIter __e, const _TraitsT& __traits, _FlagT __flags); std::shared_ptr<_RegexT> @@ -245,7 +59,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { return std::shared_ptr<_RegexT>(new _RegexT(_M_state_store)); } private: - typedef _Scanner<_InputIter> _ScannerT; + typedef _Scanner<_FwdIter> _ScannerT; typedef typename _ScannerT::_TokenT _TokenT; typedef _StateSeq<_CharT, _TraitsT> _StateSeqT; typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT; @@ -276,7 +90,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_bracket_expression(); - bool + void _M_bracket_list(_BMatcherT& __matcher); bool @@ -303,14 +117,111 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION int _M_cur_int_value(int __radix); + bool + _M_try_char(); + + _CharT + _M_get_char(); + const _TraitsT& _M_traits; _ScannerT _M_scanner; - _StringT _M_cur_value; + _StringT _M_value; _RegexT _M_state_store; _StackT _M_stack; _FlagT _M_flags; }; + /// Matches a character range (bracket expression) + template<typename _CharT, typename _TraitsT> + struct _BracketMatcher + { + typedef typename _TraitsT::char_class_type _CharClassT; + typedef typename _TraitsT::string_type _StringT; + typedef regex_constants::syntax_option_type _FlagT; + + explicit + _BracketMatcher(bool __is_non_matching, + const _TraitsT& __t, + _FlagT __flags) + : _M_is_non_matching(__is_non_matching), _M_traits(__t), + _M_flags(__flags), _M_class_set(0) + { } + + bool + operator()(_CharT) const; + + void + _M_add_char(_CharT __c) + { _M_char_set.push_back(_M_translate(__c)); } + + void + _M_add_collating_element(const _StringT& __s) + { + auto __st = _M_traits.lookup_collatename(__s.data(), + 
__s.data() + __s.size()); + if (__st.empty()) + __throw_regex_error(regex_constants::error_collate); + // TODO: digraph + _M_char_set.push_back(__st[0]); + } + + void + _M_add_equivalence_class(const _StringT& __s) + { + _M_add_character_class( + _M_traits.transform_primary(__s.data(), + __s.data() + __s.size())); + } + + void + _M_add_character_class(const _StringT& __s) + { + auto __st = _M_traits. + lookup_classname(__s.data(), __s.data() + __s.size(), _M_is_icase()); + if (__st == 0) + __throw_regex_error(regex_constants::error_ctype); + _M_class_set |= __st; + } + + void + _M_make_range(_CharT __l, _CharT __r) + { + _M_range_set.push_back( + make_pair(_M_get_str(_M_translate(__l)), + _M_get_str(_M_translate(__r)))); + } + + _CharT + _M_translate(_CharT __c) const + { + if (_M_flags & regex_constants::collate) + if (_M_is_icase()) + return _M_traits.translate_nocase(__c); + else + return _M_traits.translate(__c); + else + return __c; + } + + bool + _M_is_icase() const + { return _M_flags & regex_constants::icase; } + + _StringT + _M_get_str(_CharT __c) const + { + _StringT __s(1, __c); + return _M_traits.transform(__s.begin(), __s.end()); + } + + _TraitsT _M_traits; + _FlagT _M_flags; + bool _M_is_non_matching; + std::vector<_CharT> _M_char_set; + std::vector<pair<_StringT, _StringT>> _M_range_set; + _CharClassT _M_class_set; + }; + //@} regex-detail _GLIBCXX_END_NAMESPACE_VERSION } // namespace __detail diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 5755c2a0e86..bed091a4486 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -34,506 +34,15 @@ namespace __detail { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_advance() - { - if (_M_current == _M_end) - { - _M_curToken = _S_token_eof; - return; - } - - _CharT __c = *_M_current; - if (_M_state & _S_state_in_bracket) - { - 
_M_scan_in_bracket(); - return; - } - if (_M_state & _S_state_in_brace) - { - _M_scan_in_brace(); - return; - } -#if 0 - // TODO: re-enable line anchors when _M_assertion is implemented. - // See PR libstdc++/47724 - else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) - { - _M_curToken = _S_token_line_begin; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('$')) - { - _M_curToken = _S_token_line_end; - ++_M_current; - return; - } -#endif - else if (__c == _M_ctype.widen('.')) - { - _M_curToken = _S_token_anychar; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('*')) - { - _M_curToken = _S_token_closure0; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('+')) - { - _M_curToken = _S_token_closure1; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('|')) - { - _M_curToken = _S_token_or; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('[')) - { - if (*++_M_current == _M_ctype.widen('^')) - { - _M_curToken = _S_token_bracket_inverse_begin; - ++_M_current; - } - else - _M_curToken = _S_token_bracket_begin; - _M_state |= _S_state_in_bracket; - return; - } - else if (__c == _M_ctype.widen('\\')) - { - _M_eat_escape(); - return; - } - else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) - { - if (__c == _M_ctype.widen('(')) - { - _M_curToken = _S_token_subexpr_begin; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen(')')) - { - _M_curToken = _S_token_subexpr_end; - ++_M_current; - return; - } - else if (__c == _M_ctype.widen('{')) - { - _M_curToken = _S_token_interval_begin; - _M_state |= _S_state_in_brace; - ++_M_current; - return; - } - } - - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - ++_M_current; - } - - template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_scan_in_brace() - { - if (_M_ctype.is(_CtypeT::digit, *_M_current)) - { - _M_curToken = _S_token_dup_count; - _M_curValue.assign(1, *_M_current); - ++_M_current; - 
while (_M_current != _M_end - && _M_ctype.is(_CtypeT::digit, *_M_current)) - { - _M_curValue += *_M_current; - ++_M_current; - } - return; - } - else if (*_M_current == _M_ctype.widen(',')) - { - _M_curToken = _S_token_comma; - ++_M_current; - return; - } - if (_M_flags & (regex_constants::basic | regex_constants::grep)) - { - if (*_M_current == _M_ctype.widen('\\')) - _M_eat_escape(); - } - else - { - if (*_M_current == _M_ctype.widen('}')) - { - _M_curToken = _S_token_interval_end; - _M_state &= ~_S_state_in_brace; - ++_M_current; - return; - } - } - } - - template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_scan_in_bracket() - { - if (*_M_current == _M_ctype.widen('[')) - { - ++_M_current; - if (_M_current == _M_end) - { - _M_curToken = _S_token_eof; - return; - } - - if (*_M_current == _M_ctype.widen('.')) - { - _M_curToken = _S_token_collsymbol; - _M_eat_collsymbol(); - return; - } - else if (*_M_current == _M_ctype.widen(':')) - { - _M_curToken = _S_token_char_class_name; - _M_eat_charclass(); - return; - } - else if (*_M_current == _M_ctype.widen('=')) - { - _M_curToken = _S_token_equiv_class_name; - _M_eat_equivclass(); - return; - } - } - else if (*_M_current == _M_ctype.widen('-')) - { - _M_curToken = _S_token_dash; - ++_M_current; - return; - } - else if (*_M_current == _M_ctype.widen(']')) - { - _M_curToken = _S_token_bracket_end; - _M_state &= ~_S_state_in_bracket; - ++_M_current; - return; - } - else if (*_M_current == _M_ctype.widen('\\')) - { - _M_eat_escape(); - return; - } - _M_curToken = _S_token_collelem_single; - _M_curValue.assign(1, *_M_current); - ++_M_current; - } - - // TODO Complete it. 
- template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_eat_escape() - { - ++_M_current; - if (_M_current == _M_end) - { - _M_curToken = _S_token_eof; - return; - } - _CharT __c = *_M_current; - ++_M_current; - - if (__c == _M_ctype.widen('(')) - { - if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - } - else - _M_curToken = _S_token_subexpr_begin; - } - else if (__c == _M_ctype.widen(')')) - { - if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - } - else - _M_curToken = _S_token_subexpr_end; - } - else if (__c == _M_ctype.widen('{')) - { - if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - } - else - { - _M_curToken = _S_token_interval_begin; - _M_state |= _S_state_in_brace; - } - } - else if (__c == _M_ctype.widen('}')) - { - if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - } - else - { - if (!(_M_state && _S_state_in_brace)) - __throw_regex_error(regex_constants::error_badbrace); - _M_state &= ~_S_state_in_brace; - _M_curToken = _S_token_interval_end; - } - } - else if (__c == _M_ctype.widen('x')) - { - ++_M_current; - if (_M_current == _M_end) - { - _M_curToken = _S_token_eof; - return; - } - if (_M_ctype.is(_CtypeT::digit, *_M_current)) - { - _M_curValue.assign(1, *_M_current); - ++_M_current; - if (_M_current == _M_end) - { - _M_curToken = _S_token_eof; - return; - } - if (_M_ctype.is(_CtypeT::digit, *_M_current)) - { - _M_curValue += *_M_current; - ++_M_current; - return; - } - } - } - else if (__c == _M_ctype.widen('^') - || __c == _M_ctype.widen('.') - || __c == _M_ctype.widen('*') - || __c == _M_ctype.widen('$') - || __c == _M_ctype.widen('\\')) - { - _M_curToken = _S_token_ord_char; - 
_M_curValue.assign(1, __c); - } - else if (_M_ctype.is(_CtypeT::digit, __c)) - { - _M_curToken = _S_token_backref; - _M_curValue.assign(1, __c); - } - else if (_M_state & _S_state_in_bracket) - { - if (__c == _M_ctype.widen('-') - || __c == _M_ctype.widen('[') - || __c == _M_ctype.widen(']')) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, __c); - } - else if ((_M_flags & regex_constants::ECMAScript) - && __c == _M_ctype.widen('b')) - { - _M_curToken = _S_token_ord_char; - _M_curValue.assign(1, _M_ctype.widen(' ')); - } - else - __throw_regex_error(regex_constants::error_escape); - } - else - __throw_regex_error(regex_constants::error_escape); - } - - // Eats a character class or throwns an exception. - // current point to ':' delimiter on entry, char after ']' on return - template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_eat_charclass() - { - ++_M_current; // skip ':' - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_ctype); - for (_M_curValue.clear(); - _M_current != _M_end && *_M_current != _M_ctype.widen(':'); - ++_M_current) - _M_curValue += *_M_current; - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_ctype); - ++_M_current; // skip ':' - if (*_M_current != _M_ctype.widen(']')) - __throw_regex_error(regex_constants::error_ctype); - ++_M_current; // skip ']' - } - - - template<typename _BiIter> - void - _Scanner<_BiIter>:: - _M_eat_equivclass() - { - ++_M_current; // skip '=' - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_collate); - for (_M_curValue.clear(); - _M_current != _M_end && *_M_current != _M_ctype.widen('='); - ++_M_current) - _M_curValue += *_M_current; - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_collate); - ++_M_current; // skip '=' - if (*_M_current != _M_ctype.widen(']')) - __throw_regex_error(regex_constants::error_collate); - ++_M_current; // skip ']' - } - - - template<typename _BiIter> - void - _Scanner<_BiIter>:: - 
_M_eat_collsymbol() - { - ++_M_current; // skip '.' - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_collate); - for (_M_curValue.clear(); - _M_current != _M_end && *_M_current != _M_ctype.widen('.'); - ++_M_current) - _M_curValue += *_M_current; - if (_M_current == _M_end) - __throw_regex_error(regex_constants::error_collate); - ++_M_current; // skip '.' - if (*_M_current != _M_ctype.widen(']')) - __throw_regex_error(regex_constants::error_collate); - ++_M_current; // skip ']' - } - -#ifdef _GLIBCXX_DEBUG - template<typename _BiIter> - std::ostream& - _Scanner<_BiIter>:: - _M_print(std::ostream& ostr) - { - switch (_M_curToken) - { - case _S_token_anychar: - ostr << "any-character\n"; - break; - case _S_token_backref: - ostr << "backref\n"; - break; - case _S_token_bracket_begin: - ostr << "bracket-begin\n"; - break; - case _S_token_bracket_inverse_begin: - ostr << "bracket-inverse-begin\n"; - break; - case _S_token_bracket_end: - ostr << "bracket-end\n"; - break; - case _S_token_char_class_name: - ostr << "char-class-name \"" << _M_curValue << "\"\n"; - break; - case _S_token_closure0: - ostr << "closure0\n"; - break; - case _S_token_closure1: - ostr << "closure1\n"; - break; - case _S_token_collelem_multi: - ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; - break; - case _S_token_collelem_single: - ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; - break; - case _S_token_collsymbol: - ostr << "collsymbol \"" << _M_curValue << "\"\n"; - break; - case _S_token_comma: - ostr << "comma\n"; - break; - case _S_token_dash: - ostr << "dash\n"; - break; - case _S_token_dup_count: - ostr << "dup count: " << _M_curValue << "\n"; - break; - case _S_token_eof: - ostr << "EOF\n"; - break; - case _S_token_equiv_class_name: - ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; - break; - case _S_token_interval_begin: - ostr << "interval begin\n"; - break; - case _S_token_interval_end: - ostr << "interval end\n"; - break; - case 
_S_token_line_begin: - ostr << "line begin\n"; - break; - case _S_token_line_end: - ostr << "line end\n"; - break; - case _S_token_opt: - ostr << "opt\n"; - break; - case _S_token_or: - ostr << "or\n"; - break; - case _S_token_ord_char: - ostr << "ordinary character: \"" << _M_value() << "\"\n"; - break; - case _S_token_subexpr_begin: - ostr << "subexpr begin\n"; - break; - case _S_token_subexpr_end: - ostr << "subexpr end\n"; - break; - case _S_token_word_begin: - ostr << "word begin\n"; - break; - case _S_token_word_end: - ostr << "word end\n"; - break; - case _S_token_unknown: - ostr << "-- unknown token --\n"; - break; - default: - _GLIBCXX_DEBUG_ASSERT(false); - } - return ostr; - } -#endif - - template<typename _InputIter, typename _CharT, typename _TraitsT> - _Compiler<_InputIter, _CharT, _TraitsT>:: - _Compiler(_InputIter __b, _InputIter __e, + template<typename _FwdIter, typename _CharT, typename _TraitsT> + _Compiler<_FwdIter, _CharT, _TraitsT>:: + _Compiler(_FwdIter __b, _FwdIter __e, const _TraitsT& __traits, _FlagT __flags) : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), _M_state_store(__flags), _M_flags(__flags) { _StateSeqT __r(_M_state_store, - _M_state_store._M_insert_subexpr_begin()); + _M_state_store._M_insert_subexpr_begin()); _M_disjunction(); if (!_M_stack.empty()) { @@ -544,23 +53,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __r._M_append(_M_state_store._M_insert_accept()); } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> bool - _Compiler<_InputIter, _CharT, _TraitsT>:: - _M_match_token(_Compiler<_InputIter, _CharT, _TraitsT>::_TokenT token) + _Compiler<_FwdIter, _CharT, _TraitsT>:: + _M_match_token(_TokenT token) { - if (token == _M_scanner._M_token()) + if (token == _M_scanner._M_get_token()) { - _M_cur_value = _M_scanner._M_value(); + _M_value = _M_scanner._M_get_value(); _M_scanner._M_advance(); return true; } return false; } - 
template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> void - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_disjunction() { this->_M_alternative(); @@ -573,9 +82,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> void - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_alternative() { if (this->_M_term()) @@ -591,9 +100,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> bool - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_term() { if (this->_M_assertion()) @@ -606,37 +115,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - template<typename _InputIter, typename _CharT, typename _TraitsT> + // TODO Implement it. 
+ template<typename _FwdIter, typename _CharT, typename _TraitsT> bool - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_assertion() { - if (_M_match_token(_ScannerT::_S_token_line_begin)) - { - // __m.push(_Matcher::_S_opcode_line_begin); - return true; - } - if (_M_match_token(_ScannerT::_S_token_line_end)) - { - // __m.push(_Matcher::_S_opcode_line_end); - return true; - } - if (_M_match_token(_ScannerT::_S_token_word_begin)) - { - // __m.push(_Matcher::_S_opcode_word_begin); - return true; - } - if (_M_match_token(_ScannerT::_S_token_word_end)) - { - // __m.push(_Matcher::_S_opcode_word_end); - return true; - } return false; } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> void - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_quantifier() { if (_M_match_token(_ScannerT::_S_token_closure0)) @@ -707,15 +197,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> bool - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_atom() { if (_M_match_token(_ScannerT::_S_token_anychar)) { const static auto& - __any_matcher = [](_CharT) -> bool + __any_matcher = [](_CharT __ch) -> bool { return true; }; _M_stack.push(_StateSeqT(_M_state_store, @@ -723,9 +213,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION (__any_matcher))); return true; } - if (_M_match_token(_ScannerT::_S_token_ord_char)) + if (_M_try_char()) { - auto __c = _M_cur_value[0]; + _CharT __c = _M_value[0]; __detail::_Matcher<_CharT> f; if (_M_flags & regex_constants::icase) { @@ -744,7 +234,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } if (_M_match_token(_ScannerT::_S_token_backref)) { - // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); _M_stack.push(_StateSeqT(_M_state_store, _M_state_store. 
_M_insert_backref(_M_cur_int_value(10)))); return true; @@ -770,90 +259,111 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return _M_bracket_expression(); } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> bool - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_bracket_expression() { - bool __inverse = - _M_match_token(_ScannerT::_S_token_bracket_inverse_begin); - if (!(__inverse || _M_match_token(_ScannerT::_S_token_bracket_begin))) + bool __neg = + _M_match_token(_ScannerT::_S_token_bracket_neg_begin); + if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) return false; - _BMatcherT __matcher( __inverse, _M_traits, _M_flags); - // special case: only if _not_ chr first after - // '[' or '[^' or if ECMAscript - if (!_M_bracket_list(__matcher) // list is empty - && !(_M_flags & regex_constants::ECMAScript)) - __throw_regex_error(regex_constants::error_brack); + _BMatcherT __matcher(__neg, _M_traits, _M_flags); + _M_bracket_list(__matcher); _M_stack.push(_StateSeqT(_M_state_store, _M_state_store._M_insert_matcher(__matcher))); return true; } - template<typename _InputIter, typename _CharT, typename _TraitsT> - bool // list is non-empty - _Compiler<_InputIter, _CharT, _TraitsT>:: + template<typename _FwdIter, typename _CharT, typename _TraitsT> + void + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_bracket_list(_BMatcherT& __matcher) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) - return false; + return; _M_expression_term(__matcher); _M_bracket_list(__matcher); - return true; + return; } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> void - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_expression_term(_BMatcherT& __matcher) { if (_M_match_token(_ScannerT::_S_token_collsymbol)) { - 
__matcher._M_add_collating_element(_M_cur_value); + __matcher._M_add_collating_element(_M_value); return; } if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) { - __matcher._M_add_equivalence_class(_M_cur_value); + __matcher._M_add_equivalence_class(_M_value); return; } if (_M_match_token(_ScannerT::_S_token_char_class_name)) { - __matcher._M_add_character_class(_M_cur_value); + __matcher._M_add_character_class(_M_value); return; } - if (_M_match_token(_ScannerT::_S_token_collelem_single)) // [a + if (_M_try_char()) // [a { - auto __ch = _M_cur_value[0]; - if (_M_match_token(_ScannerT::_S_token_dash)) // [a- + auto __ch = _M_value[0]; + if (_M_try_char()) { - // If the dash is the last character in the bracket expression, - // it is not special. - if (_M_scanner._M_token() == _ScannerT::_S_token_bracket_end) - __matcher._M_add_char(_M_cur_value[0]); // [a-] <=> [a\-] - else // [a-z] + if (_M_value[0] == std::use_facet<std::ctype<_CharT>> + (_M_traits.getloc()).widen('-')) // [a- { - if (!_M_match_token(_ScannerT::_S_token_collelem_single)) + if (_M_try_char()) // [a-z] + { + __matcher._M_make_range(__ch, _M_value[0]); + return; + } + // If the dash is the last character in the bracket + // expression, it is not special. 
+ if (_M_scanner._M_get_token() + != _ScannerT::_S_token_bracket_end) __throw_regex_error(regex_constants::error_range); - __matcher._M_make_range(__ch, _M_cur_value[0]); } + __matcher._M_add_char(_M_value[0]); } - else // [a] - __matcher._M_add_char(__ch); + __matcher._M_add_char(__ch); return; } __throw_regex_error(regex_constants::error_brack); } - template<typename _InputIter, typename _CharT, typename _TraitsT> + template<typename _FwdIter, typename _CharT, typename _TraitsT> + bool + _Compiler<_FwdIter, _CharT, _TraitsT>:: + _M_try_char() + { + bool __is_char = false; + if (_M_match_token(_ScannerT::_S_token_oct_num)) + { + __is_char = true; + _M_value.assign(1, _M_cur_int_value(8)); + } + else if (_M_match_token(_ScannerT::_S_token_hex_num)) + { + __is_char = true; + _M_value.assign(1, _M_cur_int_value(16)); + } + else if (_M_match_token(_ScannerT::_S_token_ord_char)) + __is_char = true; + return __is_char; + } + + template<typename _FwdIter, typename _CharT, typename _TraitsT> int - _Compiler<_InputIter, _CharT, _TraitsT>:: + _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_cur_int_value(int __radix) { int __v = 0; for (typename _StringT::size_type __i = 0; - __i < _M_cur_value.length(); ++__i) - __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); + __i < _M_value.length(); ++__i) + __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); return __v; } @@ -861,35 +371,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _BracketMatcher<_CharT, _TraitsT>:: operator()(_CharT __ch) const { - auto __oldch = __ch; - if (_M_flags & regex_constants::collate) - if (_M_is_icase()) - __ch = _M_traits.translate_nocase(__ch); - else - __ch = _M_traits.translate(__ch); - bool __ret = false; - for (auto __c : _M_char_set) - if (__c == __ch) - { - __ret = true; - break; - } - if (!__ret && _M_traits.isctype(__oldch, _M_class_set)) + if (_M_traits.isctype(__ch, _M_class_set)) __ret = true; else { - _StringT __s = _M_get_str(__ch); - for (auto& __it : _M_range_set) - if 
(__it.first <= __s && __s <= __it.second) + __ch = _M_translate(__ch); + + for (auto __c : _M_char_set) + if (__c == __ch) { __ret = true; break; } + if (!__ret) + { + _StringT __s = _M_get_str(__ch); + for (auto& __it : _M_range_set) + if (__it.first <= __s && __s <= __it.second) + { + __ret = true; + break; + } + } } if (_M_is_non_matching) - __ret = !__ret; - return __ret; + return !__ret; + else + return __ret; } _GLIBCXX_END_NAMESPACE_VERSION diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index bc99331ce8e..edfd0b649ff 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -260,7 +260,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __size = __u.size(); for (auto __i = 0; __i < __size; __i++) { - auto& __uit = __u[__i], __vit = __v[__i]; + auto __uit = __u[__i], __vit = __v[__i]; if (__uit.matched && !__vit.matched) return true; if (!__uit.matched && __vit.matched) diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h new file mode 100644 index 00000000000..080ef635b0c --- /dev/null +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -0,0 +1,194 @@ +// class template regex -*- C++ -*- + +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+ +// Under Section 7 of GPL version 3, you are granted additional +// permissions described in the GCC Runtime Library Exception, version +// 3.1, as published by the Free Software Foundation. + +// You should have received a copy of the GNU General Public License and +// a copy of the GCC Runtime Library Exception along with this program; +// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +// <http://www.gnu.org/licenses/>. + +/** + * @file bits/regex_scanner.h + * This is an internal header file, included by other library headers. + * Do not attempt to use it directly. @headername{regex} + */ + +namespace std _GLIBCXX_VISIBILITY(default) +{ +namespace __detail +{ +_GLIBCXX_BEGIN_NAMESPACE_VERSION + + /** + * @addtogroup regex-detail + * @{ + */ + + /** + * @brief struct _Scanner. Scans an input range for regex tokens. + * + * The %_Scanner class interprets the regular expression pattern in + * the input range passed to its constructor as a sequence of parse + * tokens passed to the regular expression compiler. The sequence + * of tokens provided depends on the flag settings passed to the + * constructor: different regular expression grammars will interpret + * the same input pattern in syntactically different ways. + */ + template<typename _FwdIter> + class _Scanner + { + public: + typedef typename std::iterator_traits<_FwdIter>::value_type _CharT; + typedef std::basic_string<_CharT> _StringT; + typedef regex_constants::syntax_option_type _FlagT; + typedef const std::ctype<_CharT> _CtypeT; + + /// Token types returned from the scanner. 
+ enum _TokenT + { + _S_token_anychar, + _S_token_ord_char, + _S_token_oct_num, + _S_token_hex_num, + _S_token_backref, + _S_token_subexpr_begin, + _S_token_subexpr_no_group_begin, + _S_token_subexpr_lookahead_begin, + _S_token_subexpr_neg_lookahead_begin, + _S_token_subexpr_end, + _S_token_bracket_begin, + _S_token_bracket_neg_begin, + _S_token_bracket_end, + _S_token_interval_begin, + _S_token_interval_end, + _S_token_quoted_class, + _S_token_char_class_name, + _S_token_collsymbol, + _S_token_equiv_class_name, + _S_token_opt, + _S_token_or, + _S_token_closure0, + _S_token_closure1, + _S_token_line_begin, + _S_token_line_end, + _S_token_comma, + _S_token_dup_count, + _S_token_eof, + _S_token_unknown + }; + + _Scanner(_FwdIter __begin, _FwdIter __end, + _FlagT __flags, std::locale __loc); + + void + _M_advance(); + + _TokenT + _M_get_token() const + { return _M_token; } + + const _StringT& + _M_get_value() const + { return _M_value; } + +#ifdef _GLIBCXX_DEBUG + std::ostream& + _M_print(std::ostream&); +#endif + + private: + enum _StateT + { + _S_state_normal, + _S_state_in_brace, + _S_state_in_bracket, + }; + + void + _M_scan_normal(); + + void + _M_scan_in_bracket(); + + void + _M_scan_in_brace(); + + void + _M_eat_escape_ecma(); + + void + _M_eat_escape_posix(); + + void + _M_eat_escape_awk(); + + void + _M_eat_class(char); + + constexpr bool + _M_is_ecma() + { return _M_flags & regex_constants::ECMAScript; } + + constexpr bool + _M_is_basic() + { return _M_flags & (regex_constants::basic | regex_constants::grep); } + + constexpr bool + _M_is_extended() + { + return _M_flags & (regex_constants::extended + | regex_constants::egrep + | regex_constants::awk); + } + + constexpr bool + _M_is_grep() + { return _M_flags & (regex_constants::grep | regex_constants::egrep); } + + constexpr bool + _M_is_awk() + { return _M_flags & regex_constants::awk; } + + _StateT _M_state; + _FwdIter _M_current; + _FwdIter _M_end; + _FlagT _M_flags; + _CtypeT& _M_ctype; + _TokenT 
_M_token; + _StringT _M_value; + bool _M_at_bracket_start; + public: + // TODO: make them static when this file is stable. + const std::map<char, _TokenT> _M_token_map; + const std::map<char, char> _M_ecma_escape_map; + const std::map<char, char> _M_awk_escape_map; + const std::set<char> _M_ecma_spec_char; + const std::set<char> _M_basic_spec_char; + const std::set<char> _M_extended_spec_char; + + const std::map<char, char>& _M_escape_map; + const std::set<char>& _M_spec_char; + void (_Scanner::* _M_eat_escape)(); + }; + + //@} regex-detail +_GLIBCXX_END_NAMESPACE_VERSION +} // namespace __detail +} // namespace std + +#include <bits/regex_scanner.tcc> diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc new file mode 100644 index 00000000000..0d1d2cd9778 --- /dev/null +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -0,0 +1,609 @@ +// class template regex -*- C++ -*- + +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// Under Section 7 of GPL version 3, you are granted additional +// permissions described in the GCC Runtime Library Exception, version +// 3.1, as published by the Free Software Foundation. + +// You should have received a copy of the GNU General Public License and +// a copy of the GCC Runtime Library Exception along with this program; +// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +// <http://www.gnu.org/licenses/>. 
+ +/** + * @file bits/regex_scanner.tcc + * This is an internal header file, included by other library headers. + * Do not attempt to use it directly. @headername{regex} + */ + +// TODO make comments doxygen format + +// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep +// and awk +// 1) grep is basic except '\n' is treated as '|' +// 2) egrep is extended except '\n' is treated as '|' +// 3) awk is extended except special escaping rules, and there's no +// back-reference. +// +// References: +// +// ECMAScript: ECMA-262 15.10 +// +// basic, extended: +// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html +// +// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html + +namespace std _GLIBCXX_VISIBILITY(default) +{ +namespace __detail +{ +_GLIBCXX_BEGIN_NAMESPACE_VERSION + + template<typename _FwdIter> + _Scanner<_FwdIter>:: + _Scanner(_FwdIter __begin, _FwdIter __end, + _FlagT __flags, std::locale __loc) + : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), + _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_normal), + _M_at_bracket_start(false), + _M_token_map + { + {'^', _S_token_line_begin}, + {'$', _S_token_line_end}, + {'.', _S_token_anychar}, + {'*', _S_token_closure0}, + {'+', _S_token_closure1}, + {'?', _S_token_opt}, + {'|', _S_token_or}, + // grep and egrep + {'\n', _S_token_or}, + }, + _M_ecma_escape_map + { + {'0', '\0'}, + {'b', '\b'}, + {'f', '\f'}, + {'n', '\n'}, + {'r', '\r'}, + {'t', '\t'}, + {'v', '\v'}, + }, + _M_awk_escape_map + { + {'"', '"'}, + {'/', '/'}, + {'\\', '\\'}, + {'a', '\a'}, + {'b', '\b'}, + {'f', '\f'}, + {'n', '\n'}, + {'r', '\r'}, + {'t', '\t'}, + {'v', '\v'}, + }, + _M_escape_map(_M_is_ecma() + ? 
_M_ecma_escape_map + : _M_awk_escape_map), + _M_ecma_spec_char + { + '^', + '$', + '\\', + '.', + '*', + '+', + '?', + '(', + ')', + '[', + ']', + '{', + '}', + '|', + }, + _M_basic_spec_char + { + '.', + '[', + '\\', + '*', + '^', + '$', + }, + _M_extended_spec_char + { + '.', + '[', + '\\', + '(', + ')', + '*', + '+', + '?', + '{', + '|', + '^', + '$', + }, + _M_eat_escape(_M_is_ecma() + ? &_Scanner::_M_eat_escape_ecma + : &_Scanner::_M_eat_escape_posix), + _M_spec_char(_M_is_ecma() + ? _M_ecma_spec_char + : _M_is_basic() + ? _M_basic_spec_char + : _M_extended_spec_char) + { _M_advance(); } + + template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_advance() + { + if (_M_current == _M_end) + { + _M_token = _S_token_eof; + return; + } + + if (_M_state == _S_state_normal) + _M_scan_normal(); + else if (_M_state == _S_state_in_bracket) + _M_scan_in_bracket(); + else if (_M_state == _S_state_in_brace) + _M_scan_in_brace(); + else + _GLIBCXX_DEBUG_ASSERT(false); + } + + // Differences between styles: + // 1) "\(", "\)", "\{" in basic. It's not escaping. + // 2) "(?:", "(?=", "(?!" in ECMAScript. 
+ template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_scan_normal() + { + auto __c = *_M_current++; + + if (__c == '\\') + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_escape); + + if (!_M_is_basic() + || (*_M_current != '(' + && *_M_current != ')' + && *_M_current != '{')) + { + (this->*_M_eat_escape)(); + return; + } + __c = *_M_current++; + } + if (__c == '(') + { + if (_M_is_ecma() && *_M_current == '?') + { + if (++_M_current == _M_end) + __throw_regex_error(regex_constants::error_paren); + + if (*_M_current == ':') + { + ++_M_current; + _M_token = _S_token_subexpr_no_group_begin; + } + else if (*_M_current == '=') + { + ++_M_current; + _M_token = _S_token_subexpr_lookahead_begin; + } + else if (*_M_current == '!') + { + ++_M_current; + _M_token = _S_token_subexpr_neg_lookahead_begin; + } + else + __throw_regex_error(regex_constants::error_paren); + } + else + _M_token = _S_token_subexpr_begin; + } + else if (__c == ')') + _M_token = _S_token_subexpr_end; + else if (__c == '[') + { + _M_state = _S_state_in_bracket; + _M_at_bracket_start = true; + if (_M_current != _M_end && *_M_current == '^') + { + _M_token = _S_token_bracket_neg_begin; + ++_M_current; + } + else + _M_token = _S_token_bracket_begin; + } + else if (__c == '{') + { + _M_state = _S_state_in_brace; + _M_token = _S_token_interval_begin; + } + else if (_M_spec_char.count(__c) + && __c != ']' + && __c != '}' + || (_M_is_grep() && __c == '\n')) + _M_token = _M_token_map.at(__c); + else + { + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } + } + + // Differences between styles: + // 1) different semantics of "[]" and "[^]". + // 2) Escaping in bracket expr. 
+ template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_scan_in_bracket() + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_brack); + + auto __c = *_M_current++; + + if (__c == '[') + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_brack); + + if (*_M_current == '.') + { + _M_token = _S_token_collsymbol; + _M_eat_class(*_M_current++); + } + else if (*_M_current == ':') + { + _M_token = _S_token_char_class_name; + _M_eat_class(*_M_current++); + } + else if (*_M_current == '=') + { + _M_token = _S_token_equiv_class_name; + _M_eat_class(*_M_current++); + } + else + { + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } + } + // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted + // literally. So "[]]" or "[^]]" is valid regex. See the testcases + // `*/empty_range.cc`. + else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) + { + _M_token = _S_token_bracket_end; + _M_state = _S_state_normal; + } + // ECMAScirpt and awk permmits escaping in bracket. + else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) + (this->*_M_eat_escape)(); + else + { + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } + _M_at_bracket_start = false; + } + + // Differences between styles: + // 1) "\}" in basic style. + template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_scan_in_brace() + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_brace); + + auto __c = *_M_current++; + + if (_M_ctype.is(_CtypeT::digit, __c)) + { + _M_token = _S_token_dup_count; + _M_value.assign(1, __c); + while (_M_current != _M_end + && _M_ctype.is(_CtypeT::digit, *_M_current)) + _M_value += *_M_current++; + } + else if (__c == ',') + _M_token = _S_token_comma; + // basic use \}. 
+ else if (_M_is_basic()) + { + if (__c == '\\' && _M_current != _M_end && *_M_current == '}') + { + _M_state = _S_state_normal; + _M_token = _S_token_interval_end; + ++_M_current; + } + else + __throw_regex_error(regex_constants::error_brace); + } + else if (__c == '}') + { + _M_state = _S_state_normal; + _M_token = _S_token_interval_end; + } + else + __throw_regex_error(regex_constants::error_brace); + } + + template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_eat_escape_ecma() + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_escape); + + auto __c = *_M_current++; + + if (_M_escape_map.count(__c) + && (__c != 'b' || _M_state == _S_state_in_bracket)) + { + _M_token = _S_token_ord_char; + _M_value.assign(1, _M_escape_map.at(__c)); + } + // N3376 28.13 + else if (__c == 'b' + || __c == 'B' + || __c == 'd' + || __c == 'D' + || __c == 's' + || __c == 'S' + || __c == 'w' + || __c == 'W') + { + _M_token = _S_token_quoted_class; + _M_value.assign(1, __c); + } + else if (__c == 'c') + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_escape); + _M_token = _S_token_ord_char; + _M_value.assign(1, *_M_current++); + } + else if (__c == 'x' || __c == 'u') + { + _M_value.erase(); + for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) + { + if (_M_current == _M_end + || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) + __throw_regex_error(regex_constants::error_escape); + _M_value += *_M_current++; + } + _M_token = _S_token_hex_num; + } + // ECMAScript recongnizes multi-digit back-references. 
+ else if (_M_ctype.is(_CtypeT::digit, __c)) + { + _M_value.assign(1, __c); + while (_M_current != _M_end + && _M_ctype.is(_CtypeT::digit, *_M_current)) + _M_value += *_M_current++; + _M_token = _S_token_backref; + } + else + { + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } + } + + template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_eat_escape_posix() + { + if (_M_current == _M_end) + __throw_regex_error(regex_constants::error_escape); + + auto __c = *_M_current; + + if (_M_spec_char.count(__c)) + { + _M_token = _S_token_ord_char; + _M_value.assign(1, __c); + } + // We MUST judge awk before handling backrefs. There's no backref in awk. + else if (_M_is_awk()) + { + _M_eat_escape_awk(); + return; + } + else if (_M_ctype.is(_CtypeT::digit, __c) && __c != '0') + { + _M_token = _S_token_backref; + _M_value.assign(1, __c); + } + else + __throw_regex_error(regex_constants::error_escape); + ++_M_current; + } + + template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_eat_escape_awk() + { + auto __c = *_M_current++; + + if (_M_escape_map.count(__c)) + { + _M_token = _S_token_ord_char; + _M_value.assign(1, _M_escape_map.at(__c)); + } + // \ddd for oct representation + else if (_M_ctype.is(_CtypeT::digit, __c) + && __c != '8' + && __c != '9') + { + _M_value.assign(1, __c); + for (int __i = 0; + __i < 2 + && _M_current != _M_end + && _M_ctype.is(_CtypeT::digit, *_M_current) + && *_M_current != '8' + && *_M_current != '9'; + __i++) + _M_value += *_M_current++; + _M_token = _S_token_oct_num; + return; + } + else + __throw_regex_error(regex_constants::error_escape); + } + + // Eats a character class or throwns an exception. + // __ch cound be ':', '.' or '=', _M_current is the char after ']' when + // returning. 
+ template<typename _FwdIter> + void + _Scanner<_FwdIter>:: + _M_eat_class(char __ch) + { + for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) + _M_value += *_M_current++; + if (_M_current == _M_end + || *_M_current++ != __ch + || _M_current == _M_end // skip __ch + || *_M_current++ != ']') // skip ']' + if (__ch == ':') + __throw_regex_error(regex_constants::error_ctype); + else + __throw_regex_error(regex_constants::error_collate); + } + +#ifdef _GLIBCXX_DEBUG + template<typename _FwdIter> + std::ostream& + _Scanner<_FwdIter>:: + _M_print(std::ostream& ostr) + { + switch (_M_token) + { + case _S_token_anychar: + ostr << "any-character\n"; + break; + case _S_token_backref: + ostr << "backref\n"; + break; + case _S_token_bracket_begin: + ostr << "bracket-begin\n"; + break; + case _S_token_bracket_neg_begin: + ostr << "bracket-neg-begin\n"; + break; + case _S_token_bracket_end: + ostr << "bracket-end\n"; + break; + case _S_token_char_class_name: + ostr << "char-class-name \"" << _M_value << "\"\n"; + break; + case _S_token_closure0: + ostr << "closure0\n"; + break; + case _S_token_closure1: + ostr << "closure1\n"; + break; + case _S_token_collsymbol: + ostr << "collsymbol \"" << _M_value << "\"\n"; + break; + case _S_token_comma: + ostr << "comma\n"; + break; + case _S_token_dup_count: + ostr << "dup count: " << _M_value << "\n"; + break; + case _S_token_eof: + ostr << "EOF\n"; + break; + case _S_token_equiv_class_name: + ostr << "equiv-class-name \"" << _M_value << "\"\n"; + break; + case _S_token_interval_begin: + ostr << "interval begin\n"; + break; + case _S_token_interval_end: + ostr << "interval end\n"; + break; + case _S_token_line_begin: + ostr << "line begin\n"; + break; + case _S_token_line_end: + ostr << "line end\n"; + break; + case _S_token_opt: + ostr << "opt\n"; + break; + case _S_token_or: + ostr << "or\n"; + break; + case _S_token_ord_char: + ostr << "ordinary character: \"" << _M_value << "\"\n"; + break; + case 
_S_token_subexpr_begin: + ostr << "subexpr begin\n"; + break; + case _S_token_subexpr_no_group_begin: + ostr << "no grouping subexpr begin\n"; + break; + case _S_token_subexpr_lookahead_begin: + ostr << "lookahead subexpr begin\n"; + break; + case _S_token_subexpr_neg_lookahead_begin: + ostr << "neg lookahead subexpr begin\n"; + break; + case _S_token_subexpr_end: + ostr << "subexpr end\n"; + break; + case _S_token_unknown: + ostr << "-- unknown token --\n"; + break; + case _S_token_oct_num: + ostr << "oct number " << _M_value << "\n"; + break; + case _S_token_hex_num: + ostr << "hex number " << _M_value << "\n"; + break; + case _S_token_quoted_class: + ostr << "quoted class " << "\\" << _M_value << "\n"; + break; + default: + _GLIBCXX_DEBUG_ASSERT(false); + } + return ostr; + } +#endif + +_GLIBCXX_END_NAMESPACE_VERSION +} // namespace __detail +} // namespace diff --git a/libstdc++-v3/include/std/regex b/libstdc++-v3/include/std/regex index ac9a2a85b9b..36dd0a97b8f 100644 --- a/libstdc++-v3/include/std/regex +++ b/libstdc++-v3/include/std/regex @@ -56,6 +56,7 @@ #include <bits/regex_constants.h> #include <bits/regex_error.h> +#include <bits/regex_scanner.h> #include <bits/regex_automaton.h> #include <bits/regex_compiler.h> #include <bits/regex_executor.h> |