diff options
author | timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-09-18 15:56:20 +0000 |
---|---|---|
committer | timshen <timshen@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-09-18 15:56:20 +0000 |
commit | a504122233268b6ce669a07c2f83cd5e9a6cd084 (patch) | |
tree | db5788d9dc9a22dd1aeac31c52fb72395dd7756d /libstdc++-v3/include/bits | |
parent | 803ed83f4ecf58326158d384d0c00fe8f50e00e5 (diff) | |
download | ppe42-gcc-a504122233268b6ce669a07c2f83cd5e9a6cd084.tar.gz ppe42-gcc-a504122233268b6ce669a07c2f83cd5e9a6cd084.zip |
2013-09-18 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h: Add friend classes.
(match_results<>::position, regex_iterator<>::operator++):
Implement position specification in regex_iterator.
(regex_match<>, regex_search<>):
Move match_results initializations to these function. Remove `todo`.
* include/bits/regex_compiler.tcc:
(_Compiler<>::_M_quantifier): Fix greedy/ungreedy of interval matching.
* include/bits/regex_constants.h:
Fix indentation. Change match_flag_type to enum type.
* include/bits/regex_executor.h:
Merge identical code to the base class _Executor.
Support flags in regex_constants.
* include/bits/regex_executor.tcc: Likewise.
* include/bits/regex_scanner.h: Add comments.
* include/bits/regex_scanner.tcc: Same.
* testsuite/28_regex/algorithms/regex_search/ecma/assertion.cc:
Add a testcase.
* testsuite/28_regex/algorithms/regex_search/ecma/flags.cc: New.
* testsuite/28_regex/iterators/regex_iterator/char/
string_position_01.cc: Remove `xfail`.
* testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc:
Remove `xfail` and make the case really work.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@202706 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libstdc++-v3/include/bits')
-rw-r--r-- | libstdc++-v3/include/bits/regex.h | 63 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_compiler.tcc | 21 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_constants.h | 274 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.h | 282 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_executor.tcc | 238 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_scanner.h | 6 | ||||
-rw-r--r-- | libstdc++-v3/include/bits/regex_scanner.tcc | 2 |
7 files changed, 504 insertions, 382 deletions
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 659bee13120..9d1438aab23 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -1004,6 +1004,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const basic_regex<_Cp, _Rp>&, regex_constants::match_flag_type); + template<typename, typename, typename, typename> + friend class __detail::_Executor; + + template<typename, typename, typename, typename> + friend class __detail::_DFSExecutor; + + template<typename, typename, typename, typename> + friend class __detail::_BFSExecutor; + flag_type _M_flags; _Rx_traits _M_traits; _AutomatonPtr _M_automaton; @@ -1783,21 +1792,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION */ explicit match_results(const _Alloc& __a = _Alloc()) - : _Base_type(__a) + : _Base_type(__a), _M_in_iterator(false) { } /** * @brief Copy constructs a %match_results. */ match_results(const match_results& __rhs) - : _Base_type(__rhs) + : _Base_type(__rhs), _M_in_iterator(false) { } /** * @brief Move constructs a %match_results. */ match_results(match_results&& __rhs) noexcept - : _Base_type(std::move(__rhs)) + : _Base_type(std::move(__rhs)), _M_in_iterator(false) { } /** @@ -1905,8 +1914,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION difference_type position(size_type __sub = 0) const { - return __sub < size() ? std::distance(this->prefix().first, - (*this)[__sub].first) : -1; + // [28.12.1.4.5] + if (_M_in_iterator) + return __sub < size() ? std::distance(_M_begin, + (*this)[__sub].first) : -1; + else + return __sub < size() ? std::distance(this->prefix().first, + (*this)[__sub].first) : -1; } /** @@ -2106,6 +2120,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename, typename, typename, typename> friend class __detail::_BFSExecutor; + template<typename, typename, typename> + friend class regex_iterator; + template<typename _Bp, typename _Ap, typename _Ch_type, typename _Rx_traits> friend bool @@ -2121,6 +2138,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const basic_regex<_Ch_type, _Rx_traits>&, regex_constants::match_flag_type); + + _Bi_iter _M_begin; + bool _M_in_iterator; }; typedef match_results<const char*> cmatch; @@ -2200,8 +2220,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * @retval false Otherwise. * * @throws an exception of type regex_error. - * - * @todo Implement this function. */ template<typename _Bi_iter, typename _Alloc, typename _Ch_type, typename _Rx_traits> @@ -2215,6 +2233,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; + + auto __size = __re._M_automaton->_M_sub_count(); + __size += 2; + __m.resize(__size); + for (decltype(__size) __i = 0; __i < __size; ++__i) + __m.at(__i).matched = false; + if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match()) { for (auto __it : __m) @@ -2360,8 +2385,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * undefined. * * @throws an exception of type regex_error. - * - * @todo Implement this function. */ template<typename _Bi_iter, typename _Alloc, typename _Ch_type, typename _Rx_traits> @@ -2374,6 +2397,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (__re._M_automaton == nullptr) return false; + + auto __size = __re._M_automaton->_M_sub_count(); + __size += 2; + __m.resize(__size); + for (decltype(__size) __i = 0; __i < __size; ++__i) + __m.at(__i).matched = false; + if (__detail::__get_executor(__first, __last, __m, __re, __flags) ->_M_search()) { @@ -2677,7 +2707,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: operator++() { - // FIXME: In all cases in which the call to regex_search returns true, + // In all cases in which the call to regex_search returns true, // match.prefix().first shall be equal to the previous value of // match[0].second, and for each index i in the half-open range // [0, match.size()) for which match[i].matched is true, @@ -2697,12 +2727,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags | regex_constants::match_not_null | regex_constants::match_continuous)) - return *this; + { + _M_match._M_in_iterator = true; + _M_match._M_begin = _M_begin; + return *this; + } else ++__start; } _M_flags |= regex_constants::match_prev_avail; - if (!regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags)) + if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags)) + { + _M_match._M_in_iterator = true; + _M_match._M_begin = _M_begin; + } + else _M_match = value_type(); } return *this; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 8dc779b68e1..7f9a19af2d9 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -28,7 +28,7 @@ * Do not attempt to use it directly. @headername{regex} */ -// TODO make comments doxygen format. +// FIXME make comments doxygen format. // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" // (http://swtch.com/~rsc/regexp/regexp1.html"), @@ -223,16 +223,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__n < 0) __throw_regex_error(regex_constants::error_badbrace); auto __end = _M_nfa._M_insert_dummy(); + // _M_alt is the "match more" branch, and _M_next is the + // "match less" one. Switch _M_alt and _M_next of all created + // nodes. This is a hacking but IMO works well. + std::stack<_StateIdT> __stack; for (int __i = 0; __i < __n; ++__i) { auto __tmp = __r._M_clone(); - __e._M_append - (_StateSeqT(_M_nfa, - _M_nfa._M_insert_alt(__tmp._M_start, - __end, __neg), - __tmp._M_end)); + auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, + __end, __neg); + __stack.push(__alt); + __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); } __e._M_append(__end); + while (!__stack.empty()) + { + auto& __tmp = _M_nfa[__stack.top()]; + __stack.pop(); + swap(__tmp._M_next, __tmp._M_alt); + } } else // {3,} { diff --git a/libstdc++-v3/include/bits/regex_constants.h b/libstdc++-v3/include/bits/regex_constants.h index 10b962ad21a..94c25e531b3 100644 --- a/libstdc++-v3/include/bits/regex_constants.h +++ b/libstdc++-v3/include/bits/regex_constants.h @@ -52,19 +52,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION */ //@{ enum __syntax_option - { - _S_icase, - _S_nosubs, - _S_optimize, - _S_collate, - _S_ECMAScript, - _S_basic, - _S_extended, - _S_awk, - _S_grep, - _S_egrep, - _S_syntax_last - }; + { + _S_icase, + _S_nosubs, + _S_optimize, + _S_collate, + _S_ECMAScript, + _S_basic, + _S_extended, + _S_awk, + _S_grep, + _S_egrep, + _S_syntax_last + }; /** * @brief This is a bitmask type indicating how to interpret the regex. @@ -211,20 +211,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION //@{ enum __match_flag - { - _S_not_bol, - _S_not_eol, - _S_not_bow, - _S_not_eow, - _S_any, - _S_not_null, - _S_continuous, - _S_prev_avail, - _S_sed, - _S_no_copy, - _S_first_only, - _S_match_flag_last - }; + { + _S_not_bol, + _S_not_eol, + _S_not_bow, + _S_not_eow, + _S_any, + _S_not_null, + _S_continuous, + _S_prev_avail, + _S_sed, + _S_no_copy, + _S_first_only, + _S_match_flag_last + }; /** * @brief This is a bitmask type indicating regex matching rules. @@ -233,110 +233,148 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION * perform bitwise operations on these values and expect the right thing to * happen. */ - typedef std::bitset<_S_match_flag_last> match_flag_type; + enum match_flag_type : unsigned int + { + /** + * The default matching rules. + */ + match_default = 0, - /** - * The default matching rules. - */ - constexpr match_flag_type match_default = 0; + /** + * The first character in the sequence [first, last) is treated as though it + * is not at the beginning of a line, so the character (^) in the regular + * expression shall not match [first, first). + */ + match_not_bol = 1 << _S_not_bol, - /** - * The first character in the sequence [first, last) is treated as though it - * is not at the beginning of a line, so the character (^) in the regular - * expression shall not match [first, first). - */ - constexpr match_flag_type match_not_bol = 1 << _S_not_bol; + /** + * The last character in the sequence [first, last) is treated as though it + * is not at the end of a line, so the character ($) in the regular + * expression shall not match [last, last). + */ + match_not_eol = 1 << _S_not_eol, - /** - * The last character in the sequence [first, last) is treated as though it - * is not at the end of a line, so the character ($) in the regular - * expression shall not match [last, last). - */ - constexpr match_flag_type match_not_eol = 1 << _S_not_eol; + /** + * The expression \\b is not matched against the sub-sequence + * [first,first). + */ + match_not_bow = 1 << _S_not_bow, - /** - * The expression \\b is not matched against the sub-sequence - * [first,first). - */ - constexpr match_flag_type match_not_bow = 1 << _S_not_bow; + /** + * The expression \\b should not be matched against the sub-sequence + * [last,last). + */ + match_not_eow = 1 << _S_not_eow, - /** - * The expression \\b should not be matched against the sub-sequence - * [last,last). - */ - constexpr match_flag_type match_not_eow = 1 << _S_not_eow; + /** + * If more than one match is possible then any match is an acceptable + * result. + */ + match_any = 1 << _S_any, - /** - * If more than one match is possible then any match is an acceptable - * result. - */ - constexpr match_flag_type match_any = 1 << _S_any; + /** + * The expression does not match an empty sequence. + */ + match_not_null = 1 << _S_not_null, - /** - * The expression does not match an empty sequence. - */ - constexpr match_flag_type match_not_null = 1 << _S_not_null; + /** + * The expression only matches a sub-sequence that begins at first . + */ + match_continuous = 1 << _S_continuous, - /** - * The expression only matches a sub-sequence that begins at first . - */ - constexpr match_flag_type match_continuous = 1 << _S_continuous; + /** + * --first is a valid iterator position. When this flag is set then the + * flags match_not_bol and match_not_bow are ignored by the regular + * expression algorithms 28.11 and iterators 28.12. + */ + match_prev_avail = 1 << _S_prev_avail, - /** - * --first is a valid iterator position. When this flag is set then the - * flags match_not_bol and match_not_bow are ignored by the regular - * expression algorithms 28.11 and iterators 28.12. - */ - constexpr match_flag_type match_prev_avail = 1 << _S_prev_avail; + /** + * When a regular expression match is to be replaced by a new string, the + * new string is constructed using the rules used by the ECMAScript replace + * function in ECMA- 262 [Ecma International, ECMAScript Language + * Specification, Standard Ecma-262, third edition, 1999], part 15.5.4.11 + * String.prototype.replace. In addition, during search and replace + * operations all non-overlapping occurrences of the regular expression + * are located and replaced, and sections of the input that did not match + * the expression are copied unchanged to the output string. + * + * Format strings (from ECMA-262 [15.5.4.11]): + * @li $$ The dollar-sign itself ($) + * @li $& The matched substring. + * @li $` The portion of @a string that precedes the matched substring. + * This would be match_results::prefix(). + * @li $' The portion of @a string that follows the matched substring. + * This would be match_results::suffix(). + * @li $n The nth capture, where n is in [1,9] and $n is not followed by a + * decimal digit. If n <= match_results::size() and the nth capture + * is undefined, use the empty string instead. If n > + * match_results::size(), the result is implementation-defined. + * @li $nn The nnth capture, where nn is a two-digit decimal number on + * [01, 99]. If nn <= match_results::size() and the nth capture is + * undefined, use the empty string instead. If + * nn > match_results::size(), the result is implementation-defined. + */ + format_default = 0, - /** - * When a regular expression match is to be replaced by a new string, the - * new string is constructed using the rules used by the ECMAScript replace - * function in ECMA- 262 [Ecma International, ECMAScript Language - * Specification, Standard Ecma-262, third edition, 1999], part 15.5.4.11 - * String.prototype.replace. In addition, during search and replace - * operations all non-overlapping occurrences of the regular expression - * are located and replaced, and sections of the input that did not match - * the expression are copied unchanged to the output string. - * - * Format strings (from ECMA-262 [15.5.4.11]): - * @li $$ The dollar-sign itself ($) - * @li $& The matched substring. - * @li $` The portion of @a string that precedes the matched substring. - * This would be match_results::prefix(). - * @li $' The portion of @a string that follows the matched substring. - * This would be match_results::suffix(). - * @li $n The nth capture, where n is in [1,9] and $n is not followed by a - * decimal digit. If n <= match_results::size() and the nth capture - * is undefined, use the empty string instead. If n > - * match_results::size(), the result is implementation-defined. - * @li $nn The nnth capture, where nn is a two-digit decimal number on - * [01, 99]. If nn <= match_results::size() and the nth capture is - * undefined, use the empty string instead. If - * nn > match_results::size(), the result is implementation-defined. - */ - constexpr match_flag_type format_default = 0; + /** + * When a regular expression match is to be replaced by a new string, the + * new string is constructed using the rules used by the POSIX sed utility + * in IEEE Std 1003.1- 2001 [IEEE, Information Technology -- Portable + * Operating System Interface (POSIX), IEEE Standard 1003.1-2001]. + */ + format_sed = 1 << _S_sed, - /** - * When a regular expression match is to be replaced by a new string, the - * new string is constructed using the rules used by the POSIX sed utility - * in IEEE Std 1003.1- 2001 [IEEE, Information Technology -- Portable - * Operating System Interface (POSIX), IEEE Standard 1003.1-2001]. - */ - constexpr match_flag_type format_sed = 1 << _S_sed; + /** + * During a search and replace operation, sections of the character + * container sequence being searched that do not match the regular + * expression shall not be copied to the output string. + */ + format_no_copy = 1 << _S_no_copy, - /** - * During a search and replace operation, sections of the character - * container sequence being searched that do not match the regular - * expression shall not be copied to the output string. - */ - constexpr match_flag_type format_no_copy = 1 << _S_no_copy; + /** + * When specified during a search and replace operation, only the first + * occurrence of the regular expression shall be replaced. + */ + format_first_only = 1 << _S_first_only, + }; - /** - * When specified during a search and replace operation, only the first - * occurrence of the regular expression shall be replaced. - */ - constexpr match_flag_type format_first_only = 1 << _S_first_only; + constexpr inline match_flag_type + operator&(match_flag_type __a, match_flag_type __b) + { + return (match_flag_type)(static_cast<unsigned int>(__a) + & static_cast<unsigned int>(__b)); + } + + constexpr inline match_flag_type + operator|(match_flag_type __a, match_flag_type __b) + { + return (match_flag_type)(static_cast<unsigned int>(__a) + | static_cast<unsigned int>(__b)); + } + + constexpr inline match_flag_type + operator^(match_flag_type __a, match_flag_type __b) + { + return (match_flag_type)(static_cast<unsigned int>(__a) + ^ static_cast<unsigned int>(__b)); + } + + constexpr inline match_flag_type + operator~(match_flag_type __a) + { return (match_flag_type)(~static_cast<unsigned int>(__a)); } + + inline match_flag_type& + operator&=(match_flag_type& __a, match_flag_type __b) + { return __a = __a & __b; } + + inline match_flag_type& + operator|=(match_flag_type& __a, match_flag_type __b) + { return __a = __a | __b; } + + inline match_flag_type& + operator^=(match_flag_type& __a, match_flag_type __b) + { return __a = __a ^ __b; } //@} diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 3df33e03024..b8e9266f910 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -28,7 +28,11 @@ * Do not attempt to use it directly. @headername{regex} */ -// TODO: convert comments to doxygen format. +// FIXME convert comments to doxygen format. + +// TODO Put _DFSExecutor and _BFSExecutor into one class. They are becoming +// much more similar. Also, make grouping seperated. The +// regex_constants::nosubs enables much more simpler execution. namespace std _GLIBCXX_VISIBILITY(default) { @@ -57,55 +61,107 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION class _Executor { public: + typedef basic_regex<_CharT, _TraitsT> _RegexT; typedef match_results<_BiIter, _Alloc> _ResultsT; typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec; typedef regex_constants::match_flag_type _FlagT; + typedef typename _TraitsT::char_class_type _ClassT; - virtual - ~_Executor() + public: + _Executor(_BiIter __begin, + _BiIter __end, + _ResultsT& __results, + const _RegexT& __re, + _FlagT __flags) + : _M_begin(__begin), + _M_end(__end), + _M_results(__results), + _M_re(__re), + _M_flags(__flags) { } // Set matched when string exactly match the pattern. - virtual bool - _M_match() = 0; + bool + _M_match() + { + _M_match_mode = true; + _M_init(_M_begin); + return _M_main(); + } // Set matched when some prefix of the string matches the pattern. - virtual bool - _M_search() = 0; - - protected: - typedef typename _NFA<_CharT, _TraitsT>::_SizeT _SizeT; - typedef typename _TraitsT::char_class_type _ClassT; + bool + _M_search_from_first() + { + _M_match_mode = false; + _M_init(_M_begin); + return _M_main(); + } - _Executor(_BiIter __begin, - _BiIter __end, - _ResultsT& __results, - _FlagT __flags, - _SizeT __size, - const _TraitsT& __traits) - : _M_current(__begin), _M_begin(__begin), _M_end(__end), - _M_results(__results), _M_flags(__flags), _M_traits(__traits) + bool + _M_search() { - __size += 2; - _M_results.resize(__size); - for (_SizeT __i = 0; __i < __size; ++__i) - _M_results[__i].matched = false; + if (_M_flags & regex_constants::match_continuous) + return _M_search_from_first(); + auto __cur = _M_begin; + do + { + _M_match_mode = false; + _M_init(__cur); + if (_M_main()) + return true; + } + // Continue when __cur == _M_end + while (__cur++ != _M_end); + return false; } bool - _M_is_word(_CharT __ch) + _M_is_word(_CharT __ch) const { static const _CharT __s = 'w'; - return _M_traits.isctype(__ch, - _M_traits.lookup_classname(&__s, &__s+1)); + return _M_re._M_traits.isctype + (__ch, _M_re._M_traits.lookup_classname(&__s, &__s+1)); + } + + bool + _M_at_begin() const + { + return _M_current == _M_begin + && !(_M_flags & (regex_constants::match_not_bol + | regex_constants::match_prev_avail)); } + bool + _M_at_end() const + { + return _M_current == _M_end + && !(_M_flags & regex_constants::match_not_eol); + } + + bool + _M_word_boundry(_State<_CharT, _TraitsT> __state) const; + + bool + _M_lookahead(_State<_CharT, _TraitsT> __state) const; + + public: + virtual void + _M_init(_BiIter __cur) = 0; + + virtual void + _M_set_start(_StateIdT __start) = 0; + + virtual bool + _M_main() = 0; + _BiIter _M_current; const _BiIter _M_begin; const _BiIter _M_end; - _ResultsVec& _M_results; - const _TraitsT& _M_traits; - _FlagT _M_flags; + const _RegexT& _M_re; + _ResultsT& _M_results; + const _FlagT _M_flags; + bool _M_match_mode; }; // A _DFSExecutor perform a DFS on given NFA and input string. At the very @@ -128,61 +184,46 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; - typedef _NFA<_CharT, _TraitsT> _RegexT; + typedef _NFA<_CharT, _TraitsT> _NFAT; + typedef typename _BaseT::_RegexT _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; typedef typename _BaseT::_ResultsVec _ResultsVec; - typedef regex_constants::match_flag_type _FlagT; + typedef typename _BaseT::_FlagT _FlagT; + public: _DFSExecutor(_BiIter __begin, _BiIter __end, _ResultsT& __results, - const _RegexT& __nfa, - const _TraitsT& __traits, + const _RegexT& __re, _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(), - __traits), - _M_traits(__traits), _M_nfa(__nfa), _M_cur_results(this->_M_results), - _M_start_state(__nfa._M_start()) + : _BaseT(__begin, __end, __results, __re, __flags), + _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> + (__re._M_automaton)), + _M_start_state(_M_nfa._M_start()) { } - bool - _M_match() + private: + void + _M_init(_BiIter __cur) { - this->_M_current = this->_M_begin; - return _M_dfs<true>(_M_start_state); + _M_cur_results.resize(_M_nfa._M_sub_count() + 2); + this->_M_current = __cur; } - bool - _M_search_from_first() - { - this->_M_current = this->_M_begin; - return _M_dfs<false>(_M_start_state); - } + void + _M_set_start(_StateIdT __start) + { _M_start_state = __start; } bool - _M_search() - { - auto __cur = this->_M_begin; - do - { - this->_M_current = __cur; - if (_M_dfs<false>(_M_start_state)) - return true; - } - // Continue when __cur == _M_end - while (__cur++ != this->_M_end); - return false; - } + _M_main() + { return _M_dfs(this->_M_start_state); } - private: - template<bool __match_mode> - bool - _M_dfs(_StateIdT __i); + bool + _M_dfs(_StateIdT __start); // To record current solution. _ResultsVec _M_cur_results; - const _TraitsT& _M_traits; - const _RegexT& _M_nfa; + const _NFAT& _M_nfa; _StateIdT _M_start_state; }; @@ -206,47 +247,57 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT; - typedef _NFA<_CharT, _TraitsT> _RegexT; + typedef _NFA<_CharT, _TraitsT> _NFAT; + typedef typename _BaseT::_RegexT _RegexT; typedef typename _BaseT::_ResultsT _ResultsT; + typedef typename _BaseT::_ResultsVec _ResultsVec; + typedef typename _BaseT::_FlagT _FlagT; // Here's a solution for greedy/ungreedy mode in BFS approach. We need to // carefully work out how to compare to conflict matching states. // // A matching state is a pair(where, when); `where` is a NFA node; `when` - // is a _BiIter, indicating which char is the next to be mathed one. Two - // matching states conflict means that they have equivalent `where` and - // `when`. + // is a _BiIter, indicating which char is the next to be matched. Two + // matching states conflict if they have equivalent `where` and `when`. // - // Now since we need to drop one and keep another, because at most one of - // them could be the final optimal solution. This behavior is affected by + // Now we need to drop one and keep another, because at most one of them + // could be the final optimal solution. This behavior is affected by // greedy policy. // // The definition of `greedy`: // For the sequence of quantifiers in NFA sorted by there start position, - // now maintain a vector in a matching state, with equal length to + // now maintain a vector in every matching state, with equal length to // quantifier seq, recording repeating times of every quantifier. Now to // compare two matching states, we just lexically compare these two // vectors. To win the compare(to survive), one matching state needs to // make its greedy quantifier count larger, and ungreedy quantifiers // count smaller. // - // In the implementation, we recorded negtive numbers for greedy - // quantifiers and positive numbers of ungreedy ones. Now a simple + // In the implementation, we recorded negtive counts for greedy + // quantifiers and positive counts of ungreedy ones. Now the implicit // operator<() for lexicographical_compare will emit the answer. // // When two vectors equal, it means the `where`, `when` and quantifier - // counts are identical, it indicates the same answer, so just return + // counts are identical, and indicates the same solution; so just return // false. struct _ResultsEntry - : private _BaseT::_ResultsVec + : private _ResultsVec { public: _ResultsEntry(unsigned int __res_sz, unsigned int __sz) - : _BaseT::_ResultsVec(__res_sz), _M_quant_keys(__sz) + : _ResultsVec(__res_sz), _M_quant_keys(__sz) { } + void + resize(unsigned int __n) + { _ResultsVec::resize(__n); } + + unsigned int + size() + { return _ResultsVec::size(); } + sub_match<_BiIter>& operator[](unsigned int __idx) - { return this->_BaseT::_ResultsVec::operator[](__idx); } + { return _ResultsVec::operator[](__idx); } bool operator<(const _ResultsEntry& __rhs) const @@ -263,75 +314,47 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_inc(unsigned int __idx, bool __neg) { _M_quant_keys[__idx] += __neg ? 1 : -1; } - typename _BaseT::_ResultsVec + _ResultsVec _M_get() { return *this; } public: std::vector<int> _M_quant_keys; }; - typedef std::unique_ptr<_ResultsEntry> _ResultsPtr; - typedef regex_constants::match_flag_type _FlagT; + public: _BFSExecutor(_BiIter __begin, _BiIter __end, _ResultsT& __results, - const _RegexT& __nfa, - const _TraitsT& __traits, + const _RegexT& __re, _FlagT __flags) - : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count(), - __traits), - _M_nfa(__nfa), - _M_cur_results(nullptr), - _M_start_state(__nfa._M_start()) + : _BaseT(__begin, __end, __results, __re, __flags), + _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> + (__re._M_automaton)), + _M_start_state(_M_nfa._M_start()) { } - bool - _M_match() - { - _M_init(this->_M_begin); - return _M_main_loop<true>(); - } - - bool - _M_search_from_first() - { - _M_init(this->_M_begin); - return _M_main_loop<false>(); - } - - bool - _M_search() - { - auto __cur = this->_M_begin; - do - { - _M_init(__cur); - if (_M_main_loop<false>()) - return true; - } - // Continue when __cur == _M_end - while (__cur++ != this->_M_end); - return false; - } - private: void _M_init(_BiIter __cur) { - _GLIBCXX_DEBUG_ASSERT(_M_start_state != _S_invalid_state_id); + _GLIBCXX_DEBUG_ASSERT(this->_M_start_state != _S_invalid_state_id); this->_M_current = __cur; _M_covered.clear(); - _M_covered[_M_start_state] = - _ResultsPtr(new _ResultsEntry(this->_M_results.size(), + _ResultsVec& __res(this->_M_results); + _M_covered[this->_M_start_state] = + _ResultsPtr(new _ResultsEntry(__res.size(), _M_nfa._M_quant_count)); _M_e_closure(); } - template<bool __match_mode> - bool - _M_main_loop(); + void + _M_set_start(_StateIdT __start) + { _M_start_state = __start; } + + bool + _M_main(); void _M_e_closure(); @@ -345,10 +368,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION std::map<_StateIdT, _ResultsPtr> _M_covered; // To record global optimal solution. _ResultsPtr _M_cur_results; - const _RegexT& _M_nfa; + const _NFAT& _M_nfa; _StateIdT _M_start_state; }; + template<typename _BiIter, typename _Alloc, + typename _CharT, typename _TraitsT> + std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>> + __get_executor(_BiIter __b, + _BiIter __e, + match_results<_BiIter, _Alloc>& __m, + const basic_regex<_CharT, _TraitsT>& __re, + regex_constants::match_flag_type __flags); + //@} regex-detail _GLIBCXX_END_NAMESPACE_VERSION } // namespace __detail diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index b110c5dc2f0..af2455b8a4e 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -36,7 +36,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> - template<bool __match_mode> bool _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: _M_dfs(_StateIdT __i) { @@ -44,9 +43,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // This is not that certain. Need deeper investigate. return false; auto& __current = this->_M_current; - auto& __begin = this->_M_begin; - auto& __end = this->_M_end; - auto& __results = _M_cur_results; const auto& __state = _M_nfa[__i]; bool __ret = false; switch (__state._M_opcode) @@ -54,129 +50,115 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION case _S_opcode_alternative: // Greedy or not, this is a question ;) if (!__state._M_neg) - __ret = _M_dfs<__match_mode>(__state._M_alt) - || _M_dfs<__match_mode>(__state._M_next); + __ret = _M_dfs(__state._M_alt) + || _M_dfs(__state._M_next); else - __ret = _M_dfs<__match_mode>(__state._M_next) - || _M_dfs<__match_mode>(__state._M_alt); + __ret = _M_dfs(__state._M_next) + || _M_dfs(__state._M_alt); break; case _S_opcode_subexpr_begin: // Here's the critical part: if there's nothing changed since last // visit, do NOT continue. This prevents the executor from get into // infinite loop when use "()*" to match "". // - // Every change on __results will be roll back after the recursion - // step finished. - if (!__results[__state._M_subexpr].matched - || __results[__state._M_subexpr].first != __current) + // Every change on _M_cur_results will be roll back after the + // recursion step finished. + if (!_M_cur_results[__state._M_subexpr].matched + || _M_cur_results[__state._M_subexpr].first != __current) { auto __back = __current; - __results[__state._M_subexpr].first = __current; - __ret = _M_dfs<__match_mode>(__state._M_next); - __results[__state._M_subexpr].first = __back; + _M_cur_results[__state._M_subexpr].first = __current; + __ret = _M_dfs(__state._M_next); + _M_cur_results[__state._M_subexpr].first = __back; } break; case _S_opcode_subexpr_end: - if (__results[__state._M_subexpr].second != __current - || __results[__state._M_subexpr].matched != true) + if (_M_cur_results[__state._M_subexpr].second != __current + || _M_cur_results[__state._M_subexpr].matched != true) { - auto __back = __results[__state._M_subexpr]; - __results[__state._M_subexpr].second = __current; - __results[__state._M_subexpr].matched = true; - __ret = _M_dfs<__match_mode>(__state._M_next); - __results[__state._M_subexpr] = __back; + auto __back = _M_cur_results[__state._M_subexpr]; + _M_cur_results[__state._M_subexpr].second = __current; + _M_cur_results[__state._M_subexpr].matched = true; + __ret = _M_dfs(__state._M_next); + _M_cur_results[__state._M_subexpr] = __back; } else - __ret = _M_dfs<__match_mode>(__state._M_next); + __ret = _M_dfs(__state._M_next); break; case _S_opcode_line_begin_assertion: - if (__current == __begin) - __ret = _M_dfs<__match_mode>(__state._M_next); + if (this->_M_at_begin()) + __ret = _M_dfs(__state._M_next); break; case _S_opcode_line_end_assertion: - if (__current == __end) - __ret = _M_dfs<__match_mode>(__state._M_next); + if (this->_M_at_end()) + __ret = _M_dfs(__state._M_next); break; - // By definition. case _S_opcode_word_boundry: - { - bool __ans = false; - if (__current == __begin && this->_M_is_word(*__current)) - __ans = true; - else if (__current == __end && this->_M_is_word(*__current)) - __ans = true; - else - { - auto __pre = __current; - --__pre; - if (this->_M_is_word(*__current) - != this->_M_is_word(*__pre)) - __ans = true; - } - if (__ans == !__state._M_neg) - __ret = _M_dfs<__match_mode>(__state._M_next); - } + if (this->_M_word_boundry(__state) == !__state._M_neg) + __ret = _M_dfs(__state._M_next); break; // Here __state._M_alt offers a single start node for a sub-NFA. // We recursivly invoke our algorithm to match the sub-NFA. case _S_opcode_subexpr_lookahead: - { - _ResultsT __m; - // FIXME Here's not necessarily a DFSExecutor. But we need to - // refactor the whole NFA to a recursive tree structure first. - _DFSExecutor __sub(this->_M_current, - this->_M_end, - __m, - this->_M_nfa, - this->_M_traits, - this->_M_flags); - __sub._M_start_state = __state._M_alt; - if (__sub._M_search_from_first() == !__state._M_neg) - __ret = _M_dfs<__match_mode>(__state._M_next); - } + if (this->_M_lookahead(__state) == !__state._M_neg) + __ret = _M_dfs(__state._M_next); break; case _S_opcode_match: - if (__current != __end && __state._M_matches(*__current)) + if (__current != this->_M_end && __state._M_matches(*__current)) { ++__current; - __ret = _M_dfs<__match_mode>(__state._M_next); + __ret = _M_dfs(__state._M_next); --__current; } break; - // First fetch the matched result from __results as __submatch; + // First fetch the matched result from _M_cur_results as __submatch; // then compare it with // (__current, __current + (__submatch.second - __submatch.first)) // If matched, keep going; else just return to try another state. case _S_opcode_backref: { - auto& __submatch = __results[__state._M_backref_index]; + auto& __submatch = _M_cur_results[__state._M_backref_index]; if (!__submatch.matched) break; auto __last = __current; for (auto __tmp = __submatch.first; - __last != __end && __tmp != __submatch.second; + __last != this->_M_end && __tmp != __submatch.second; ++__tmp) ++__last; - if (_M_traits.transform(__submatch.first, __submatch.second) - == _M_traits.transform(__current, __last)) + if (this->_M_re._M_traits.transform(__submatch.first, + __submatch.second) + == this->_M_re._M_traits.transform(__current, __last)) if (__last != __current) { auto __backup = __current; __current = __last; - __ret = _M_dfs<__match_mode>(__state._M_next); + __ret = _M_dfs(__state._M_next); __current = __backup; } else - __ret = _M_dfs<__match_mode>(__state._M_next); + __ret = _M_dfs(__state._M_next); } break; case _S_opcode_accept: - if (__match_mode) - __ret = __current == __end; + if (this->_M_match_mode) + __ret = __current == this->_M_end; else __ret = true; + if (__current == this->_M_begin + && (this->_M_flags & regex_constants::match_not_null)) + __ret = false; if (__ret) - this->_M_results = __results; + { + _ResultsVec& __res(this->_M_results); + if (this->_M_re.flags() & regex_constants::nosubs) + { + _M_cur_results.resize(3); // truncate + __res.resize(3); + } + for (unsigned int __i = 0; __i < _M_cur_results.size(); ++__i) + if (_M_cur_results[__i].matched) + __res[__i] = _M_cur_results[__i]; + } break; default: _GLIBCXX_DEBUG_ASSERT(false); @@ -186,23 +168,37 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> - template<bool __match_mode> bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: - _M_main_loop() + _M_main() { bool __ret = false; + if (!this->_M_match_mode + && !(this->_M_flags & regex_constants::match_not_null)) + __ret = _M_includes_some() || __ret; while (this->_M_current != this->_M_end) { - if (!__match_mode) - // To keep regex_search greedy, no "return true" here. - __ret = _M_includes_some() || __ret; _M_move(); ++this->_M_current; _M_e_closure(); + if (!this->_M_match_mode) + // To keep regex_search greedy, no "return true" here. + __ret = _M_includes_some() || __ret; } - __ret = _M_includes_some() || __ret; + if (this->_M_match_mode) + __ret = _M_includes_some(); if (__ret) - this->_M_results = _M_cur_results->_M_get(); + { + _ResultsVec& __res(this->_M_results); + if (this->_M_re.flags() & regex_constants::nosubs) + { + // truncate + _M_cur_results->resize(3); + __res.resize(3); + } + for (unsigned int __i = 0; __i < _M_cur_results->size(); ++__i) + if ((*_M_cur_results)[__i].matched) + __res[__i] = (*_M_cur_results)[__i]; + } return __ret; } @@ -211,11 +207,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>:: _M_e_closure() { - auto& __current = this->_M_current; std::queue<_StateIdT> __q; std::vector<bool> __in_q(_M_nfa.size(), false); - auto& __begin = this->_M_begin; - auto& __end = this->_M_end; + auto& __current = this->_M_current; for (auto& __it : _M_covered) { @@ -292,46 +286,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_line_begin_assertion: - if (__current == __begin) + if (this->_M_at_begin()) __add_visited_state(__state._M_next); break; case _S_opcode_line_end_assertion: - if (__current == __end) + if (this->_M_at_end()) __add_visited_state(__state._M_next); break; case _S_opcode_word_boundry: - { - bool __ans = false; - if (__current == __begin && this->_M_is_word(*__current)) - __ans = true; - else if (__current == __end && this->_M_is_word(*__current)) - __ans = true; - else - { - auto __pre = __current; - --__pre; - if (this->_M_is_word(*__current) - != this->_M_is_word(*__pre)) - __ans = true; - } - if (__ans == !__state._M_neg) - __add_visited_state(__state._M_next); - } + if (this->_M_word_boundry(__state) == !__state._M_neg) + __add_visited_state(__state._M_next); break; case _S_opcode_subexpr_lookahead: - { - _ResultsT __m; - // Same comment as in DFS. - _BFSExecutor __sub(this->_M_current, - this->_M_end, - __m, - this->_M_nfa, - this->_M_traits, - this->_M_flags); - __sub._M_start_state = __state._M_alt; - if (__sub._M_search_from_first() == !__state._M_neg) - __add_visited_state(__state._M_next); - } + if (this->_M_lookahead(__state) == !__state._M_neg) + __add_visited_state(__state._M_next); break; case _S_opcode_match: break; @@ -395,6 +363,44 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __succ; } + // Return whether now is at some word boundry. + template<typename _BiIter, typename _Alloc, + typename _CharT, typename _TraitsT> + bool _Executor<_BiIter, _Alloc, _CharT, _TraitsT>:: + _M_word_boundry(_State<_CharT, _TraitsT> __state) const + { + // By definition. + bool __ans = false; + auto __pre = _M_current; + --__pre; + if (!(_M_at_begin() && _M_at_end())) + if (_M_at_begin()) + __ans = _M_is_word(*_M_current) + && !(_M_flags & regex_constants::match_not_bow); + else if (_M_at_end()) + __ans = _M_is_word(*__pre) + && !(_M_flags & regex_constants::match_not_eow); + else + __ans = _M_is_word(*_M_current) + != _M_is_word(*__pre); + return __ans; + } + + // Return whether now match the given sub-NFA. + template<typename _BiIter, typename _Alloc, + typename _CharT, typename _TraitsT> + bool _Executor<_BiIter, _Alloc, _CharT, _TraitsT>:: + _M_lookahead(_State<_CharT, _TraitsT> __state) const + { + auto __sub = __get_executor(this->_M_current, + this->_M_end, + this->_M_results, + this->_M_re, + this->_M_flags); + __sub->_M_set_start(__state._M_alt); + return __sub->_M_search_from_first(); + } + template<typename _BiIter, typename _Alloc, typename _CharT, typename _TraitsT> std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>> @@ -411,10 +417,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> (__re._M_automaton); if (__p->_M_has_backref) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, - __re._M_traits, __flags)); - return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, - __re._M_traits, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, __re, __flags)); + return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, __re, __flags)); } _GLIBCXX_END_NAMESPACE_VERSION diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 824d6ce1081..09a18f634a0 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -68,7 +68,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_backref, _S_token_subexpr_begin, _S_token_subexpr_no_group_begin, - _S_token_subexpr_lookahead_begin, + _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n' _S_token_subexpr_end, _S_token_bracket_begin, _S_token_bracket_neg_begin, @@ -86,7 +86,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_ungreedy, _S_token_line_begin, _S_token_line_end, - _S_token_word_bound, + _S_token_word_bound, // neg if _M_value[0] == 'n' _S_token_comma, _S_token_dup_count, _S_token_eof, @@ -174,7 +174,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StringT _M_value; bool _M_at_bracket_start; public: - // TODO: make them static when this file is stable. + // FIXME: make them static when this file is stable. const std::map<char, _TokenT> _M_token_map; const std::map<char, char> _M_ecma_escape_map; const std::map<char, char> _M_awk_escape_map; diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index 4b66157278b..abdbcc64f1f 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -28,7 +28,7 @@ * Do not attempt to use it directly. @headername{regex} */ -// TODO make comments doxygen format. +// FIXME make comments doxygen format. // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep // and awk |