From 3ab9a836c988c3ca480bee0f90597cdd102e153e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Fri, 28 Mar 2025 19:24:18 +0100 Subject: [PATCH 1/3] `<regex>`: Make back-references to unmatched capture groups not match anything in POSIX basic regexes --- stl/inc/regex | 4 ++++ tests/std/tests/VSO_0000000_regex_use/test.cpp | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/stl/inc/regex b/stl/inc/regex index 17fd2ee72c..43549469ff 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3654,6 +3654,10 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c } else { _Tgt_state._Cur = _Res0; } + } else if (_Sflags + & (regex_constants::basic | regex_constants::extended | regex_constants::grep + | regex_constants::egrep | regex_constants::awk)) { + _Failed = true; } break; } diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index ba9c575940..41b6e5890d 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -1171,6 +1171,17 @@ void test_gh_5253() { g_regexTester.should_not_match("a", "()*"); } +void test_gh_5374() { + // GH-5374: <regex>: Back-references to unmatched capture groups + // should not match in POSIX basic regular expressions + for (syntax_option_type option : {basic, grep}) { + g_regexTester.should_not_match("", R"(\(.\)*\1)", option); + g_regexTester.should_match("", R"(\(.*\)\1)", option); + g_regexTester.should_not_match("bc", R"(\(a\)*b\1c)", option); + g_regexTester.should_match("bc", R"(\(a*\)b\1c)", option); + } +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -1208,6 +1219,7 @@ int main() { test_gh_5192(); test_gh_5214(); test_gh_5253(); + test_gh_5374(); return g_regexTester.result(); } From 1425012622067b3feff9878d0b15550f9a2a1d62 Mon Sep 17 00:00:00 2001 From: "Stephan T. 
Lavavej" Date: Tue, 1 Apr 2025 15:44:39 -0700 Subject: [PATCH 2/3] Extract an internal check for the grammars that lack backreferences. --- stl/inc/regex | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 43549469ff..0d9cac2647 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3643,6 +3643,9 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c case _N_back: { // check back reference + _STL_INTERNAL_CHECK( + (_Sflags & (regex_constants::extended | regex_constants::egrep | regex_constants::awk)) + == 0); // these grammars don't have backreferences _Node_back* _Node = static_cast<_Node_back*>(_Nx); if (_Tgt_state._Grp_valid[_Node->_Idx]) { // check for match _It _Res0 = _Tgt_state._Cur; @@ -3654,9 +3657,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c } else { _Tgt_state._Cur = _Res0; } - } else if (_Sflags - & (regex_constants::basic | regex_constants::extended | regex_constants::grep - | regex_constants::egrep | regex_constants::awk)) { + } else if (_Sflags & (regex_constants::basic | regex_constants::grep)) { _Failed = true; } break; From bc1e3d192d29e6362cb0c0ac09ffca5ade5344e4 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 1 Apr 2025 16:03:41 -0700 Subject: [PATCH 3/3] Add ECMAScript coverage. 
--- tests/std/tests/VSO_0000000_regex_use/test.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 41b6e5890d..26647e9260 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -1180,6 +1180,12 @@ void test_gh_5374() { g_regexTester.should_not_match("bc", R"(\(a\)*b\1c)", option); g_regexTester.should_match("bc", R"(\(a*\)b\1c)", option); } + + // ECMAScript's behavior is different: + g_regexTester.should_match("", R"((.)*\1)", ECMAScript); + g_regexTester.should_match("", R"((.*)\1)", ECMAScript); + g_regexTester.should_match("bc", R"((a)*b\1c)", ECMAScript); + g_regexTester.should_match("bc", R"((a*)b\1c)", ECMAScript); } int main() {