Skip to content

Commit dbe76cc

Browse files
1 parent f9e8afa commit dbe76cc

File tree

4 files changed

+204
-10
lines changed

4 files changed

+204
-10
lines changed

stl/inc/regex

+31-8
Original file line numberDiff line numberDiff line change
@@ -1638,11 +1638,12 @@ public:
16381638

16391639
if (_Matches) { // copy results to _Matches
16401640
_Matches->_Resize(_Get_ncap());
1641+
const auto& _Result = _Longest ? _Res : _Tgt_state;
16411642
for (unsigned int _Idx = 0; _Idx < _Get_ncap(); ++_Idx) { // copy submatch _Idx
1642-
if (_Res._Grp_valid[_Idx]) { // copy successful match
1643+
if (_Result._Grp_valid[_Idx]) { // copy successful match
16431644
_Matches->_At(_Idx).matched = true;
1644-
_Matches->_At(_Idx).first = _Res._Grps[_Idx]._Begin;
1645-
_Matches->_At(_Idx).second = _Res._Grps[_Idx]._End;
1645+
_Matches->_At(_Idx).first = _Result._Grps[_Idx]._Begin;
1646+
_Matches->_At(_Idx).second = _Result._Grps[_Idx]._End;
16461647
} else { // copy failed match
16471648
_Matches->_At(_Idx).matched = false;
16481649
_Matches->_At(_Idx).first = _End;
@@ -3277,6 +3278,20 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep(_Node_rep* _Node, bool _Gr
32773278
_Psav->_Loop_iter = _STD addressof(_Cur_iter);
32783279
_Matched0 = _Match_pat(_Node->_Next);
32793280
}
3281+
} else if (_Longest) { // longest, try any number of repetitions
3282+
3283+
// match with no further repetition
3284+
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
3285+
// match with at least one more repetition if last repetition made progress
3286+
if (_Progress) {
3287+
_Tgt_state = _St;
3288+
_Psav->_Loop_idx = _Init_idx + 1;
3289+
_Psav->_Loop_iter = _STD addressof(_Cur_iter);
3290+
3291+
if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true
3292+
_Matched0 = true;
3293+
}
3294+
}
32803295
} else if (!_Greedy) { // not greedy, favor minimum number of reps
32813296
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
32823297
if (!_Matched0 && _Progress) { // tail failed, try another rep
@@ -3437,16 +3452,24 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34373452
}
34383453

34393454
template <class _BidIt, class _Elem, class _RxTraits, class _It>
3440-
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules
3455+
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rule
34413456
for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group
3442-
if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) {
3457+
// any match (even an empty one) is better than no match at all
3458+
if (_Res._Grp_valid[_Ix] != _Tgt_state._Grp_valid[_Ix]) {
3459+
return _Tgt_state._Grp_valid[_Ix];
3460+
}
3461+
3462+
if (_Res._Grp_valid[_Ix]) { // now known to be equal to _Tgt_state._Grp_valid[_Ix], no need to test both
3463+
// if both groups are matched, prefer the leftmost one
34433464
if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) {
34443465
return _STD distance(_Begin, _Res._Grps[_Ix]._Begin)
3445-
< _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
3466+
> _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
34463467
}
34473468

3469+
// if both groups start at the same position, prefer the longer one
34483470
if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) {
3449-
return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End);
3471+
return _STD distance(_Res._Grps[_Ix]._Begin, _Res._Grps[_Ix]._End)
3472+
< _STD distance(_Tgt_state._Grps[_Ix]._Begin, _Tgt_state._Grps[_Ix]._End);
34503473
}
34513474
}
34523475
}
@@ -3665,7 +3688,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c
36653688
&& _Begin == _Tgt_state._Cur)
36663689
|| (_Full && _Tgt_state._Cur != _End)) {
36673690
_Failed = true;
3668-
} else if (!_Matched || _Better_match()) { // record successful match
3691+
} else if (_Longest && (!_Matched || _Better_match())) { // record successful match
36693692
_Res = _Tgt_state;
36703693
_Matched = true;
36713694
}

tests/std/include/test_regex_support.hpp

+90
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
33

44
#pragma once
5+
#include <cstddef>
56
#include <cstdio>
7+
#include <initializer_list>
68
#include <regex>
79
#include <string>
10+
#include <utility>
811

912
class regex_fixture {
1013
int regex_test_result = 0;
@@ -241,6 +244,93 @@ class test_regex {
241244
fixture->fail_regex();
242245
}
243246
}
247+
248+
// Runs std::regex_search over `subject` with this object's regex and verifies both the overall match and
// every capture group. Any discrepancy is printed with printf and recorded through fixture->fail_regex().
//
// subject        - text to search.
// expected       - text the overall match (mr[0]) must compare equal to.
// match_flags    - flags forwarded to std::regex_search.
// capture_groups - one {begin, end} pair per capture group, in group order. The offsets are relative to the
//                  start of the overall match (mr[0].first), not to the start of `subject`. A `first` of -1
//                  means the group is expected to be unmatched.
//
// NOTE(review): `r`, `pattern`, `syntax` and `fixture` are members declared outside this view.
void should_search_match_capture_groups(const std::string& subject, const std::string& expected,
249+
const std::regex_constants::match_flag_type match_flags,
250+
std::initializer_list<std::pair<std::ptrdiff_t, std::ptrdiff_t>> capture_groups) const {
251+
std::smatch mr;
252+
try {
253+
const bool search_result = std::regex_search(subject, mr, r, match_flags);
254+
// first check: the search must succeed and the overall match must equal `expected`
if (!search_result || mr[0] != expected) {
255+
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find "%s", )", subject.c_str(),
256+
pattern.c_str(), static_cast<unsigned int>(syntax), static_cast<unsigned int>(match_flags),
257+
expected.c_str());
258+
if (search_result) {
259+
printf(R"(but it matched "%s")"
260+
"\n",
261+
mr.str().c_str());
262+
} else {
263+
puts("but it failed to match");
264+
}
265+
266+
fixture->fail_regex();
267+
// second check: mr also holds the overall match at index 0, hence the + 1
} else if (capture_groups.size() + 1 != mr.size()) {
268+
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to match %zu capture groups in "%s", )",
269+
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
270+
static_cast<unsigned int>(match_flags), capture_groups.size() + 1, expected.c_str());
271+
printf(R"(but it matched %zu groups)"
272+
"\n",
273+
mr.size());
274+
fixture->fail_regex();
275+
} else {
276+
bool submatches_success = true;
277+
// index 0 is the overall match, so capture groups start at 1; stop at the first mismatch
for (std::size_t i = 1U; i < mr.size(); ++i) {
278+
const auto& expected_capture = capture_groups.begin()[i - 1];
279+
const auto& actual_capture = mr[i];
280+
// -1 marks a group that is expected to be unmatched
if (expected_capture.first == -1) {
281+
if (actual_capture.matched) {
282+
submatches_success = false;
283+
break;
284+
}
285+
} else if (!actual_capture.matched || actual_capture.first != (mr[0].first + expected_capture.first)
286+
|| actual_capture.second != (mr[0].first + expected_capture.second)) {
287+
submatches_success = false;
288+
break;
289+
}
290+
}
291+
// on mismatch, print the full list of expected groups followed by the groups actually found
if (!submatches_success) {
292+
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find capture groups {)",
293+
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
294+
static_cast<unsigned int>(match_flags));
295+
296+
bool initial = true;
297+
for (const auto& expected_capture : capture_groups) {
298+
std::string capture = "(unmatched)";
299+
if (expected_capture.first != -1) {
300+
// show the text that the expected offsets select within the actual overall match
capture.assign(mr[0].first + expected_capture.first, mr[0].first + expected_capture.second);
301+
}
302+
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), expected_capture.first,
303+
expected_capture.second);
304+
initial = false;
305+
}
306+
printf(R"(} in "%s", but found {)", expected.c_str());
307+
308+
initial = true;
309+
for (std::size_t i = 1U; i < mr.size(); ++i) {
310+
const auto& actual_capture = mr[i];
311+
std::string capture = "(unmatched)";
312+
std::ptrdiff_t first = -1;
313+
std::ptrdiff_t last = -1;
314+
if (actual_capture.matched) {
315+
capture = actual_capture.str();
316+
// report offsets relative to the start of the overall match, like the expected pairs
first = actual_capture.first - mr[0].first;
317+
last = actual_capture.second - mr[0].first;
318+
}
319+
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), first, last);
320+
initial = false;
321+
}
322+
printf("}\n");
323+
fixture->fail_regex();
324+
}
325+
}
326+
// a regex_error thrown inside the try block is reported as a test failure
} catch (const std::regex_error& e) {
327+
printf(R"(Failed to regex_search("%s", regex("%s", 0x%X), 0x%X): regex_error: "%s")"
328+
"\n",
329+
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
330+
static_cast<unsigned int>(match_flags), e.what());
331+
fixture->fail_regex();
332+
}
333+
}
244334
};
245335

246336
class test_wregex {

tests/std/tests/VSO_0000000_regex_use/test.cpp

+79
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,84 @@ void test_construction_from_nullptr_and_zero() {
558558
}
559559
}
560560

561+
// Regression test for GH-731 and GH-996. Each pattern is searched with the ECMAScript grammar and, for most
// cases, with the POSIX-style grammars (extended, egrep, awk); the expected overall match and capture-group
// offsets are spelled out separately because the two families are expected to give different results.
// Capture-group pairs are offsets relative to the start of the overall match; {-1, -1} means "unmatched".
// NOTE(review): g_regexTester and the unqualified grammar/flag names are declared outside this view.
void test_gh_731() {
562+
// GH-731 <regex>: Incorrect behavior for capture groups
563+
// GH-996: regex_search behaves incorrectly when the regex contains R"(\[)"
564+
565+
// Several bugs were fixed in ECMAScript (depth-first) and POSIX (leftmost-longest) matching rules.
566+
// optional group followed by B*: both grammar families expect (B+)? to capture "BBB"
{
567+
const test_regex ecma_regex(&g_regexTester, R"((A+)\s*(B+)?\s*B*)", ECMAScript);
568+
ecma_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
569+
}
570+
for (syntax_option_type option : {extended, egrep, awk}) {
571+
const test_regex posix_regex(&g_regexTester, R"((A+)[[:space:]]*(B+)?[[:space:]]*B*)", option);
572+
posix_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
573+
}
574+
575+
// alternation after .*: ECMAScript expects the match to end at "cat", POSIX expects the longer "concatenate"
{
576+
const test_regex ecma_regex(&g_regexTester, ".*(cat|concatenate)", ECMAScript);
577+
ecma_regex.should_search_match_capture_groups("WXconcatenateYZ", "WXconcat", match_default, {{5, 8}});
578+
}
579+
for (syntax_option_type option : {extended, egrep, awk}) {
580+
const test_regex posix_regex(&g_regexTester, ".*(cat|concatenate)", option);
581+
posix_regex.should_search_match_capture_groups("WXconcatenateYZ", "WXconcatenate", match_default, {{2, 13}});
582+
}
583+
584+
// repeated alternation: ECMAScript expects "aaba" (last repetition "ba"), POSIX expects the whole "aabaac"
{
585+
const test_regex ecma_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", ECMAScript);
586+
ecma_regex.should_search_match_capture_groups("aabaac", "aaba", match_default, {{2, 4}});
587+
}
588+
for (syntax_option_type option : {extended, egrep, awk}) {
589+
const test_regex posix_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", option);
590+
posix_regex.should_search_match_capture_groups("aabaac", "aabaac", match_default, {{0, 6}});
591+
}
592+
593+
// greedy vs. non-greedy prefix in ECMAScript, compared with the POSIX result for the greedy pattern
{
594+
const test_regex ecma_regex(&g_regexTester, ".*(a|bacc|baccc)", ECMAScript);
595+
ecma_regex.should_search_match_capture_groups("ddbacccd", "ddba", match_default, {{3, 4}});
596+
}
597+
{
598+
const test_regex ecma_regex(&g_regexTester, ".*?(a|bacc|baccc)", ECMAScript);
599+
ecma_regex.should_search_match_capture_groups("ddbacccd", "ddbacc", match_default, {{2, 6}});
600+
}
601+
for (syntax_option_type option : {extended, egrep, awk}) {
602+
const test_regex posix_regex(&g_regexTester, ".*(a|bacc|baccc)", option);
603+
posix_regex.should_search_match_capture_groups("ddbacccd", "ddbaccc", match_default, {{2, 7}});
604+
}
605+
606+
// preprocessor-line pattern: same overall match expected everywhere, but the repeated group is expected at
// {30, 42} for ECMAScript and {28, 42} for awk and extended
{
607+
const test_regex ecma_regex(&g_regexTester, R"(^[[:blank:]]*#([^\n]*\\[[:space:]]+)*[^\n]*)", ECMAScript);
608+
ecma_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
609+
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{30, 42}});
610+
}
611+
{
612+
const test_regex awk_regex(&g_regexTester, R"(^[[:blank:]]*#([^\n]*\\[[:space:]]+)*[^\n]*)", awk);
613+
awk_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
614+
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
615+
}
616+
{
617+
// non-raw literal: the pattern carries an actual newline character rather than the \n escape sequence
// (presumably because the extended grammar does not define that escape -- confirm)
const test_regex extended_regex(&g_regexTester, "^[[:blank:]]*#([^\n]*\\\\[[:space:]]+)*[^\n]*", extended);
618+
extended_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
619+
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
620+
}
621+
622+
// repetition followed by alternation: ECMAScript expects to stop at "ce", POSIX expects the longer "bbceef"
{
623+
const test_regex ecma_regex(&g_regexTester, "(ab*)*(ce|bbceef)", ECMAScript);
624+
ecma_regex.should_search_match_capture_groups("aababbbceef", "aababbbce", match_default, {{3, 7}, {7, 9}});
625+
}
626+
for (syntax_option_type option : {extended, egrep, awk}) {
627+
const test_regex posix_regex(&g_regexTester, "(ab*)*(ce|bbceef)", option);
628+
posix_regex.should_search_match_capture_groups("aababbbceef", "aababbbceef", match_default, {{3, 5}, {5, 11}});
629+
}
630+
631+
{
632+
// GH-996 test case
// only groups 1 and 3 are expected to match (both "["); groups 2 and 4 are expected to be unmatched
633+
const test_regex ecma_regex(&g_regexTester, R"( *((<<)|(\[)|(.+)))", ECMAScript);
634+
ecma_regex.should_search_match_capture_groups(
635+
" [<</Category/Export>>]>>", " [", match_default, {{1, 2}, {-1, -1}, {1, 2}, {-1, -1}});
636+
}
637+
}
638+
561639
void test_gh_993() {
562640
// GH-993 regex::icase is not handled correctly for some input.
563641
{
@@ -1100,6 +1178,7 @@ int main() {
11001178
test_VSO_225160_match_eol_flag();
11011179
test_VSO_226914_word_boundaries();
11021180
test_construction_from_nullptr_and_zero();
1181+
test_gh_731();
11031182
test_gh_993();
11041183
test_gh_4995();
11051184
test_gh_5058();

tests/tr1/tests/regex2/test.cpp

+4-2
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ static const regex_test tests[] = {
659659
{__LINE__, T("a[a-z]\\{2,4\\}"), T("abcdefghi"), "1 0 5", BASIC | GREP},
660660
{__LINE__, T("a[a-z]{2,4}?"), T("abcdefghi"), "1 0 3", ECMA},
661661
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 4 2 4", ECMA},
662-
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 5 6", EEA},
662+
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 0 6", EEA},
663663
{__LINE__, T("(z)((a+)?(b+)?(c))*"), T("zaacbbbcac"), "6 0 10 0 1 8 10 8 9 -1 -1 9 10", ECMA},
664664
{__LINE__, T("(a*)b\\1+"), T("baaaac"), "2 0 1 0 0", ECMA},
665665
{__LINE__, T("(?=(a+))"), T("baaabac"), "2 1 1 1 4", ECMA},
@@ -774,7 +774,9 @@ static const regex_test tests[] = {
774774
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"), T("#define some_symbol(x) #x"), "2 0 25 -1 -1",
775775
ECMA | AWK},
776776
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
777-
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA | AWK},
777+
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA},
778+
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
779+
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 28 42", AWK},
778780
};
779781

780782
static STD string check_matches(

0 commit comments

Comments
 (0)