Skip to content

Commit e959e34

Browse files
committed
[GR-64899] Fix binary Regexp compilation to TRegex
PullRequest: truffleruby/4530
2 parents 81980d6 + 61e842c commit e959e34

File tree

3 files changed

+16
-13
lines changed

3 files changed

+16
-13
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Compatibility:
2323

2424
Performance:
2525

26+
* Use TRegex for binary Regexps with non-US-ASCII characters in the pattern like `/[\x80-\xff]/n` (#3858, @eregon).
2627

2728
Changes:
2829

spec/ruby/core/regexp/linear_time_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
Regexp.linear_time?('a').should == true
88
end
99

10+
it "returns true if matching can be done in linear time for a binary Regexp" do
11+
Regexp.linear_time?(/[\x80-\xff]/n).should == true
12+
end
13+
1014
it "return false if matching can't be done in linear time" do
1115
Regexp.linear_time?(/(a)\1/).should == false
1216
Regexp.linear_time?("(a)\\1").should == false

src/main/java/org/truffleruby/core/regexp/TRegexCache.java

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
*/
1010
package org.truffleruby.core.regexp;
1111

12-
import java.nio.charset.UnsupportedCharsetException;
13-
1412
import com.oracle.truffle.api.CompilerDirectives;
1513
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
1614
import com.oracle.truffle.api.interop.InteropLibrary;
@@ -139,6 +137,11 @@ public static String toTRegexEncoding(RubyEncoding encoding) {
139137

140138
@TruffleBoundary
141139
private static Object compileTRegex(RubyContext context, RubyRegexp regexp, boolean atStart, RubyEncoding enc) {
140+
String tRegexEncoding = TRegexCache.toTRegexEncoding(enc);
141+
if (tRegexEncoding == null) {
142+
return null;
143+
}
144+
142145
String processedRegexpSource;
143146
RubyEncoding[] fixedEnc = new RubyEncoding[]{ null };
144147
final TStringBuilder tstringBuilder;
@@ -155,21 +158,16 @@ private static Object compileTRegex(RubyContext context, RubyRegexp regexp, bool
155158
var tstring = tstringBuilder.toTString();
156159
try {
157160
processedRegexpSource = TStringUtils.toJavaStringOrThrow(tstring, tstringBuilder.getRubyEncoding());
158-
} catch (CannotConvertBinaryRubyStringToJavaString | UnsupportedCharsetException e) {
159-
// Some strings cannot be converted to Java strings, e.g. strings with the
160-
// BINARY encoding containing characters higher than 127.
161-
// Also, some charsets might not be supported on the JVM and therefore
162-
// a conversion to j.l.String might be impossible.
163-
return null;
161+
} catch (CannotConvertBinaryRubyStringToJavaString e) {
162+
// This is a BINARY regexp containing non-US-ASCII bytes, so pass its source as "raw bytes" instead.
163+
// TRegex interprets those bytes correctly because we also pass it the encoding name.
164+
var latin1string = tstring.forceEncodingUncached(Encodings.BINARY.tencoding,
165+
Encodings.ISO_8859_1.tencoding);
166+
processedRegexpSource = TStringUtils.toJavaStringOrThrow(latin1string, Encodings.ISO_8859_1);
164167
}
165168

166169
String flags = optionsToFlags(regexp.options, atStart);
167170

168-
String tRegexEncoding = TRegexCache.toTRegexEncoding(enc);
169-
if (tRegexEncoding == null) {
170-
return null;
171-
}
172-
173171
String ignoreAtomicGroups = context.getOptions().TRUFFLE_REGEX_IGNORE_ATOMIC_GROUPS
174172
? ",IgnoreAtomicGroups=true"
175173
: "";

0 commit comments

Comments
 (0)