From 21550be5b6c0f84cba7303c15c5f43c82455b7ce Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Fri, 9 May 2025 14:47:51 +0200 Subject: [PATCH 1/3] Add spec for Regexp.linear_time? + binary Regexp --- spec/ruby/core/regexp/linear_time_spec.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spec/ruby/core/regexp/linear_time_spec.rb b/spec/ruby/core/regexp/linear_time_spec.rb index 4dc436264fd7..2f5795fefdd4 100644 --- a/spec/ruby/core/regexp/linear_time_spec.rb +++ b/spec/ruby/core/regexp/linear_time_spec.rb @@ -7,6 +7,10 @@ Regexp.linear_time?('a').should == true end + it "returns true if matching can be done in linear time for a binary Regexp" do + Regexp.linear_time?(/[\x80-\xff]/n).should == true + end + it "return false if matching can't be done in linear time" do Regexp.linear_time?(/(a)\1/).should == false Regexp.linear_time?("(a)\\1").should == false From 9eff8294549e8d96cd26807eb77a01da7c1a43c1 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Fri, 9 May 2025 14:48:35 +0200 Subject: [PATCH 2/3] [GR-64899] Fix binary Regexp compilation to TRegex * See https://github.com/oracle/truffleruby/issues/3858 * We need to pass a java.lang.String to TRegex, in this case we can pass it as raw bytes since we also pass the encoding name to TRegex. * Remove the UnsupportedCharsetException catch clause as no Charset should be involved in this conversion since the migration to TruffleString. --- CHANGELOG.md | 1 + .../org/truffleruby/core/regexp/TRegexCache.java | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a273d5dd01f..70d2ebf1c0fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Compatibility: Performance: +* Use TRegex for binary Regexps with non-US-ASCII characters in the pattern like `/[\x80-\xff]/n` (#3858, @eregon). Changes: diff --git a/src/main/java/org/truffleruby/core/regexp/TRegexCache.java b/src/main/java/org/truffleruby/core/regexp/TRegexCache.java index ddb0f13c82a2..a894394a4709 100644 --- a/src/main/java/org/truffleruby/core/regexp/TRegexCache.java +++ b/src/main/java/org/truffleruby/core/regexp/TRegexCache.java @@ -9,8 +9,6 @@ */ package org.truffleruby.core.regexp; -import java.nio.charset.UnsupportedCharsetException; - import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; import com.oracle.truffle.api.interop.InteropLibrary; @@ -155,12 +153,12 @@ private static Object compileTRegex(RubyContext context, RubyRegexp regexp, bool var tstring = tstringBuilder.toTString(); try { processedRegexpSource = TStringUtils.toJavaStringOrThrow(tstring, tstringBuilder.getRubyEncoding()); - } catch (CannotConvertBinaryRubyStringToJavaString | UnsupportedCharsetException e) { - // Some strings cannot be converted to Java strings, e.g. strings with the - // BINARY encoding containing characters higher than 127. - // Also, some charsets might not be supported on the JVM and therefore - // a conversion to j.l.String might be impossible. - return null; + } catch (CannotConvertBinaryRubyStringToJavaString e) { + // A BINARY regexp with non-US-ASCII bytes, pass it as "raw bytes" instead. + // TRegex knows how to interpret those bytes correctly as we pass the encoding name as well. + var latin1string = tstring.forceEncodingUncached(Encodings.BINARY.tencoding, + Encodings.ISO_8859_1.tencoding); + processedRegexpSource = TStringUtils.toJavaStringOrThrow(latin1string, Encodings.ISO_8859_1); } String flags = optionsToFlags(regexp.options, atStart); From 61e842c75523576294d57e917128765d7cfd3d4b Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Fri, 9 May 2025 14:55:44 +0200 Subject: [PATCH 3/3] Check if TRegex supports the encoding first as this is very fast * Does not matter currently as all callers already check the encoding before, but that could change. --- .../java/org/truffleruby/core/regexp/TRegexCache.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/truffleruby/core/regexp/TRegexCache.java b/src/main/java/org/truffleruby/core/regexp/TRegexCache.java index a894394a4709..1bdfbdb7c590 100644 --- a/src/main/java/org/truffleruby/core/regexp/TRegexCache.java +++ b/src/main/java/org/truffleruby/core/regexp/TRegexCache.java @@ -137,6 +137,11 @@ public static String toTRegexEncoding(RubyEncoding encoding) { @TruffleBoundary private static Object compileTRegex(RubyContext context, RubyRegexp regexp, boolean atStart, RubyEncoding enc) { + String tRegexEncoding = TRegexCache.toTRegexEncoding(enc); + if (tRegexEncoding == null) { + return null; + } + String processedRegexpSource; RubyEncoding[] fixedEnc = new RubyEncoding[]{ null }; final TStringBuilder tstringBuilder; @@ -163,11 +168,6 @@ private static Object compileTRegex(RubyContext context, RubyRegexp regexp, bool String flags = optionsToFlags(regexp.options, atStart); - String tRegexEncoding = TRegexCache.toTRegexEncoding(enc); - if (tRegexEncoding == null) { - return null; - } - String ignoreAtomicGroups = context.getOptions().TRUFFLE_REGEX_IGNORE_ATOMIC_GROUPS ? ",IgnoreAtomicGroups=true" : "";