Skip to content

Commit 73df14e

Browse files
committed
[GR-58535] Adopt "JDK-8314794: Improve UTF8 String supports"
PullRequest: graal/18910
2 parents 1767a11 + 55609a7 commit 73df14e

File tree

2 files changed

+59
-24
lines changed
  • substratevm/src
    • com.oracle.objectfile/src/com/oracle/objectfile/io
    • com.oracle.svm.core/src/com/oracle/svm/core/util

2 files changed

+59
-24
lines changed

substratevm/src/com.oracle.objectfile/src/com/oracle/objectfile/io/Utf8.java

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,22 @@ public final class Utf8 {
3636
private Utf8() {
3737
}
3838

39+
private static int utf8Size(char c) {
40+
// Based on
41+
// https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L479-L488
42+
if ((0x0001 <= c) && (c <= 0x007F)) {
43+
// ASCII character
44+
return 1;
45+
} else if (c <= 0x07FF) {
46+
return 2;
47+
} else {
48+
return 3;
49+
}
50+
}
51+
3952
/**
40-
* @return the length in bytes of the UTF8 representation of the string
53+
* @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
54+
* return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
4155
*/
4256
public static int utf8Length(String string) {
4357
return utf8Length(string, 0, string.length());
@@ -46,24 +60,29 @@ public static int utf8Length(String string) {
4660
/**
4761
* @param beginIndex first index that is part of the region, inclusive
4862
* @param endIndex index at the end of the region, exclusive
49-
* @return the length in bytes of the UTF8 representation of the string region
63+
* @return the length as {@code int} in bytes of the UTF8 representation of the string region.
64+
* Might return a truncated size if the value does not fit into {@code int} (see
65+
* JDK-8328877).
5066
*/
5167
public static int utf8Length(String s, int beginIndex, int endIndex) {
68+
// Based on
69+
// https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L511-L526.
5270
if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
5371
throw new StringIndexOutOfBoundsException();
5472
}
55-
int length = 0;
56-
for (int i = beginIndex; i < endIndex; i++) {
57-
final int c = s.charAt(i);
58-
if ((c >= 0x0001) && (c <= 0x007F)) {
59-
length++;
60-
} else if (c > 0x07FF) {
61-
length += 3;
62-
} else {
63-
length += 2;
73+
long result = 0;
74+
for (int index = beginIndex; index < endIndex; index++) {
75+
char c = s.charAt(index);
76+
long sz = utf8Size(c);
77+
// If the length is > INT_MAX-1 we truncate at a completed
78+
// modified-UTF8 encoding. This allows for +1 to be added
79+
// by the caller for NUL-termination, without overflow.
80+
if (result + sz > Integer.MAX_VALUE - 1) {
81+
break;
6482
}
83+
result += sz;
6584
}
66-
return length;
85+
return (int) result;
6786
}
6887

6988
/**

substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/util/Utf8.java

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,21 @@ public final class Utf8 {
3939
private Utf8() {
4040
}
4141

42+
@BasedOnJDKFile("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L479-L488")
43+
private static int utf8Size(char c) {
44+
if ((0x0001 <= c) && (c <= 0x007F)) {
45+
// ASCII character
46+
return 1;
47+
} else if (c <= 0x07FF) {
48+
return 2;
49+
} else {
50+
return 3;
51+
}
52+
}
53+
4254
/**
43-
* @return the length in bytes of the UTF8 representation of the string
55+
* @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
56+
* return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
4457
*/
4558
public static int utf8Length(String string) {
4659
return utf8Length(string, 0, string.length());
@@ -49,24 +62,27 @@ public static int utf8Length(String string) {
4962
/**
5063
* @param beginIndex first index that is part of the region, inclusive
5164
* @param endIndex index at the end of the region, exclusive
52-
* @return the length in bytes of the UTF8 representation of the string region
65+
* @return the length as {@code int} in bytes of the UTF8 representation of the string region.
66+
* Might return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
5367
*/
68+
@BasedOnJDKFile("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L511-L526")
5469
public static int utf8Length(String s, int beginIndex, int endIndex) {
5570
if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
5671
throw new StringIndexOutOfBoundsException();
5772
}
58-
int length = 0;
59-
for (int i = beginIndex; i < endIndex; i++) {
60-
final int c = s.charAt(i);
61-
if ((c >= 0x0001) && (c <= 0x007F)) {
62-
length++;
63-
} else if (c > 0x07FF) {
64-
length += 3;
65-
} else {
66-
length += 2;
73+
long result = 0;
74+
for (int index = beginIndex; index < endIndex; index++) {
75+
char c = s.charAt(index);
76+
long sz = utf8Size(c);
77+
// If the length is > INT_MAX-1 we truncate at a completed
78+
// modified-UTF8 encoding. This allows for +1 to be added
79+
// by the caller for NUL-termination, without overflow.
80+
if (result + sz > Integer.MAX_VALUE - 1) {
81+
break;
6782
}
83+
result += sz;
6884
}
69-
return length;
85+
return (int) result;
7086
}
7187

7288
/**

0 commit comments

Comments
 (0)