@@ -39,8 +39,21 @@ public final class Utf8 {
39
39
private Utf8 () {
40
40
}
41
41
42
+ @ BasedOnJDKFile ("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L479-L488" )
43
+ private static int utf8Size (char c ) {
44
+ if ((0x0001 <= c ) && (c <= 0x007F )) {
45
+ // ASCII character
46
+ return 1 ;
47
+ } else if (c <= 0x07FF ) {
48
+ return 2 ;
49
+ } else {
50
+ return 3 ;
51
+ }
52
+ }
53
+
42
54
/**
43
- * @return the length in bytes of the UTF8 representation of the string
55
+ * @return the length as {@code int} in bytes of the UTF8 representation of the string. Might
56
+ * return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
44
57
*/
45
58
public static int utf8Length (String string ) {
46
59
return utf8Length (string , 0 , string .length ());
@@ -49,24 +62,27 @@ public static int utf8Length(String string) {
49
62
/**
50
63
* @param beginIndex first index that is part of the region, inclusive
51
64
* @param endIndex index at the end of the region, exclusive
52
- * @return the length in bytes of the UTF8 representation of the string region
65
+ * @return the length as {@code int} in bytes of the UTF8 representation of the string region. Might
66
+ * return a truncated size if the value does not fit into {@code int} (see JDK-8328877).
53
67
*/
68
+ @ BasedOnJDKFile ("https://github.com/openjdk/jdk/blob/jdk-24+16/src/hotspot/share/utilities/utf8.cpp#L511-L526" )
54
69
public static int utf8Length (String s , int beginIndex , int endIndex ) {
55
70
if (beginIndex < 0 || endIndex > s .length () || beginIndex > endIndex ) {
56
71
throw new StringIndexOutOfBoundsException ();
57
72
}
58
- int length = 0 ;
59
- for (int i = beginIndex ; i < endIndex ; i ++) {
60
- final int c = s .charAt (i );
61
- if (( c >= 0x0001 ) && ( c <= 0x007F )) {
62
- length ++;
63
- } else if ( c > 0x07FF ) {
64
- length += 3 ;
65
- } else {
66
- length += 2 ;
73
+ long result = 0 ;
74
+ for (int index = beginIndex ; index < endIndex ; index ++) {
75
+ char c = s .charAt (index );
76
+ long sz = utf8Size ( c );
77
+ // If the length is > INT_MAX-1 we truncate at a completed
78
+ // modified-UTF8 encoding. This allows for +1 to be added
79
+ // by the caller for NUL-termination, without overflow.
80
+ if ( result + sz > Integer . MAX_VALUE - 1 ) {
81
+ break ;
67
82
}
83
+ result += sz ;
68
84
}
69
- return length ;
85
+ return ( int ) result ;
70
86
}
71
87
72
88
/**
0 commit comments