@@ -3,7 +3,7 @@ mod codegen;
 use pg_query::protobuf::{KeywordKind, ScanToken};
 use regex::Regex;
 use std::{collections::VecDeque, sync::LazyLock};
-use text_size::{TextRange, TextSize};
+use text_size::{TextLen, TextRange, TextSize};
 
 pub use crate::codegen::SyntaxKind;
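The only import change pulls in the `TextLen` trait from the `text-size` crate; it is what puts the `text_len()` method on string slices, returning the UTF-8 byte length as a `TextSize` instead of a `usize`. A minimal sketch of the three length notions in play (standalone, not from this commit):

    use text_size::{TextLen, TextSize};

    fn main() {
        let s = "Å1"; // 'Å' is one char but two bytes in UTF-8
        assert_eq!(s.len(), 3);                      // byte length as usize
        assert_eq!(s.chars().count(), 2);            // char count
        assert_eq!(s.text_len(), TextSize::from(3)); // byte length as TextSize
    }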
@@ -119,25 +119,24 @@ pub fn lex(text: &str) -> Vec<Token> {
 
     // merge the two token lists
     let mut tokens: Vec<Token> = Vec::new();
-    let mut pos = 0;
+    let mut pos = TextSize::from(0);
 
-    while pos < text.len() {
-        if !pg_query_tokens.is_empty() && pg_query_tokens[0].start == i32::try_from(pos).unwrap() {
+    while pos < text.text_len() {
+        if !pg_query_tokens.is_empty()
+            && TextSize::from(u32::try_from(pg_query_tokens[0].start).unwrap()) == pos
+        {
             let pg_query_token = pg_query_tokens.pop_front().unwrap();
-            let token_text: String = text
-                .chars()
-                .skip(usize::try_from(pg_query_token.start).unwrap())
-                .take(
-                    usize::try_from(pg_query_token.end).unwrap()
-                        - usize::try_from(pg_query_token.start).unwrap(),
-                )
-                .collect();
-            let len = token_text.len();
+
+            // the lexer returns byte indices, so we need to slice
+            let token_text = &text[usize::try_from(pg_query_token.start).unwrap()
+                ..usize::try_from(pg_query_token.end).unwrap()];
+
+            let len = token_text.text_len();
             let has_whitespace = token_text.contains(" ") || token_text.contains("\n");
             tokens.push(Token {
                 token_type: TokenType::from(&pg_query_token),
                 kind: SyntaxKind::from(&pg_query_token),
-                text: token_text,
+                text: token_text.to_string(),
                 span: TextRange::new(
                     TextSize::from(u32::try_from(pg_query_token.start).unwrap()),
                     TextSize::from(u32::try_from(pg_query_token.end).unwrap()),
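This hunk is the heart of the fix: `pg_query` reports `start` and `end` as byte offsets, but the old `chars().skip().take()` pipeline counted chars, so the two drift apart as soon as the input contains a multi-byte character (byte slicing is also O(1) where the char walk was O(n)). A standalone sketch of the mismatch (the offsets are illustrative, not produced by `pg_query`):

    fn main() {
        let text = "select 'Å', 1";
        // say a lexer reports the trailing `1` at byte range 13..14;
        // 'Å' is one char but two bytes, so byte offsets outrun char indices
        let (start, end) = (13, 14);

        // byte slicing recovers the token exactly
        assert_eq!(&text[start..end], "1");

        // char-based skip/take misses it entirely: the text is only
        // 13 chars long, so skipping 13 chars leaves nothing to take
        let by_chars: String = text.chars().skip(start).take(end - start).collect();
        assert_eq!(by_chars, "");
    }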
@@ -147,8 +146,7 @@ pub fn lex(text: &str) -> Vec<Token> {
 
             if has_whitespace {
                 while !whitespace_tokens.is_empty()
-                    && whitespace_tokens[0].span.start()
-                        < TextSize::from(u32::try_from(pos).unwrap())
+                    && whitespace_tokens[0].span.start() < TextSize::from(u32::from(pos))
                 {
                     whitespace_tokens.pop_front();
                 }
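With `pos` now a `TextSize` itself, `TextSize::from(u32::from(pos))` is an identity round-trip kept only to preserve the shape of the old comparison; `TextSize` is `Copy + Ord`, so the span start could be compared against `pos` directly. A sketch, using only the `text-size` API:

    use text_size::TextSize;

    fn main() {
        let pos = TextSize::from(5);
        let start = TextSize::from(3);
        assert!(start < pos); // TextSize is Ord: no conversion needed
        assert_eq!(TextSize::from(u32::from(pos)), pos); // the round-trip is a no-op
    }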
@@ -158,16 +156,21 @@ pub fn lex(text: &str) -> Vec<Token> {
         }
 
         if !whitespace_tokens.is_empty()
-            && whitespace_tokens[0].span.start() == TextSize::from(u32::try_from(pos).unwrap())
+            && whitespace_tokens[0].span.start() == TextSize::from(u32::from(pos))
         {
             let whitespace_token = whitespace_tokens.pop_front().unwrap();
-            let len = whitespace_token.text.len();
+            let len = whitespace_token.text.text_len();
             tokens.push(whitespace_token);
             pos += len;
             continue;
         }
 
-        panic!("No token found at position {}", pos);
+        let usize_pos = usize::from(pos);
+        panic!(
+            "No token found at position {:?}: '{:?}'",
+            pos,
+            text.get(usize_pos..usize_pos + 1)
+        );
     }
 
     tokens
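The richer panic message pulls in a byte of context through `str::get`, which returns `Option<&str>` instead of panicking, so the diagnostic itself cannot crash on an out-of-range index or on a slice that cuts through a multi-byte character (it just prints `None`). A small sketch of that behavior:

    fn main() {
        let text = "Å1";
        assert_eq!(text.get(2..3), Some("1")); // in bounds, on a char boundary
        assert_eq!(text.get(1..2), None);      // splits the two-byte 'Å': no panic
        assert_eq!(text.get(9..10), None);     // past the end: also None
    }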
@@ -177,6 +180,13 @@ pub fn lex(text: &str) -> Vec<Token> {
 mod tests {
     use super::*;
 
+    #[test]
+    fn test_special_chars() {
+        let input = "insert into c (name, full_name) values ('Å', 1);";
+        let tokens = lex(input);
+        assert!(!tokens.is_empty());
+    }
+
     #[test]
     fn test_tab_tokens() {
         let input = "select\t 1";
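The new test is the regression case for the slicing fix: 'Å' is one char but two UTF-8 bytes, so the old char-indexed arithmetic would misalign against `pg_query`'s byte offsets and the merge loop would hit the `panic!` above; reaching `!tokens.is_empty()` means the lexer made it through the whole statement. A stricter companion test one could add (hypothetical, assuming the merged tokens tile the input with no gaps or overlaps) would check that the spans cover every byte:

    #[test]
    fn test_spans_cover_input() {
        let input = "select 'Å';";
        let tokens = lex(input);
        let covered: u32 = tokens.iter().map(|t| u32::from(t.span.len())).sum();
        assert_eq!(covered, u32::from(input.text_len()));
    }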