Skip to content

Commit afd3d19

Browse files
authored
fix: handle unicode in lexer (#187)
1 parent 3f8b479 commit afd3d19

File tree

3 files changed

+31
-28
lines changed

3 files changed

+31
-28
lines changed

crates/pg_lexer/src/lib.rs

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ mod codegen;
33
use pg_query::protobuf::{KeywordKind, ScanToken};
44
use regex::Regex;
55
use std::{collections::VecDeque, sync::LazyLock};
6-
use text_size::{TextRange, TextSize};
6+
use text_size::{TextLen, TextRange, TextSize};
77

88
pub use crate::codegen::SyntaxKind;
99

@@ -119,25 +119,24 @@ pub fn lex(text: &str) -> Vec<Token> {
119119

120120
// merge the two token lists
121121
let mut tokens: Vec<Token> = Vec::new();
122-
let mut pos = 0;
122+
let mut pos = TextSize::from(0);
123123

124-
while pos < text.len() {
125-
if !pg_query_tokens.is_empty() && pg_query_tokens[0].start == i32::try_from(pos).unwrap() {
124+
while pos < text.text_len() {
125+
if !pg_query_tokens.is_empty()
126+
&& TextSize::from(u32::try_from(pg_query_tokens[0].start).unwrap()) == pos
127+
{
126128
let pg_query_token = pg_query_tokens.pop_front().unwrap();
127-
let token_text: String = text
128-
.chars()
129-
.skip(usize::try_from(pg_query_token.start).unwrap())
130-
.take(
131-
usize::try_from(pg_query_token.end).unwrap()
132-
- usize::try_from(pg_query_token.start).unwrap(),
133-
)
134-
.collect();
135-
let len = token_text.len();
129+
130+
// the lexer returns byte indices, so we need to slice
131+
let token_text = &text[usize::try_from(pg_query_token.start).unwrap()
132+
..usize::try_from(pg_query_token.end).unwrap()];
133+
134+
let len = token_text.text_len();
136135
let has_whitespace = token_text.contains(" ") || token_text.contains("\n");
137136
tokens.push(Token {
138137
token_type: TokenType::from(&pg_query_token),
139138
kind: SyntaxKind::from(&pg_query_token),
140-
text: token_text,
139+
text: token_text.to_string(),
141140
span: TextRange::new(
142141
TextSize::from(u32::try_from(pg_query_token.start).unwrap()),
143142
TextSize::from(u32::try_from(pg_query_token.end).unwrap()),
@@ -147,8 +146,7 @@ pub fn lex(text: &str) -> Vec<Token> {
147146

148147
if has_whitespace {
149148
while !whitespace_tokens.is_empty()
150-
&& whitespace_tokens[0].span.start()
151-
< TextSize::from(u32::try_from(pos).unwrap())
149+
&& whitespace_tokens[0].span.start() < TextSize::from(u32::from(pos))
152150
{
153151
whitespace_tokens.pop_front();
154152
}
@@ -158,16 +156,21 @@ pub fn lex(text: &str) -> Vec<Token> {
158156
}
159157

160158
if !whitespace_tokens.is_empty()
161-
&& whitespace_tokens[0].span.start() == TextSize::from(u32::try_from(pos).unwrap())
159+
&& whitespace_tokens[0].span.start() == TextSize::from(u32::from(pos))
162160
{
163161
let whitespace_token = whitespace_tokens.pop_front().unwrap();
164-
let len = whitespace_token.text.len();
162+
let len = whitespace_token.text.text_len();
165163
tokens.push(whitespace_token);
166164
pos += len;
167165
continue;
168166
}
169167

170-
panic!("No token found at position {}", pos);
168+
let usize_pos = usize::from(pos);
169+
panic!(
170+
"No token found at position {:?}: '{:?}'",
171+
pos,
172+
text.get(usize_pos..usize_pos + 1)
173+
);
171174
}
172175

173176
tokens
@@ -177,6 +180,13 @@ pub fn lex(text: &str) -> Vec<Token> {
177180
mod tests {
178181
use super::*;
179182

183+
#[test]
184+
fn test_special_chars() {
185+
let input = "insert into c (name, full_name) values ('Å', 1);";
186+
let tokens = lex(input);
187+
assert!(!tokens.is_empty());
188+
}
189+
180190
#[test]
181191
fn test_tab_tokens() {
182192
let input = "select\t1";

crates/pg_workspace/src/settings.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ use std::{
44
borrow::Cow,
55
num::NonZeroU64,
66
path::{Path, PathBuf},
7-
str::FromStr,
87
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
98
};
109

@@ -13,7 +12,7 @@ use pg_configuration::{
1312
database::PartialDatabaseConfiguration,
1413
diagnostics::InvalidIgnorePattern,
1514
files::FilesConfiguration,
16-
migrations::{self, MigrationsConfiguration, PartialMigrationsConfiguration},
15+
migrations::{MigrationsConfiguration, PartialMigrationsConfiguration},
1716
ConfigurationDiagnostic, LinterConfiguration, PartialConfiguration,
1817
};
1918
use pg_fs::FileSystem;

crates/pg_workspace/src/workspace/server.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
use std::{
2-
fs,
3-
future::Future,
4-
panic::RefUnwindSafe,
5-
path::{Path, PathBuf},
6-
sync::RwLock,
7-
};
1+
use std::{fs, future::Future, panic::RefUnwindSafe, path::Path, sync::RwLock};
82

93
use analyser::AnalyserVisitorBuilder;
104
use change::StatementChange;

0 commit comments

Comments
 (0)