From 5fcf5e2e8c79b5e7d2b205026bc66f2f243214bf Mon Sep 17 00:00:00 2001 From: liam Date: Fri, 4 Oct 2024 11:25:54 -0400 Subject: [PATCH] Improve lexer implementation and add additional integration tests (#29) --- src/lexer.rs | 123 ++++++++++++++++++++++++++++++------------- tests/integration.rs | 85 +++++++++++++++++++++++++++++- 2 files changed, 170 insertions(+), 38 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index f25513d..dd24cb8 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -16,61 +16,78 @@ impl<'a> Lexer<'a> { fn tokenize(&self) -> Result> { let mut tokens = Vec::new(); - let mut chars = self.src.chars().peekable(); + let mut current_token = String::new(); + while let Some(ch) = chars.next() { match ch { '\'' | '"' => { - let mut group = String::new(); - - chars - .clone() - .collect::>() - .iter() - .find(|next| **next == ch) - .ok_or(Error::LexError { - message: "Unmatched delimeter".into(), - })?; - - for next in chars.by_ref() { - match next { - next if next == ch => break, - _ => group.push(next), - } + if !current_token.is_empty() { + tokens.push(current_token); + current_token = String::new(); } - chars.next(); + let quoted_string = self.parse_quoted_string(ch, &mut chars)?; - tokens.push(group); + tokens.push(quoted_string); + } + ' ' | '\t' => { + if !current_token.is_empty() { + tokens.push(current_token); + current_token = String::new(); + } + } + '\\' => { + if let Some(next_ch) = chars.next() { + current_token.push(next_ch); + } } _ => { - let mut group = String::new(); + current_token.push(ch); + } + } + } - group.push(ch); + if !current_token.is_empty() { + tokens.push(current_token); + } + + Ok(tokens) + } + + fn parse_quoted_string( + &self, + quote: char, + chars: &mut std::iter::Peekable, + ) -> Result { + let mut result = String::new(); + let mut escaped = false; - while let Some(next) = chars.peek() { - match next { - '\'' | '"' => break, - _ => { - group.push(*next); - chars.next(); - } + for ch in chars.by_ref() { + match ch { + _ if escaped => { + match ch { + '\\' | '\'' | '"' => result.push(ch), + 'n' => result.push('\n'), + 't' => result.push('\t'), + 'r' => result.push('\r'), + _ => { + result.push('\\'); + result.push(ch); } } - - tokens.extend( - group - .trim() - .split(' ') - .map(|argument| argument.to_owned()) - .collect::>(), - ); + escaped = false; } + '\\' => escaped = true, + ch if ch == quote => return Ok(result), + _ => result.push(ch), } } - Ok(tokens) + Err(Error::LexError { + message: "Unmatched delimiter".into(), + }) } } @@ -115,4 +132,36 @@ mod tests { fn unmatched_delimiter() { assert!(lex("-c 'echo foo").is_err()); } + + #[test] + fn escaped_quotes() { + assert_eq!( + lex(r#"echo "Hello \"World\"""#).unwrap(), + vec!["echo", r#"Hello "World""#] + ); + } + + #[test] + fn nested_quotes() { + assert_eq!( + lex(r#"echo "outer 'inner' outer""#).unwrap(), + vec!["echo", r#"outer 'inner' outer"#] + ); + } + + #[test] + fn complex_command() { + assert_eq!( + lex(r#"bash -c "echo 'hello world' | tr ' ' '\n' | sort | uniq -c | sort -nr""#).unwrap(), + vec!["bash", "-c", "echo 'hello world' | tr ' ' '\n' | sort | uniq -c | sort -nr"] + ); + } + + #[test] + fn escaped_characters() { + assert_eq!( + lex(r#"echo "Hello\nWorld\t\"\\" 'Single\'Quote'"#).unwrap(), + vec!["echo", "Hello\nWorld\t\"\\", "Single'Quote"] + ); + } } diff --git a/tests/integration.rs b/tests/integration.rs index 6858147..8dbe7c2 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -533,7 +533,7 @@ fn inline_unmatched_delimiter() -> Result { .expected_status(1) .expected_stderr( " - error: Lex Error: Unmatched delimeter + error: Lex Error: Unmatched delimiter ", ) .run() @@ -612,3 +612,86 @@ fn grapheme_handling() -> Result { ) .run() } + +#[test] +fn large_output_handling() -> Result { + Test::new()? + .markdown( + r#" + ```present python -c "print('Large ' * 1000)" + ``` + "#, + ) + .expected_status(0) + .expected_stdout(&format!( + r#" + ```present python -c "print('Large ' * 1000)" + {} + ``` + "#, + "Large ".repeat(1000) + )) + .run() +} + +#[test] +fn escaping_special_characters() -> Result { + Test::new()? + .markdown( + r#" + ```present echo "Special chars: && || > < | ; \" ' \\" + ``` + "#, + ) + .expected_status(0) + .expected_stdout( + r#" + ```present echo "Special chars: && || > < | ; \" ' \\" + Special chars: && || > < | ; " ' \ + ``` + "#, + ) + .run() +} + +#[test] +fn complex_shell_pipeline() -> Result { + Test::new()? + .markdown( + r#" + ```present bash -c "echo 'hello world' | tr ' ' '\n' | sort | uniq -c | sort -nr | sed 's/^[[:space:]]*//' " + ``` + "#, + ) + .expected_status(0) + .expected_stdout( + r#" + ```present bash -c "echo 'hello world' | tr ' ' '\n' | sort | uniq -c | sort -nr | sed 's/^[[:space:]]*//' " + 1 world + 1 hello + ``` + "#, + ) + .run() +} + +#[test] +fn unicode_normalization() -> Result { + Test::new()? + .markdown( + r#" + ```present bash -c "echo \"é\" | xxd -p && echo \"é\" | xxd -p" + ``` + "#, + ) + .expected_status(0) + .expected_stdout( + r#" + ```present bash -c "echo \"é\" | xxd -p && echo \"é\" | xxd -p" + c3a90a + 65cc810a + ``` + "#, + ) + .run() +}