Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly handle unicode and grapheme clusters #26

Merged
merged 3 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
593 changes: 343 additions & 250 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 11 additions & 10 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ keywords = ["command-line", "productivity", "utility", "markdown", "bash"]
resolver = "2"

[dependencies]
clap = { version = "4.0.9", features = ["derive"] }
console = "0.15.2"
pulldown-cmark = { version = "0.9.2", default-features = false, features = ["simd"] }
ropey = "1.5.0"
similar = "2.2.0"
snafu = { version = "0.7.1", default-features = false, features = ["std"] }
termimad = "0.20.3"
walkdir = "2.3.2"
clap = { version = "4.5.19", features = ["derive"] }
console = "0.15.8"
pulldown-cmark = { version = "0.9.6", default-features = false, features = ["simd"] }
ropey = "1.6.1"
similar = "2.6.0"
snafu = { version = "0.7.5", default-features = false, features = ["std"] }
termimad = "0.20.6"
unicode-segmentation = "1.12.0"
walkdir = "2.5.0"

[dev-dependencies]
executable-path = "1.0.0"
pretty_assertions = "1.3.0"
pretty_assertions = "1.4.1"
tempdir = "0.3.7"
unindent = "0.1.10"
unindent = "0.1.11"
3 changes: 1 addition & 2 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
//! Re-exports commonly used items from the standard library and external crates

// std
pub(crate) use std::{
fs,
io::{self, Write},
Expand All @@ -9,12 +8,12 @@ pub(crate) use std::{
process, str,
};

// dependencies
pub(crate) use {
console::Style,
pulldown_cmark::{CodeBlockKind, Event, Parser as MarkdownParser, Tag},
ropey::Rope,
similar::{ChangeTag, TextDiff},
snafu::Snafu,
termimad::print_inline,
unicode_segmentation::UnicodeSegmentation,
};
36 changes: 31 additions & 5 deletions src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@ pub struct Diff {
}

impl Diff {
/// Adjusts the diff's range by the given offset.
///
/// This method modifies the start and end points of the diff's range
/// based on the provided offset. It handles both positive and negative
/// offsets, using saturating arithmetic to prevent underflow or overflow.
pub(crate) fn offset(&mut self, offset: isize) {
if offset < 0 {
self.range.start = self.range.start.saturating_sub(offset.unsigned_abs());
self.range.end = self.range.end.saturating_sub(offset.unsigned_abs());
if offset >= 0 {
let offset = offset as usize;
self.range.start = self.range.start.saturating_add(offset);
self.range.end = self.range.end.saturating_add(offset);
} else {
self.range.start += offset as usize;
self.range.end += offset as usize;
let abs_offset = offset.unsigned_abs();
self.range.start = self.range.start.saturating_sub(abs_offset);
self.range.end = self.range.end.saturating_sub(abs_offset);
}
}

Expand Down Expand Up @@ -73,4 +80,23 @@ mod tests {
diff.offset(-10);
assert_eq!(diff.range, 0..0);
}

// Offsetting by the largest positive `isize` must shift both ends of the
// range by exactly that amount.
#[test]
fn offset_positive_large() {
    let shift = isize::MAX;
    let mut subject = diff();

    subject.offset(shift);

    let moved_by = shift as usize;
    let expected = (1 + moved_by)..(4 + moved_by);
    assert_eq!(subject.range, expected);
}

// Offsetting by the most negative `isize` must leave the range at `0..0`.
#[test]
fn offset_negative_large() {
    let shift = isize::MIN;
    let mut subject = diff();

    subject.offset(shift);

    let expected = 0..0;
    assert_eq!(subject.range, expected);
}
}
6 changes: 3 additions & 3 deletions src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ impl File {
/// If [`interactive`](File::interactive) is set to `true`, the user will be
/// asked if they want to apply the change for each diff.
pub fn present(&mut self) -> Result {
let mut offset = 0;
let mut offset: isize = 0;

let diffs = self.diffs().collect::<Result<Vec<Diff>>>()?;

for mut diff in diffs {
let prev = self.content.len_chars();
let prev = self.content.len_bytes();

diff.offset(offset);

Expand All @@ -111,7 +111,7 @@ impl File {
}

self.content.apply(diff.clone());
offset += self.content.len_chars() as isize - prev as isize;
offset += self.content.len_bytes() as isize - prev as isize;
}

Ok(())
Expand Down
20 changes: 20 additions & 0 deletions src/grapheme.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use crate::common::*;

/// Converts a byte offset into `s` to a grapheme-cluster index.
///
/// The result is the number of extended grapheme clusters in `s` whose
/// starting byte offset is strictly less than `byte_index`. A `byte_index`
/// of `0` therefore yields `0`, and any offset past the start of the last
/// cluster yields the total number of clusters.
pub(crate) fn byte_index_to_grapheme_index(
    s: &str,
    byte_index: usize,
) -> usize {
    s.grapheme_indices(true)
        .map(|(start, _)| start)
        .take_while(|&start| start < byte_index)
        .count()
}

/// Converts a grapheme-cluster index to the byte offset at which that
/// cluster starts in `s`.
///
/// Returns `s.len()` when `grapheme_index` is at or beyond the number of
/// extended grapheme clusters in `s`.
pub(crate) fn grapheme_index_to_byte_index(
    s: &str,
    grapheme_index: usize,
) -> usize {
    match s.grapheme_indices(true).nth(grapheme_index) {
        Some((start, _)) => start,
        None => s.len(),
    }
}
11 changes: 9 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ mod common;
mod diff;
mod error;
mod file;
mod grapheme;
mod lexer;
mod parser;
mod position;
Expand All @@ -34,8 +35,14 @@ pub use crate::{diff::Diff, error::Error, file::File};

// Public only to crate
pub(crate) use crate::{
codeblock::Codeblock, command::Command, lexer::Lexer, parser::Parser,
position::Position, prompt::prompt, rope_ext::RopeExt,
codeblock::Codeblock,
command::Command,
grapheme::{byte_index_to_grapheme_index, grapheme_index_to_byte_index},
lexer::Lexer,
parser::Parser,
position::Position,
prompt::prompt,
rope_ext::RopeExt,
};

/// Present's internal result type
Expand Down
85 changes: 70 additions & 15 deletions src/parser.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::{common::*, Codeblock, Command, Position, Result};
use crate::{
byte_index_to_grapheme_index, common::*, grapheme_index_to_byte_index,
Codeblock, Command, Position, Result,
};

#[derive(Debug, Clone)]
pub(crate) struct Parser<'a> {
Expand Down Expand Up @@ -39,33 +42,63 @@ impl<'a> Parser<'a> {
let start_start = range.start;
let mut start_end = start_start;

while let Some(ch) = self.src.chars().nth(start_end) {
match ch {
'`' => start_end += 1,
let src_graphemes: Vec<&str> = self.src.graphemes(true).collect();

while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, start_end))
{
match *grapheme {
"`" => {
start_end = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, start_end) + 1,
)
}
_ => break,
}
}

while let Some(ch) = self.src.chars().nth(start_end) {
match ch {
'`' | '\n' => break,
_ => start_end += 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, start_end))
{
match *grapheme {
"`" | "\n" => break,
_ => {
start_end = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, start_end) + 1,
)
}
}
}

let end_end = range.end - 1;
let mut end_start = end_end;

while let Some(ch) = self.src.chars().nth(end_start) {
match ch {
'`' => break,
_ => end_start -= 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, end_start))
{
match *grapheme {
"`" => break,
_ => {
end_start = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, end_start) - 1,
)
}
}
}

while let Some(ch) = self.src.chars().nth(end_start) {
match ch {
'`' => end_start -= 1,
while let Some(grapheme) =
src_graphemes.get(byte_index_to_grapheme_index(self.src, end_start))
{
match *grapheme {
"`" => {
end_start = grapheme_index_to_byte_index(
self.src,
byte_index_to_grapheme_index(self.src, end_start) - 1,
)
}
_ => break,
}
}
Expand Down Expand Up @@ -137,4 +170,26 @@ mod tests {
}
);
}

// A codeblock whose command line contains a multi-byte emoji must yield the
// expected command and the expected start/end positions.
#[test]
fn parse_codeblock_with_unicode() {
    let source = "```present echo 🚀\n```";
    let parser = Parser::new(source);

    let parsed = parser.parse_codeblock(0..23).unwrap().unwrap();

    let expected_command =
        Command::from(vec!["present".into(), "echo".into(), "🚀".into()])
            .unwrap()
            .unwrap();
    assert_eq!(parsed.command, expected_command);

    let expected_position = Position {
        start: 0..20,
        end: 20..22,
    };
    assert_eq!(parsed.position, expected_position);
}
}
12 changes: 8 additions & 4 deletions src/rope_ext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@ pub(crate) trait RopeExt {

impl RopeExt for Rope {
fn apply(&mut self, diff: Diff) {
self.remove(diff.range.clone());
self.insert(diff.range.start, &diff.content);
let start = self.byte_to_char(diff.range.start);
let end = self.byte_to_char(diff.range.end);
self.remove(start..end);
self.insert(start, &diff.content);
}

fn simulate(&self, diff: Diff) -> Rope {
let mut clone = self.clone();
clone.remove(diff.range.clone());
clone.insert(diff.range.start, &diff.content);
let start = clone.byte_to_char(diff.range.start);
let end = clone.byte_to_char(diff.range.end);
clone.remove(start..end);
clone.insert(start, &diff.content);
clone
}
}
28 changes: 28 additions & 0 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -584,3 +584,31 @@ fn interactive_reject() -> Result {

Ok(())
}

// Integration test for non-ASCII input: the markdown fixture mixes CJK text,
// an accented letter and emoji (including a family emoji) both inside and
// around a `present echo` codeblock. The run is expected to exit with status 0
// and print the same document with the echoed text placed inside the
// codeblock, with the surrounding text unchanged.
#[test]
fn grapheme_handling() -> Result {
Test::new()?
.markdown(
r#"
Hello, 世界! 👋

```present echo "🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦"
```

Grapheme cluster: 👨‍👩‍👧‍👦
"#,
)
.expected_status(0)
.expected_stdout(
r#"
Hello, 世界! 👋

```present echo "🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦"
🚀 Grapheme test: é, 世界, 👨‍👩‍👧‍👦
```

Grapheme cluster: 👨‍👩‍👧‍👦
"#,
)
.run()
}
Loading