diff --git a/Cargo.lock b/Cargo.lock
index c2d24604..4411be0c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1285,6 +1285,7 @@ dependencies = [
  "profiling",
  "ron",
  "serde",
+ "unicode-segmentation",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index a0051513..d272498f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,6 +99,7 @@ serde = { version = "1", features = ["derive"] }
 similar-asserts = "1.4.2"
 thiserror = "1.0.37"
 type-map = "0.5.0"
+unicode-segmentation = "1.12.0"
 wasm-bindgen = "0.2"
 wasm-bindgen-futures = "0.4"
 web-sys = "0.3.73"
diff --git a/crates/egui/Cargo.toml b/crates/egui/Cargo.toml
index e00202d1..e7641ef3 100644
--- a/crates/egui/Cargo.toml
+++ b/crates/egui/Cargo.toml
@@ -87,6 +87,7 @@ ahash.workspace = true
 bitflags.workspace = true
 nohash-hasher.workspace = true
 profiling.workspace = true
+unicode-segmentation.workspace = true
 
 #! ### Optional dependencies
 accesskit = { workspace = true, optional = true }
diff --git a/crates/egui/src/text_selection/text_cursor_state.rs b/crates/egui/src/text_selection/text_cursor_state.rs
index baf7b046..aaa3beb9 100644
--- a/crates/egui/src/text_selection/text_cursor_state.rs
+++ b/crates/egui/src/text_selection/text_cursor_state.rs
@@ -1,6 +1,7 @@
 //! Text cursor changes/interaction, without modifying the text.
 
 use epaint::text::{cursor::CCursor, Galley};
+use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{epaint, NumExt, Rect, Response, Ui};
 
@@ -166,7 +167,7 @@ fn select_line_at(text: &str, ccursor: CCursor) -> CCursorRange {
 
 pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
     CCursor {
-        index: next_word_boundary_char_index(text.chars(), ccursor.index),
+        index: next_word_boundary_char_index(text, ccursor.index),
         prefer_next_row: false,
     }
 }
@@ -180,9 +181,10 @@ fn ccursor_next_line(text: &str, ccursor: CCursor) -> CCursor {
 
 pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
     let num_chars = text.chars().count();
+    let reversed: String = text.graphemes(true).rev().collect();
     CCursor {
         index: num_chars
-            - next_word_boundary_char_index(text.chars().rev(), num_chars - ccursor.index),
+            - next_word_boundary_char_index(&reversed, num_chars - ccursor.index).min(num_chars),
         prefer_next_row: true,
     }
 }
@@ -196,22 +198,33 @@ fn ccursor_previous_line(text: &str, ccursor: CCursor) -> CCursor {
     }
 }
 
-fn next_word_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
-    let mut it = it.skip(index);
-    if let Some(_first) = it.next() {
-        index += 1;
-
-        if let Some(second) = it.next() {
-            index += 1;
-            for next in it {
-                if is_word_char(next) != is_word_char(second) {
-                    break;
-                }
-                index += 1;
-            }
+/// Returns the char index at which the first segment that begins after `index` and is not
+/// made up solely of word characters starts, or the number of chars in `text` if there is
+/// no such segment.
+fn next_word_boundary_char_index(text: &str, index: usize) -> usize {
+    // Char index of the start of the current segment. It is accumulated while iterating so
+    // that byte offsets never have to be converted back by rescanning `text`.
+    let mut ci = 0;
+    for word in text.split_word_bounds() {
+        // Splitting considers contiguous whitespace as one word. Segments made up solely of
+        // word characters are skipped, so for e.g. ' abc' (a space and a word) with the cursor
+        // at the beginning (before the space) this jumps to the end of 'abc', which is
+        // consistent with text editors and browsers.
+        if ci > index && !skip_word(word) {
+            return ci;
         }
+        ci += word.chars().count();
     }
-    index
+
+    ci
+}
+
+/// Returns `true` if `text` is made up solely of word characters (see [`is_word_char`]).
+///
+/// Such a segment starts at the beginning of a word, which is not a position to stop at;
+/// segments containing anything else (whitespace, dashes, etc.) are not skipped.
+fn skip_word(text: &str) -> bool {
+    text.chars().all(is_word_char)
 }
 
 fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
@@ -233,7 +246,7 @@ fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usiz
 }
 
 pub fn is_word_char(c: char) -> bool {
-    c.is_ascii_alphanumeric() || c == '_'
+    c.is_alphanumeric() || c == '_'
 }
 
 fn is_linebreak(c: char) -> bool {
@@ -270,6 +283,17 @@ pub fn byte_index_from_char_index(s: &str, char_index: usize) -> usize {
     s.len()
 }
 
+/// Converts a byte index in `input` into a char index.
+///
+/// A byte index inside a multi-byte char maps to the index of the following char, and a
+/// byte index at or past the end of `input` maps to the number of chars in `input`.
+pub fn char_index_from_byte_index(input: &str, byte_index: usize) -> usize {
+    input
+        .char_indices()
+        .take_while(|&(bi, _)| bi < byte_index)
+        .count()
+}
+
 pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
     assert!(
         char_range.start <= char_range.end,
@@ -293,3 +317,52 @@ pub fn cursor_rect(galley: &Galley, cursor: &CCursor, row_height: f32) -> Rect {
 
     cursor_pos
 }
+
+#[cfg(test)]
+mod test {
+    use super::{char_index_from_byte_index, next_word_boundary_char_index};
+
+    #[test]
+    fn test_next_word_boundary_char_index() {
+        // ASCII only
+        let text = "abc d3f g_h i-j";
+        assert_eq!(next_word_boundary_char_index(text, 1), 3);
+        assert_eq!(next_word_boundary_char_index(text, 3), 7);
+        assert_eq!(next_word_boundary_char_index(text, 9), 11);
+        assert_eq!(next_word_boundary_char_index(text, 12), 13);
+        assert_eq!(next_word_boundary_char_index(text, 13), 15);
+        assert_eq!(next_word_boundary_char_index(text, 15), 15);
+
+        assert_eq!(next_word_boundary_char_index("", 0), 0);
+        assert_eq!(next_word_boundary_char_index("", 1), 0);
+
+        // Unicode graphemes, some of which consist of multiple Unicode characters.
+        // !!! A Unicode character is not always what is traditionally considered a character;
+        // the values below are correct despite not seeming that way at first glance.
+        // Handling of and around emojis is kind of weird and is not consistent across
+        // text editors and browsers.
+        let text = "❤️👍 skvělá knihovna 👍❤️";
+        assert_eq!(next_word_boundary_char_index(text, 0), 2);
+        assert_eq!(next_word_boundary_char_index(text, 2), 3); // this does not skip the space between thumbs-up and 'skvělá'
+        assert_eq!(next_word_boundary_char_index(text, 6), 10);
+        assert_eq!(next_word_boundary_char_index(text, 9), 10);
+        assert_eq!(next_word_boundary_char_index(text, 12), 19);
+        assert_eq!(next_word_boundary_char_index(text, 15), 19);
+        assert_eq!(next_word_boundary_char_index(text, 19), 20);
+        assert_eq!(next_word_boundary_char_index(text, 20), 21);
+        // Past the last boundary the result is the number of chars (23), not of bytes
+        assert_eq!(next_word_boundary_char_index(text, 21), 23);
+    }
+
+    #[test]
+    fn test_char_index_from_byte_index() {
+        assert_eq!(char_index_from_byte_index("", 0), 0);
+        assert_eq!(char_index_from_byte_index("abc", 1), 1);
+        assert_eq!(char_index_from_byte_index("abc", 3), 3);
+
+        // U+00E9 takes two bytes, so the text below is three bytes but only two chars
+        assert_eq!(char_index_from_byte_index("\u{e9}a", 0), 0);
+        assert_eq!(char_index_from_byte_index("\u{e9}a", 2), 1);
+        assert_eq!(char_index_from_byte_index("\u{e9}a", 3), 2);
+    }
+}
diff --git a/crates/egui/src/widgets/text_edit/text_buffer.rs b/crates/egui/src/widgets/text_edit/text_buffer.rs
index 6cf7da15..ebf33b09 100644
--- a/crates/egui/src/widgets/text_edit/text_buffer.rs
+++ b/crates/egui/src/widgets/text_edit/text_buffer.rs
@@ -8,8 +8,8 @@ use epaint::{
 use crate::{
     text::CCursorRange,
     text_selection::text_cursor_state::{
-        byte_index_from_char_index, ccursor_next_word, ccursor_previous_word, find_line_start,
-        slice_char_range,
+        byte_index_from_char_index, ccursor_next_word, ccursor_previous_word,
+        char_index_from_byte_index, find_line_start, slice_char_range,
     },
 };
 
@@ -48,6 +48,11 @@ pub trait TextBuffer {
         byte_index_from_char_index(self.as_str(), char_index)
     }
 
+    /// Converts a byte index in this buffer's text into a character index
+    fn char_index_from_byte_index(&self, byte_index: usize) -> usize {
+        char_index_from_byte_index(self.as_str(), byte_index)
+    }
+
     /// Clears all characters in this buffer
     fn clear(&mut self) {
         self.delete_char_range(0..self.as_str().len());