Rework `TextEdit` arrow navigation to handle Unicode graphemes (#5812)
* [x] I have followed the instructions in the PR template. Previously, navigating text in `TextEdit` with Ctrl + left/right arrow would jump inside words that contained combining characters (e.g. diacritics). This PR introduces a new dependency on `unicode-segmentation` to handle grapheme clusters. The new implementation ignores whitespace and other separators such as `-` (dash) between words, but respects `_` (underscore). --------- Co-authored-by: lucasmerlin <hi@lucasmerlin.me>
This commit is contained in:
parent
a0f072ab1e
commit
69b9f0eede
|
|
@ -1285,6 +1285,7 @@ dependencies = [
|
||||||
"profiling",
|
"profiling",
|
||||||
"ron",
|
"ron",
|
||||||
"serde",
|
"serde",
|
||||||
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
|
|
@ -99,6 +99,7 @@ serde = { version = "1", features = ["derive"] }
|
||||||
similar-asserts = "1.4.2"
|
similar-asserts = "1.4.2"
|
||||||
thiserror = "1.0.37"
|
thiserror = "1.0.37"
|
||||||
type-map = "0.5.0"
|
type-map = "0.5.0"
|
||||||
|
unicode-segmentation = "1.12.0"
|
||||||
wasm-bindgen = "0.2"
|
wasm-bindgen = "0.2"
|
||||||
wasm-bindgen-futures = "0.4"
|
wasm-bindgen-futures = "0.4"
|
||||||
web-sys = "0.3.73"
|
web-sys = "0.3.73"
|
||||||
|
|
|
||||||
|
|
@ -87,6 +87,7 @@ ahash.workspace = true
|
||||||
bitflags.workspace = true
|
bitflags.workspace = true
|
||||||
nohash-hasher.workspace = true
|
nohash-hasher.workspace = true
|
||||||
profiling.workspace = true
|
profiling.workspace = true
|
||||||
|
unicode-segmentation.workspace = true
|
||||||
|
|
||||||
#! ### Optional dependencies
|
#! ### Optional dependencies
|
||||||
accesskit = { workspace = true, optional = true }
|
accesskit = { workspace = true, optional = true }
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
//! Text cursor changes/interaction, without modifying the text.
|
//! Text cursor changes/interaction, without modifying the text.
|
||||||
|
|
||||||
use epaint::text::{cursor::CCursor, Galley};
|
use epaint::text::{cursor::CCursor, Galley};
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
use crate::{epaint, NumExt, Rect, Response, Ui};
|
use crate::{epaint, NumExt, Rect, Response, Ui};
|
||||||
|
|
||||||
|
|
@ -166,7 +167,7 @@ fn select_line_at(text: &str, ccursor: CCursor) -> CCursorRange {
|
||||||
|
|
||||||
pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
|
pub fn ccursor_next_word(text: &str, ccursor: CCursor) -> CCursor {
|
||||||
CCursor {
|
CCursor {
|
||||||
index: next_word_boundary_char_index(text.chars(), ccursor.index),
|
index: next_word_boundary_char_index(text, ccursor.index),
|
||||||
prefer_next_row: false,
|
prefer_next_row: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -180,9 +181,10 @@ fn ccursor_next_line(text: &str, ccursor: CCursor) -> CCursor {
|
||||||
|
|
||||||
pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
|
pub fn ccursor_previous_word(text: &str, ccursor: CCursor) -> CCursor {
|
||||||
let num_chars = text.chars().count();
|
let num_chars = text.chars().count();
|
||||||
|
let reversed: String = text.graphemes(true).rev().collect();
|
||||||
CCursor {
|
CCursor {
|
||||||
index: num_chars
|
index: num_chars
|
||||||
- next_word_boundary_char_index(text.chars().rev(), num_chars - ccursor.index),
|
- next_word_boundary_char_index(&reversed, num_chars - ccursor.index).min(num_chars),
|
||||||
prefer_next_row: true,
|
prefer_next_row: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -196,22 +198,25 @@ fn ccursor_previous_line(text: &str, ccursor: CCursor) -> CCursor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_word_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
|
fn next_word_boundary_char_index(text: &str, index: usize) -> usize {
|
||||||
let mut it = it.skip(index);
|
for word in text.split_word_bound_indices() {
|
||||||
if let Some(_first) = it.next() {
|
// Splitting considers contiguous whitespace as one word, such words must be skipped,
|
||||||
index += 1;
|
// this handles cases for example ' abc' (a space and a word), the cursor is at the beginning
|
||||||
|
// (before space) - this jumps at the end of 'abc' (this is consistent with text editors
|
||||||
|
// or browsers)
|
||||||
|
let ci = char_index_from_byte_index(text, word.0);
|
||||||
|
if ci > index && !skip_word(word.1) {
|
||||||
|
return ci;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(second) = it.next() {
|
char_index_from_byte_index(text, text.len())
|
||||||
index += 1;
|
}
|
||||||
for next in it {
|
|
||||||
if is_word_char(next) != is_word_char(second) {
|
fn skip_word(text: &str) -> bool {
|
||||||
break;
|
// skip segments made up solely of word characters (alphanumerics and underscore), so that
|
||||||
}
|
// only the start of a segment containing anything else (whitespace, dashes, etc.) is a stop
|
||||||
index += 1;
|
!text.chars().any(|c| !is_word_char(c))
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
|
fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usize) -> usize {
|
||||||
|
|
@ -233,7 +238,7 @@ fn next_line_boundary_char_index(it: impl Iterator<Item = char>, mut index: usiz
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_word_char(c: char) -> bool {
|
pub fn is_word_char(c: char) -> bool {
|
||||||
c.is_ascii_alphanumeric() || c == '_'
|
c.is_alphanumeric() || c == '_'
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_linebreak(c: char) -> bool {
|
fn is_linebreak(c: char) -> bool {
|
||||||
|
|
@ -270,6 +275,16 @@ pub fn byte_index_from_char_index(s: &str, char_index: usize) -> usize {
|
||||||
s.len()
|
s.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts a byte offset within `input` into the corresponding character index.
///
/// Returns the index of the character that starts at `byte_index`. If no character starts
/// there (the offset is at or past the end of `input`, or falls inside a multi-byte
/// character), the number of characters in `input` is returned, i.e. the index one past
/// the last character.
pub fn char_index_from_byte_index(input: &str, byte_index: usize) -> usize {
    input
        .char_indices()
        .position(|(bi, _)| bi == byte_index)
        // No character starts at `byte_index`: fall back to the end of the text, counted in
        // characters (not bytes) so the result is always a valid character index.
        .unwrap_or_else(|| input.chars().count())
}
|
||||||
|
|
||||||
pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
|
pub fn slice_char_range(s: &str, char_range: std::ops::Range<usize>) -> &str {
|
||||||
assert!(
|
assert!(
|
||||||
char_range.start <= char_range.end,
|
char_range.start <= char_range.end,
|
||||||
|
|
@ -293,3 +308,38 @@ pub fn cursor_rect(galley: &Galley, cursor: &CCursor, row_height: f32) -> Rect {
|
||||||
|
|
||||||
cursor_pos
|
cursor_pos
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use crate::text_selection::text_cursor_state::next_word_boundary_char_index;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_next_word_boundary_char_index() {
|
||||||
|
// ASCII only
|
||||||
|
let text = "abc d3f g_h i-j";
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 1), 3);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 3), 7);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 9), 11);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 12), 13);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 13), 15);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 15), 15);
|
||||||
|
|
||||||
|
assert_eq!(next_word_boundary_char_index("", 0), 0);
|
||||||
|
assert_eq!(next_word_boundary_char_index("", 1), 0);
|
||||||
|
|
||||||
|
// Unicode graphemes, some of which consist of multiple Unicode characters,
|
||||||
|
// !!! Unicode character is not always what is traditionally considered a character,
|
||||||
|
// the values below are correct despite not seeming that way on the first look,
|
||||||
|
// handling of and around emojis is kind of weird and is not consistent across
|
||||||
|
// text editors and browsers
|
||||||
|
let text = "❤️👍 skvělá knihovna 👍❤️";
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 0), 2);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 2), 3); // this does not skip the space between thumbs-up and 'skvělá'
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 6), 10);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 9), 10);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 12), 19);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 15), 19);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 19), 20);
|
||||||
|
assert_eq!(next_word_boundary_char_index(text, 20), 21);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,8 +8,8 @@ use epaint::{
|
||||||
use crate::{
|
use crate::{
|
||||||
text::CCursorRange,
|
text::CCursorRange,
|
||||||
text_selection::text_cursor_state::{
|
text_selection::text_cursor_state::{
|
||||||
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word, find_line_start,
|
byte_index_from_char_index, ccursor_next_word, ccursor_previous_word,
|
||||||
slice_char_range,
|
char_index_from_byte_index, find_line_start, slice_char_range,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -48,6 +48,10 @@ pub trait TextBuffer {
|
||||||
byte_index_from_char_index(self.as_str(), char_index)
|
byte_index_from_char_index(self.as_str(), char_index)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn char_index_from_byte_index(&self, char_index: usize) -> usize {
|
||||||
|
char_index_from_byte_index(self.as_str(), char_index)
|
||||||
|
}
|
||||||
|
|
||||||
/// Clears all characters in this buffer
|
/// Clears all characters in this buffer
|
||||||
fn clear(&mut self) {
|
fn clear(&mut self) {
|
||||||
self.delete_char_range(0..self.as_str().len());
|
self.delete_char_range(0..self.as_str().len());
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue