Skip to content

Commit

Permalink
switch to regex-cursor (helix-editor#9422)
Browse files Browse the repository at this point in the history
  • Loading branch information
pascalkuthe authored and postsolar committed Apr 4, 2024
1 parent ef809c9 commit 5a93e0d
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 86 deletions.
18 changes: 16 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

96 changes: 60 additions & 36 deletions helix-core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ use crate::{
ensure_grapheme_boundary_next, ensure_grapheme_boundary_prev, next_grapheme_boundary,
prev_grapheme_boundary,
},
line_ending::get_line_ending,
movement::Direction,
Assoc, ChangeSet, RopeGraphemes, RopeSlice,
};
use helix_stdx::rope::{self, RopeSliceExt};
use smallvec::{smallvec, SmallVec};
use std::borrow::Cow;

Expand Down Expand Up @@ -708,12 +710,12 @@ impl IntoIterator for Selection {
pub fn keep_or_remove_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
remove: bool,
) -> Option<Selection> {
let result: SmallVec<_> = selection
.iter()
.filter(|range| regex.is_match(&range.fragment(text)) ^ remove)
.filter(|range| regex.is_match(text.regex_input_at(range.from()..range.to())) ^ remove)
.copied()
.collect();

Expand All @@ -724,25 +726,20 @@ pub fn keep_or_remove_matches(
None
}

// TODO: support to split on capture #N instead of whole match
pub fn select_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
) -> Option<Selection> {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);

let sel_start = sel.from();
let start_byte = text.char_to_byte(sel_start);

for mat in regex.find_iter(&fragment) {
for mat in regex.find_iter(text.regex_input_at(sel.from()..sel.to())) {
// TODO: retain range direction

let start = text.byte_to_char(start_byte + mat.start());
let end = text.byte_to_char(start_byte + mat.end());
let start = text.byte_to_char(mat.start());
let end = text.byte_to_char(mat.end());

let range = Range::new(start, end);
// Make sure the match is not right outside of the selection.
Expand All @@ -761,12 +758,7 @@ pub fn select_on_matches(
None
}

// TODO: support to split on capture #N instead of whole match
pub fn split_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
) -> Selection {
pub fn split_on_newline(text: RopeSlice, selection: &Selection) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
Expand All @@ -776,21 +768,47 @@ pub fn split_on_matches(
continue;
}

// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);

let sel_start = sel.from();
let sel_end = sel.to();

let start_byte = text.char_to_byte(sel_start);
let mut start = sel_start;

for mat in sel.slice(text).lines() {
let len = mat.len_chars();
let line_end_len = get_line_ending(&mat).map(|le| le.len_chars()).unwrap_or(0);
// TODO: retain range direction
result.push(Range::new(start, start + len - line_end_len));
start += len;
}

if start < sel_end {
result.push(Range::new(start, sel_end));
}
}

// TODO: figure out a new primary index
Selection::new(result, 0)
}

pub fn split_on_matches(text: RopeSlice, selection: &Selection, regex: &rope::Regex) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
// Special case: zero-width selection.
if sel.from() == sel.to() {
result.push(*sel);
continue;
}

let sel_start = sel.from();
let sel_end = sel.to();
let mut start = sel_start;

for mat in regex.find_iter(&fragment) {
for mat in regex.find_iter(text.regex_input_at(sel_start..sel_end)) {
// TODO: retain range direction
let end = text.byte_to_char(start_byte + mat.start());
let end = text.byte_to_char(mat.start());
result.push(Range::new(start, end));
start = text.byte_to_char(start_byte + mat.end());
start = text.byte_to_char(mat.end());
}

if start < sel_end {
Expand Down Expand Up @@ -1021,14 +1039,12 @@ mod test {

#[test]
fn test_select_on_matches() {
use crate::regex::{Regex, RegexBuilder};

let r = Rope::from_str("Nobody expects the Spanish inquisition");
let s = r.slice(..);

let selection = Selection::single(0, r.len_chars());
assert_eq!(
select_on_matches(s, &selection, &Regex::new(r"[A-Z][a-z]*").unwrap()),
select_on_matches(s, &selection, &rope::Regex::new(r"[A-Z][a-z]*").unwrap()),
Some(Selection::new(
smallvec![Range::new(0, 6), Range::new(19, 26)],
0
Expand All @@ -1038,8 +1054,14 @@ mod test {
let r = Rope::from_str("This\nString\n\ncontains multiple\nlines");
let s = r.slice(..);

let start_of_line = RegexBuilder::new(r"^").multi_line(true).build().unwrap();
let end_of_line = RegexBuilder::new(r"$").multi_line(true).build().unwrap();
let start_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^")
.unwrap();
let end_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"$")
.unwrap();

// line without ending
assert_eq!(
Expand Down Expand Up @@ -1077,9 +1099,9 @@ mod test {
select_on_matches(
s,
&Selection::single(0, s.len_chars()),
&RegexBuilder::new(r"^[a-z ]*$")
.multi_line(true)
.build()
&rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^[a-z ]*$")
.unwrap()
),
Some(Selection::new(
Expand Down Expand Up @@ -1171,13 +1193,15 @@ mod test {

#[test]
fn test_split_on_matches() {
use crate::regex::Regex;

let text = Rope::from(" abcd efg wrs xyz 123 456");

let selection = Selection::new(smallvec![Range::new(0, 9), Range::new(11, 20),], 0);

let result = split_on_matches(text.slice(..), &selection, &Regex::new(r"\s+").unwrap());
let result = split_on_matches(
text.slice(..),
&selection,
&rope::Regex::new(r"\s+").unwrap(),
);

assert_eq!(
result.ranges(),
Expand Down
12 changes: 9 additions & 3 deletions helix-core/src/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use arc_swap::{ArcSwap, Guard};
use bitflags::bitflags;
use globset::GlobSet;
use hashbrown::raw::RawTable;
use helix_stdx::rope::{self, RopeSliceExt};
use slotmap::{DefaultKey as LayerId, HopSlotMap};

use std::{
Expand Down Expand Up @@ -1961,11 +1962,16 @@ impl HighlightConfiguration {
node_slice
};

static SHEBANG_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(SHEBANG).unwrap());
static SHEBANG_REGEX: Lazy<rope::Regex> =
Lazy::new(|| rope::Regex::new(SHEBANG).unwrap());

injection_capture = SHEBANG_REGEX
.captures(&Cow::from(lines))
.map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned()))
.captures_iter(lines.regex_input())
.map(|cap| {
let cap = lines.byte_slice(cap.get_group(1).unwrap().range());
InjectionLanguageMarker::Shebang(cap.into())
})
.next()
} else if index == self.injection_content_capture_index {
content_node = Some(capture.node);
}
Expand Down
1 change: 1 addition & 0 deletions helix-stdx/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dunce = "1.0"
etcetera = "0.8"
ropey = { version = "1.6.1", default-features = false }
which = "6.0"
regex-cursor = "0.1.3"

[dev-dependencies]
tempfile = "3.10"
45 changes: 43 additions & 2 deletions helix-stdx/src/rope.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
use std::ops::{Bound, RangeBounds};

pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
pub use regex_cursor::regex_automata::util::syntax::Config;
use regex_cursor::{Input as RegexInput, RopeyCursor};
use ropey::RopeSlice;

pub trait RopeSliceExt: Sized {
pub trait RopeSliceExt<'a>: Sized {
fn ends_with(self, text: &str) -> bool;
fn starts_with(self, text: &str) -> bool;
fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at_bytes<R: RangeBounds<usize>>(
self,
byte_range: R,
) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
}

impl RopeSliceExt for RopeSlice<'_> {
impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
fn ends_with(self, text: &str) -> bool {
let len = self.len_bytes();
if len < text.len() {
Expand All @@ -23,4 +34,34 @@ impl RopeSliceExt for RopeSlice<'_> {
self.get_byte_slice(..len - text.len())
.map_or(false, |start| start == text)
}

/// Wraps this slice in a regex-cursor input so the regex engine can search
/// the rope's contents directly.
///
/// The whole slice is passed to `RegexInput::new`; no sub-range is applied
/// here (see `regex_input_at` / `regex_input_at_bytes` for that).
fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
RegexInput::new(self)
}

/// Builds a regex input restricted to `char_range`, given in **char** indices.
///
/// The char bounds are translated to byte offsets with `char_to_byte` and
/// forwarded to `regex_input_at_bytes`.
///
/// The byte bounds are always normalised to an *included* start and an
/// *excluded* end. Keeping the original bound kind while converting the index
/// is only correct for single-byte chars: an excluded start (or included end)
/// on a multi-byte char would resolve to `byte(val) + 1`, which lands in the
/// middle of that char's encoding. Converting `val + 1` instead always yields
/// a char boundary.
///
/// # Panics
///
/// Panics if a converted char index is past the end of the slice (this is
/// `char_to_byte`'s behaviour).
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
    let start_bound = match char_range.start_bound() {
        Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
        // "After char `val`" means "starting at char `val + 1`".
        Bound::Excluded(&val) => Bound::Included(self.char_to_byte(val + 1)),
        Bound::Unbounded => Bound::Unbounded,
    };
    let end_bound = match char_range.end_bound() {
        // "Through char `val`" means "up to the first byte of char `val + 1`".
        Bound::Included(&val) => Bound::Excluded(self.char_to_byte(val + 1)),
        Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
        Bound::Unbounded => Bound::Unbounded,
    };
    self.regex_input_at_bytes((start_bound, end_bound))
}
/// Builds a regex input restricted to `byte_range`, given in **byte** offsets.
///
/// When the range has an explicit start (included or excluded), the cursor is
/// created with `RopeyCursor::at` at that offset; otherwise the input is built
/// from the slice itself. In both cases the resulting input is then limited
/// to `byte_range` via `range`.
fn regex_input_at_bytes<R: RangeBounds<usize>>(
    self,
    byte_range: R,
) -> RegexInput<RopeyCursor<'a>> {
    // Offset to hand to the cursor, if the caller gave an explicit start.
    let cursor_offset = match byte_range.start_bound() {
        Bound::Included(&offset) | Bound::Excluded(&offset) => Some(offset),
        Bound::Unbounded => None,
    };

    let unrestricted = match cursor_offset {
        Some(offset) => RegexInput::new(RopeyCursor::at(self, offset)),
        None => RegexInput::new(self),
    };

    unrestricted.range(byte_range)
}
}

0 comments on commit 5a93e0d

Please sign in to comment.