Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to regex-cursor #9422

Merged
merged 1 commit into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 16 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

96 changes: 60 additions & 36 deletions helix-core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ use crate::{
ensure_grapheme_boundary_next, ensure_grapheme_boundary_prev, next_grapheme_boundary,
prev_grapheme_boundary,
},
line_ending::get_line_ending,
movement::Direction,
Assoc, ChangeSet, RopeGraphemes, RopeSlice,
};
use helix_stdx::rope::{self, RopeSliceExt};
use smallvec::{smallvec, SmallVec};
use std::borrow::Cow;

Expand Down Expand Up @@ -708,12 +710,12 @@ impl IntoIterator for Selection {
pub fn keep_or_remove_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
remove: bool,
) -> Option<Selection> {
let result: SmallVec<_> = selection
.iter()
.filter(|range| regex.is_match(&range.fragment(text)) ^ remove)
.filter(|range| regex.is_match(text.regex_input_at(range.from()..range.to())) ^ remove)
.copied()
.collect();

Expand All @@ -724,25 +726,20 @@ pub fn keep_or_remove_matches(
None
}

// TODO: support to split on capture #N instead of whole match
pub fn select_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
regex: &rope::Regex,
) -> Option<Selection> {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);

let sel_start = sel.from();
let start_byte = text.char_to_byte(sel_start);

for mat in regex.find_iter(&fragment) {
for mat in regex.find_iter(text.regex_input_at(sel.from()..sel.to())) {
pascalkuthe marked this conversation as resolved.
Show resolved Hide resolved
// TODO: retain range direction

let start = text.byte_to_char(start_byte + mat.start());
let end = text.byte_to_char(start_byte + mat.end());
let start = text.byte_to_char(mat.start());
let end = text.byte_to_char(mat.end());

let range = Range::new(start, end);
// Make sure the match is not right outside of the selection.
Expand All @@ -761,12 +758,7 @@ pub fn select_on_matches(
None
}

// TODO: support to split on capture #N instead of whole match
pascalkuthe marked this conversation as resolved.
Show resolved Hide resolved
pub fn split_on_matches(
text: RopeSlice,
selection: &Selection,
regex: &crate::regex::Regex,
) -> Selection {
pub fn split_on_newline(text: RopeSlice, selection: &Selection) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
Expand All @@ -776,21 +768,47 @@ pub fn split_on_matches(
continue;
}

// TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
let fragment = sel.fragment(text);

let sel_start = sel.from();
let sel_end = sel.to();

let start_byte = text.char_to_byte(sel_start);
let mut start = sel_start;

for mat in sel.slice(text).lines() {
let len = mat.len_chars();
let line_end_len = get_line_ending(&mat).map(|le| le.len_chars()).unwrap_or(0);
// TODO: retain range direction
result.push(Range::new(start, start + len - line_end_len));
start += len;
}

if start < sel_end {
result.push(Range::new(start, sel_end));
}
}

// TODO: figure out a new primary index
Selection::new(result, 0)
}

pub fn split_on_matches(text: RopeSlice, selection: &Selection, regex: &rope::Regex) -> Selection {
let mut result = SmallVec::with_capacity(selection.len());

for sel in selection {
// Special case: zero-width selection.
if sel.from() == sel.to() {
result.push(*sel);
continue;
}

let sel_start = sel.from();
let sel_end = sel.to();
let mut start = sel_start;

for mat in regex.find_iter(&fragment) {
for mat in regex.find_iter(text.regex_input_at(sel_start..sel_end)) {
// TODO: retain range direction
let end = text.byte_to_char(start_byte + mat.start());
let end = text.byte_to_char(mat.start());
result.push(Range::new(start, end));
start = text.byte_to_char(start_byte + mat.end());
start = text.byte_to_char(mat.end());
}

if start < sel_end {
Expand Down Expand Up @@ -1021,14 +1039,12 @@ mod test {

#[test]
fn test_select_on_matches() {
use crate::regex::{Regex, RegexBuilder};

let r = Rope::from_str("Nobody expects the Spanish inquisition");
let s = r.slice(..);

let selection = Selection::single(0, r.len_chars());
assert_eq!(
select_on_matches(s, &selection, &Regex::new(r"[A-Z][a-z]*").unwrap()),
select_on_matches(s, &selection, &rope::Regex::new(r"[A-Z][a-z]*").unwrap()),
Some(Selection::new(
smallvec![Range::new(0, 6), Range::new(19, 26)],
0
Expand All @@ -1038,8 +1054,14 @@ mod test {
let r = Rope::from_str("This\nString\n\ncontains multiple\nlines");
let s = r.slice(..);

let start_of_line = RegexBuilder::new(r"^").multi_line(true).build().unwrap();
let end_of_line = RegexBuilder::new(r"$").multi_line(true).build().unwrap();
let start_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^")
.unwrap();
let end_of_line = rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"$")
.unwrap();

// line without ending
assert_eq!(
Expand Down Expand Up @@ -1077,9 +1099,9 @@ mod test {
select_on_matches(
s,
&Selection::single(0, s.len_chars()),
&RegexBuilder::new(r"^[a-z ]*$")
.multi_line(true)
.build()
&rope::RegexBuilder::new()
.syntax(rope::Config::new().multi_line(true))
.build(r"^[a-z ]*$")
.unwrap()
),
Some(Selection::new(
Expand Down Expand Up @@ -1171,13 +1193,15 @@ mod test {

#[test]
fn test_split_on_matches() {
use crate::regex::Regex;

let text = Rope::from(" abcd efg wrs xyz 123 456");

let selection = Selection::new(smallvec![Range::new(0, 9), Range::new(11, 20),], 0);

let result = split_on_matches(text.slice(..), &selection, &Regex::new(r"\s+").unwrap());
let result = split_on_matches(
text.slice(..),
&selection,
&rope::Regex::new(r"\s+").unwrap(),
);

assert_eq!(
result.ranges(),
Expand Down
12 changes: 9 additions & 3 deletions helix-core/src/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use ahash::RandomState;
use arc_swap::{ArcSwap, Guard};
use bitflags::bitflags;
use hashbrown::raw::RawTable;
use helix_stdx::rope::{self, RopeSliceExt};
use slotmap::{DefaultKey as LayerId, HopSlotMap};

use std::{
Expand Down Expand Up @@ -1889,11 +1890,16 @@ impl HighlightConfiguration {
node_slice
};

static SHEBANG_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(SHEBANG).unwrap());
static SHEBANG_REGEX: Lazy<rope::Regex> =
Lazy::new(|| rope::Regex::new(SHEBANG).unwrap());

injection_capture = SHEBANG_REGEX
.captures(&Cow::from(lines))
.map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned()))
.captures_iter(lines.regex_input())
.map(|cap| {
let cap = lines.byte_slice(cap.get_group(1).unwrap().range());
InjectionLanguageMarker::Shebang(cap.into())
})
.next()
} else if index == self.injection_content_capture_index {
content_node = Some(capture.node);
}
Expand Down
1 change: 1 addition & 0 deletions helix-stdx/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dunce = "1.0"
etcetera = "0.8"
ropey = { version = "1.6.1", default-features = false }
which = "6.0"
regex-cursor = "0.1.3"

[dev-dependencies]
tempfile = "3.9"
45 changes: 43 additions & 2 deletions helix-stdx/src/rope.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
use std::ops::{Bound, RangeBounds};

pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
pub use regex_cursor::regex_automata::util::syntax::Config;
use regex_cursor::{Input as RegexInput, RopeyCursor};
use ropey::RopeSlice;

pub trait RopeSliceExt: Sized {
pub trait RopeSliceExt<'a>: Sized {
fn ends_with(self, text: &str) -> bool;
fn starts_with(self, text: &str) -> bool;
fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at_bytes<R: RangeBounds<usize>>(
self,
byte_range: R,
) -> RegexInput<RopeyCursor<'a>>;
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
}

impl RopeSliceExt for RopeSlice<'_> {
impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
fn ends_with(self, text: &str) -> bool {
let len = self.len_bytes();
if len < text.len() {
Expand All @@ -23,4 +34,34 @@ impl RopeSliceExt for RopeSlice<'_> {
self.get_byte_slice(..len - text.len())
.map_or(false, |start| start == text)
}

fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
RegexInput::new(self)
}

fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
let start_bound = match char_range.start_bound() {
Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
Bound::Unbounded => Bound::Unbounded,
};
let end_bound = match char_range.end_bound() {
Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
Bound::Unbounded => Bound::Unbounded,
};
self.regex_input_at_bytes((start_bound, end_bound))
}
fn regex_input_at_bytes<R: RangeBounds<usize>>(
self,
byte_range: R,
) -> RegexInput<RopeyCursor<'a>> {
let input = match byte_range.start_bound() {
Bound::Included(&pos) | Bound::Excluded(&pos) => {
RegexInput::new(RopeyCursor::at(self, pos))
}
Bound::Unbounded => RegexInput::new(self),
};
input.range(byte_range)
}
}