helix-editor · archseer · Feb 26, 2024 · Jan 25, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/helix-core/src/selection.rs b/helix-core/src/selection.rs
@@ -7,9 +7,11 @@ use crate::{
         ensure_grapheme_boundary_next, ensure_grapheme_boundary_prev, next_grapheme_boundary,
         prev_grapheme_boundary,
     },
+    line_ending::get_line_ending,
     movement::Direction,
     Assoc, ChangeSet, RopeGraphemes, RopeSlice,
 };
+use helix_stdx::rope::{self, RopeSliceExt};
 use smallvec::{smallvec, SmallVec};
 use std::borrow::Cow;
 
@@ -708,12 +710,12 @@ impl IntoIterator for Selection {
 pub fn keep_or_remove_matches(
     text: RopeSlice,
     selection: &Selection,
-    regex: &crate::regex::Regex,
+    regex: &rope::Regex,
     remove: bool,
 ) -> Option<Selection> {
     let result: SmallVec<_> = selection
         .iter()
-        .filter(|range| regex.is_match(&range.fragment(text)) ^ remove)
+        .filter(|range| regex.is_match(text.regex_input_at(range.from()..range.to())) ^ remove)
         .copied()
         .collect();
 
@@ -724,25 +726,20 @@ pub fn keep_or_remove_matches(
     None
 }
 
+// TODO: support to split on capture #N instead of whole match
 pub fn select_on_matches(
     text: RopeSlice,
     selection: &Selection,
-    regex: &crate::regex::Regex,
+    regex: &rope::Regex,
 ) -> Option<Selection> {
     let mut result = SmallVec::with_capacity(selection.len());
 
     for sel in selection {
-        // TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
-        let fragment = sel.fragment(text);
-
-        let sel_start = sel.from();
-        let start_byte = text.char_to_byte(sel_start);
-
-        for mat in regex.find_iter(&fragment) {
+        for mat in regex.find_iter(text.regex_input_at(sel.from()..sel.to())) {
             // TODO: retain range direction
 
-            let start = text.byte_to_char(start_byte + mat.start());
-            let end = text.byte_to_char(start_byte + mat.end());
+            let start = text.byte_to_char(mat.start());
+            let end = text.byte_to_char(mat.end());
 
             let range = Range::new(start, end);
             // Make sure the match is not right outside of the selection.
@@ -761,12 +758,7 @@ pub fn select_on_matches(
     None
 }
 
-// TODO: support to split on capture #N instead of whole match
-pub fn split_on_matches(
-    text: RopeSlice,
-    selection: &Selection,
-    regex: &crate::regex::Regex,
-) -> Selection {
+pub fn split_on_newline(text: RopeSlice, selection: &Selection) -> Selection {
     let mut result = SmallVec::with_capacity(selection.len());
 
     for sel in selection {
@@ -776,21 +768,47 @@ pub fn split_on_matches(
             continue;
         }
 
-        // TODO: can't avoid occasional allocations since Regex can't operate on chunks yet
-        let fragment = sel.fragment(text);
-
         let sel_start = sel.from();
         let sel_end = sel.to();
 
-        let start_byte = text.char_to_byte(sel_start);
+        let mut start = sel_start;
 
+        for mat in sel.slice(text).lines() {
+            let len = mat.len_chars();
+            let line_end_len = get_line_ending(&mat).map(|le| le.len_chars()).unwrap_or(0);
+            // TODO: retain range direction
+            result.push(Range::new(start, start + len - line_end_len));
+            start += len;
+        }
+
+        if start < sel_end {
+            result.push(Range::new(start, sel_end));
+        }
+    }
+
+    // TODO: figure out a new primary index
+    Selection::new(result, 0)
+}
+
+pub fn split_on_matches(text: RopeSlice, selection: &Selection, regex: &rope::Regex) -> Selection {
+    let mut result = SmallVec::with_capacity(selection.len());
+
+    for sel in selection {
+        // Special case: zero-width selection.
+        if sel.from() == sel.to() {
+            result.push(*sel);
+            continue;
+        }
+
+        let sel_start = sel.from();
+        let sel_end = sel.to();
         let mut start = sel_start;
 
-        for mat in regex.find_iter(&fragment) {
+        for mat in regex.find_iter(text.regex_input_at(sel_start..sel_end)) {
             // TODO: retain range direction
-            let end = text.byte_to_char(start_byte + mat.start());
+            let end = text.byte_to_char(mat.start());
             result.push(Range::new(start, end));
-            start = text.byte_to_char(start_byte + mat.end());
+            start = text.byte_to_char(mat.end());
         }
 
         if start < sel_end {
@@ -1021,14 +1039,12 @@ mod test {
 
     #[test]
     fn test_select_on_matches() {
-        use crate::regex::{Regex, RegexBuilder};
-
         let r = Rope::from_str("Nobody expects the Spanish inquisition");
         let s = r.slice(..);
 
         let selection = Selection::single(0, r.len_chars());
         assert_eq!(
-            select_on_matches(s, &selection, &Regex::new(r"[A-Z][a-z]*").unwrap()),
+            select_on_matches(s, &selection, &rope::Regex::new(r"[A-Z][a-z]*").unwrap()),
             Some(Selection::new(
                 smallvec![Range::new(0, 6), Range::new(19, 26)],
                 0
@@ -1038,8 +1054,14 @@ mod test {
         let r = Rope::from_str("This\nString\n\ncontains multiple\nlines");
         let s = r.slice(..);
 
-        let start_of_line = RegexBuilder::new(r"^").multi_line(true).build().unwrap();
-        let end_of_line = RegexBuilder::new(r"$").multi_line(true).build().unwrap();
+        let start_of_line = rope::RegexBuilder::new()
+            .syntax(rope::Config::new().multi_line(true))
+            .build(r"^")
+            .unwrap();
+        let end_of_line = rope::RegexBuilder::new()
+            .syntax(rope::Config::new().multi_line(true))
+            .build(r"$")
+            .unwrap();
 
         // line without ending
         assert_eq!(
@@ -1077,9 +1099,9 @@ mod test {
             select_on_matches(
                 s,
                 &Selection::single(0, s.len_chars()),
-                &RegexBuilder::new(r"^[a-z ]*$")
-                    .multi_line(true)
-                    .build()
+                &rope::RegexBuilder::new()
+                    .syntax(rope::Config::new().multi_line(true))
+                    .build(r"^[a-z ]*$")
                     .unwrap()
             ),
             Some(Selection::new(
@@ -1171,13 +1193,15 @@ mod test {
 
     #[test]
     fn test_split_on_matches() {
-        use crate::regex::Regex;
-
         let text = Rope::from(" abcd efg wrs   xyz 123 456");
 
         let selection = Selection::new(smallvec![Range::new(0, 9), Range::new(11, 20),], 0);
 
-        let result = split_on_matches(text.slice(..), &selection, &Regex::new(r"\s+").unwrap());
+        let result = split_on_matches(
+            text.slice(..),
+            &selection,
+            &rope::Regex::new(r"\s+").unwrap(),
+        );
 
         assert_eq!(
             result.ranges(),

diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs
@@ -11,6 +11,7 @@ use ahash::RandomState;
 use arc_swap::{ArcSwap, Guard};
 use bitflags::bitflags;
 use hashbrown::raw::RawTable;
+use helix_stdx::rope::{self, RopeSliceExt};
 use slotmap::{DefaultKey as LayerId, HopSlotMap};
 
 use std::{
@@ -1889,11 +1890,16 @@ impl HighlightConfiguration {
                     node_slice
                 };
 
-                static SHEBANG_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(SHEBANG).unwrap());
+                static SHEBANG_REGEX: Lazy<rope::Regex> =
+                    Lazy::new(|| rope::Regex::new(SHEBANG).unwrap());
 
                 injection_capture = SHEBANG_REGEX
-                    .captures(&Cow::from(lines))
-                    .map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned()))
+                    .captures_iter(lines.regex_input())
+                    .map(|cap| {
+                        let cap = lines.byte_slice(cap.get_group(1).unwrap().range());
+                        InjectionLanguageMarker::Shebang(cap.into())
+                    })
+                    .next()
             } else if index == self.injection_content_capture_index {
                 content_node = Some(capture.node);
             }

diff --git a/helix-stdx/Cargo.toml b/helix-stdx/Cargo.toml
@@ -16,6 +16,7 @@ dunce = "1.0"
 etcetera = "0.8"
 ropey = { version = "1.6.1", default-features = false }
 which = "6.0"
+regex-cursor = "0.1.3"
 
 [dev-dependencies]
 tempfile = "3.9"
diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs
@@ -1,11 +1,22 @@
+use std::ops::{Bound, RangeBounds};
+
+pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
+pub use regex_cursor::regex_automata::util::syntax::Config;
+use regex_cursor::{Input as RegexInput, RopeyCursor};
 use ropey::RopeSlice;
 
-pub trait RopeSliceExt: Sized {
+/// Extension methods for `RopeSlice`; the lifetime `'a` is the one carried by
+/// the `RopeyCursor` inside the regex inputs returned below.
+pub trait RopeSliceExt<'a>: Sized {
     fn ends_with(self, text: &str) -> bool;
     fn starts_with(self, text: &str) -> bool;
+    /// Returns a regex input over the slice.
+    fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
+    /// Returns a regex input for `byte_range`, a range of byte offsets.
+    fn regex_input_at_bytes<R: RangeBounds<usize>>(
+        self,
+        byte_range: R,
+    ) -> RegexInput<RopeyCursor<'a>>;
+    /// Returns a regex input for `char_range`, a range of char indices.
+    fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
 }
 
-impl RopeSliceExt for RopeSlice<'_> {
+impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
     fn ends_with(self, text: &str) -> bool {
         let len = self.len_bytes();
         if len < text.len() {
@@ -23,4 +34,34 @@ impl RopeSliceExt for RopeSlice<'_> {
         self.get_byte_slice(..len - text.len())
             .map_or(false, |start| start == text)
     }
+
+    /// Wraps the slice in a regex input via `RegexInput::new(self)`; no
+    /// sub-range is applied here.
+    fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
+        RegexInput::new(self)
+    }
+
+    /// Builds a regex input restricted to `char_range`, given in char indices.
+    ///
+    /// Each bound is converted to a byte offset with `char_to_byte` and the
+    /// result is handed to `regex_input_at_bytes`. Out-of-range bounds are not
+    /// checked here; they are passed to `char_to_byte` as-is.
+    ///
+    /// An inclusive end bound (and an exclusive start bound) names a whole
+    /// char, so it is rewritten as the equivalent half-open bound at the byte
+    /// offset of the *next* char. Translating the index alone would end (or
+    /// start) one byte into a multi-byte char.
+    fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
+        let start_bound = match char_range.start_bound() {
+            Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
+            // Starting after char `val` is the same as starting at char `val + 1`.
+            Bound::Excluded(&val) => Bound::Included(self.char_to_byte(val + 1)),
+            Bound::Unbounded => Bound::Unbounded,
+        };
+        let end_bound = match char_range.end_bound() {
+            // Ending on char `val` inclusively is the same as ending before char `val + 1`.
+            Bound::Included(&val) => Bound::Excluded(self.char_to_byte(val + 1)),
+            Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
+            Bound::Unbounded => Bound::Unbounded,
+        };
+        self.regex_input_at_bytes((start_bound, end_bound))
+    }
+    /// Builds a regex input restricted to `byte_range`, given in byte offsets.
+    ///
+    /// With a bounded start the cursor is created at that offset through
+    /// `RopeyCursor::at`; with an unbounded start the input is built from the
+    /// slice directly. Either way the range is applied afterwards with `range`.
+    fn regex_input_at_bytes<R: RangeBounds<usize>>(
+        self,
+        byte_range: R,
+    ) -> RegexInput<RopeyCursor<'a>> {
+        let unrestricted = if let Bound::Included(&offset) | Bound::Excluded(&offset) =
+            byte_range.start_bound()
+        {
+            RegexInput::new(RopeyCursor::at(self, offset))
+        } else {
+            RegexInput::new(self)
+        };
+        unrestricted.range(byte_range)
+    }
 }