Skip to content

Commit

Permalink
perf(rust): faster "contains literal" matching in small-string regime
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Feb 11, 2023
1 parent 0a4a2c1 commit 60c56a8
Showing 1 changed file with 7 additions and 11 deletions.
18 changes: 7 additions & 11 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,19 +159,11 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
ca.apply(f)
}

/// Check if strings contain a regex pattern; take literal fast-path if
/// no special chars and strlen <= 96 chars (otherwise regex faster).
/// Check if strings contain a regex pattern.
fn contains(&self, pat: &str) -> PolarsResult<BooleanChunked> {
let lit = pat.chars().all(|c| !c.is_ascii_punctuation());
let ca = self.as_utf8();
let reg = Regex::new(pat)?;
let f = |s: &str| {
if lit && (s.len() <= 96) {
s.contains(pat)
} else {
reg.is_match(s)
}
};
let f = |s: &str| reg.is_match(s);
let mut out: BooleanChunked = if !ca.has_validity() {
ca.into_no_null_iter().map(f).collect()
} else {
Expand All @@ -183,6 +175,9 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {

/// Check whether the strings contain the given literal (non-regex) substring.
fn contains_literal(&self, lit: &str) -> PolarsResult<BooleanChunked> {
    // The literal is escaped and handed to the regex-based `contains`:
    // benchmarking shows the regex engine finds literal matches faster
    // than `str::contains` does.
    // ref: https://github.com/pola-rs/polars/pull/6811
    let escaped_pattern = escape(lit);
    self.contains(escaped_pattern.as_str())
}


Expand Down Expand Up @@ -215,7 +210,8 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
/// Replace the leftmost literal (sub)string with another string
fn replace_literal<'a>(&'a self, pat: &str, val: &str) -> PolarsResult<Utf8Chunked> {
// note: benchmarking shows that using the regex engine for literal
// replacement is faster than str::replacen in almost all cases
// replacement is faster than str::replacen in almost all cases.
// ref: https://github.com/pola-rs/polars/pull/6777
let reg = Regex::new(escape(pat).as_str())?;
let f = |s: &'a str| reg.replace(s, NoExpand(val));
let ca = self.as_utf8();
Expand Down

0 comments on commit 60c56a8

Please sign in to comment.