From a2fc908e06f821b501ccaa6ce545592f57330347 Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Fri, 10 Feb 2023 12:17:18 +0000
Subject: [PATCH] perf(rust): update string replacement codepaths; latest release build shows regex engine is now faster in all regimes

---
 .../src/chunked_array/strings/namespace.rs | 23 ++++---------------
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/polars/polars-ops/src/chunked_array/strings/namespace.rs b/polars/polars-ops/src/chunked_array/strings/namespace.rs
index 1643540af7e5..635586088990 100644
--- a/polars/polars-ops/src/chunked_array/strings/namespace.rs
+++ b/polars/polars-ops/src/chunked_array/strings/namespace.rs
@@ -204,33 +204,20 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
         out
     }
 
-    /// Replace the leftmost regex-matched (sub)string with another string; take
-    /// fast-path for small (<= 32 chars) strings (otherwise regex faster).
+    /// Replace the leftmost regex-matched (sub)string with another string
     fn replace<'a>(&'a self, pat: &str, val: &str) -> PolarsResult {
-        let lit = !(pat.chars().any(|c| c.is_ascii_punctuation())
-            | val.chars().any(|c| c.is_ascii_punctuation()));
         let reg = Regex::new(pat)?;
-        let f = |s: &'a str| {
-            if lit && (s.len() <= 32) {
-                Cow::Owned(s.replacen(pat, val, 1))
-            } else {
-                reg.replace(s, val)
-            }
-        };
+        let f = |s: &'a str| reg.replace(s, val);
         let ca = self.as_utf8();
         Ok(ca.apply(f))
     }
 
     /// Replace the leftmost literal (sub)string with another string
     fn replace_literal<'a>(&'a self, pat: &str, val: &str) -> PolarsResult {
+        // note: benchmarking shows that using the regex engine for literal
+        // replacement is faster than str::replacen in almost all cases
         let reg = Regex::new(escape(pat).as_str())?;
-        let f = |s: &'a str| {
-            if s.len() <= 32 {
-                Cow::Owned(s.replacen(pat, val, 1))
-            } else {
-                reg.replace(s, NoExpand(val))
-            }
-        };
+        let f = |s: &'a str| reg.replace(s, NoExpand(val));
         let ca = self.as_utf8();
         Ok(ca.apply(f))
     }