Skip to content

Commit

Permalink
apacheGH-33723: [C++] re2::RE2::RE2() result must be checked (apache#…
Browse files Browse the repository at this point in the history
…33806)

### Rationale for this change

RE2 construction needs to be checked.

### What changes are included in this PR?

Check all RE2 object status.

### Are these changes tested?

Tested with existing tests.

### Are there any user-facing changes?

No.
* Closes: apache#33723

Authored-by: Jin Shang <shangjin1997@gmail.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
js8544 authored and sjperkins committed Feb 10, 2023
1 parent bf8be3c commit 0be4763
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 26 deletions.
29 changes: 24 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <algorithm>
#include <cctype>
#include <iterator>
#include <memory>
#include <string>

#ifdef ARROW_WITH_RE2
Expand All @@ -26,6 +27,8 @@

#include "arrow/array/builder_nested.h"
#include "arrow/compute/kernels/scalar_string_internal.h"
#include "arrow/result.h"
#include "arrow/util/macros.h"
#include "arrow/util/string.h"
#include "arrow/util/value_parsing.h"

Expand Down Expand Up @@ -1505,6 +1508,13 @@ struct MatchLike {
static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)", kRE2Options);
// A LIKE pattern matching this regex can be translated into a suffix search.
static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))", kRE2Options);
static bool global_checked = false;
if (ARROW_PREDICT_FALSE(!global_checked)) {
RETURN_NOT_OK(RegexStatus(kLikePatternIsSubstringMatch));
RETURN_NOT_OK(RegexStatus(kLikePatternIsStartsWith));
RETURN_NOT_OK(RegexStatus(kLikePatternIsEndsWith));
global_checked = true;
}

auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();
Expand Down Expand Up @@ -1669,14 +1679,21 @@ struct FindSubstring {
struct FindSubstringRegex {
std::unique_ptr<RE2> regex_match_;

static Result<FindSubstringRegex> Make(const MatchSubstringOptions& options,
bool is_utf8 = true, bool literal = false) {
auto matcher = FindSubstringRegex(options, is_utf8, literal);
RETURN_NOT_OK(RegexStatus(*matcher.regex_match_));
return std::move(matcher);
}

explicit FindSubstringRegex(const MatchSubstringOptions& options, bool is_utf8 = true,
bool literal = false) {
std::string regex = "(";
regex.reserve(options.pattern.length() + 2);
regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
regex += ")";
regex_match_.reset(
new RE2(regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false)));
regex_match_ = std::make_unique<RE2>(
regex, MakeRE2Options(is_utf8, options.ignore_case, /*literal=*/false));
}

template <typename OutValue, typename... Ignored>
Expand All @@ -1698,9 +1715,10 @@ struct FindSubstringExec {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
if (options.ignore_case) {
#ifdef ARROW_WITH_RE2
ARROW_ASSIGN_OR_RAISE(auto matcher,
FindSubstringRegex::Make(options, InputType::is_utf8, true));
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
kernel{FindSubstringRegex(options, /*is_utf8=*/InputType::is_utf8,
/*literal=*/true)};
kernel{std::move(matcher)};
return kernel.Exec(ctx, batch, out);
#else
return Status::NotImplemented("ignore_case requires RE2");
Expand All @@ -1725,8 +1743,9 @@ struct FindSubstringRegexExec {
using OffsetType = typename TypeTraits<InputType>::OffsetType;
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
ARROW_ASSIGN_OR_RAISE(auto matcher, FindSubstringRegex::Make(options, false));
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
kernel{FindSubstringRegex(options, /*literal=*/false)};
kernel{std::move(matcher)};
return kernel.Exec(ctx, batch, out);
}
};
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#endif

#include "arrow/compute/api_scalar.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/test_util.h"
#include "arrow/testing/gtest_util.h"
Expand Down Expand Up @@ -494,6 +495,16 @@ TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegex) {
this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])",
this->offset_type(), "[0, 0, 1, null, -1, 0]", &options);
}

TYPED_TEST(TestBaseBinaryKernels, FindSubstringRegexWrongPattern) {
MatchSubstringOptions options{"(a", /*ignore_case=*/false};

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, ::testing::HasSubstr("Invalid regular expression"),
CallFunction("find_substring_regex",
{Datum(R"(["a", "A", "baaa", null, "", "AaaA"])")}, &options));
}

#else
TYPED_TEST(TestBaseBinaryKernels, FindSubstringIgnoreCase) {
MatchSubstringOptions options{"a+", /*ignore_case=*/true};
Expand Down
53 changes: 36 additions & 17 deletions cpp/src/gandiva/regex_functions_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,12 @@

#include "gandiva/regex_functions_holder.h"
#include <regex>
#include "arrow/util/macros.h"
#include "gandiva/node.h"
#include "gandiva/regex_util.h"

namespace gandiva {

RE2 LikeHolder::starts_with_regex_(R"(([^\.\*])*\.\*)");
RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");

std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pattern) {
pattern.erase(std::remove(pattern.begin(), pattern.end(), '\\'), pattern.end());
return pattern;
Expand All @@ -34,27 +31,44 @@ std::string& RemovePatternEscapeChars(const FunctionNode& node, std::string& pat
// Short-circuit pattern matches for the following common sub cases :
// - starts_with, ends_with and is_substr
const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) {
// NOTE: avoid making those constants global to avoid compiling regexes at startup
// pre-compiled pattern for matching starts_with
static const RE2 starts_with_regex(R"(([^\.\*])*\.\*)");
// pre-compiled pattern for matching ends_with
static const RE2 ends_with_regex(R"(\.\*([^\.\*])*)");
// pre-compiled pattern for matching is_substr
static const RE2 is_substr_regex(R"(\.\*([^\.\*])*\.\*)");

static bool global_checked = false;
if (ARROW_PREDICT_FALSE(!global_checked)) {
if (ARROW_PREDICT_FALSE(
!(starts_with_regex.ok() && ends_with_regex.ok() && is_substr_regex.ok()))) {
return node;
}
global_checked = true;
}

std::shared_ptr<LikeHolder> holder;
auto status = Make(node, &holder);
if (status.ok()) {
std::string& pattern = holder->pattern_;
auto literal_type = node.children().at(1)->return_type();

if (RE2::FullMatch(pattern, starts_with_regex_)) {
if (RE2::FullMatch(pattern, starts_with_regex)) {
auto prefix = pattern.substr(0, pattern.length() - 2); // trim .*
auto parsed_prefix = RemovePatternEscapeChars(node, prefix);
auto prefix_node = std::make_shared<LiteralNode>(
literal_type, LiteralHolder(parsed_prefix), false);
return FunctionNode("starts_with", {node.children().at(0), prefix_node},
node.return_type());
} else if (RE2::FullMatch(pattern, ends_with_regex_)) {
} else if (RE2::FullMatch(pattern, ends_with_regex)) {
auto suffix = pattern.substr(2); // skip .*
auto parsed_suffix = RemovePatternEscapeChars(node, suffix);
auto suffix_node = std::make_shared<LiteralNode>(
literal_type, LiteralHolder(parsed_suffix), false);
return FunctionNode("ends_with", {node.children().at(0), suffix_node},
node.return_type());
} else if (RE2::FullMatch(pattern, is_substr_regex_)) {
} else if (RE2::FullMatch(pattern, is_substr_regex)) {
auto substr =
pattern.substr(2, pattern.length() - 4); // trim starting and ending .*
auto parsed_substr = RemovePatternEscapeChars(node, substr);
Expand Down Expand Up @@ -115,9 +129,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));

*holder = lholder;
*holder = std::move(lholder);
return Status::OK();
}

Expand All @@ -136,9 +151,10 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap

auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));

*holder = lholder;
*holder = std::move(lholder);
return Status::OK();
}

Expand All @@ -151,9 +167,10 @@ Status LikeHolder::Make(const std::string& sql_pattern,
lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern, regex_op));

ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));
Status::Invalid("Building RE2 pattern '", pcre_pattern,
"' failed with: ", lholder->regex_.error()));

*holder = lholder;
*holder = std::move(lholder);
return Status::OK();
}

Expand All @@ -180,9 +197,10 @@ Status ReplaceHolder::Make(const std::string& sql_pattern,
std::shared_ptr<ReplaceHolder>* holder) {
auto lholder = std::shared_ptr<ReplaceHolder>(new ReplaceHolder(sql_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
Status::Invalid("Building RE2 pattern '", sql_pattern,
"' failed with: ", lholder->regex_.error()));

*holder = lholder;
*holder = std::move(lholder);
return Status::OK();
}

Expand Down Expand Up @@ -210,9 +228,10 @@ Status ExtractHolder::Make(const std::string& sql_pattern,
std::shared_ptr<ExtractHolder>* holder) {
auto lholder = std::shared_ptr<ExtractHolder>(new ExtractHolder(sql_pattern));
ARROW_RETURN_IF(!lholder->regex_.ok(),
Status::Invalid("Building RE2 pattern '", sql_pattern, "' failed"));
Status::Invalid("Building RE2 pattern '", sql_pattern,
"' failed with: ", lholder->regex_.error()));

*holder = lholder;
*holder = std::move(lholder);
return Status::OK();
}

Expand Down
4 changes: 0 additions & 4 deletions cpp/src/gandiva/regex_functions_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,6 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder {

std::string pattern_; // posix pattern string, to help debugging
RE2 regex_; // compiled regex for the pattern

static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with
static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with
static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr
};

/// Function Holder for 'replace'
Expand Down

0 comments on commit 0be4763

Please sign in to comment.