Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix mean computation for the geometric distribution in the data generator #15282

Merged
13 changes: 7 additions & 6 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,15 @@ double get_distribution_mean(distribution_params<T> const& dist)
case distribution_id::NORMAL:
case distribution_id::UNIFORM: return (dist.lower_bound / 2.) + (dist.upper_bound / 2.);
case distribution_id::GEOMETRIC: {
auto const range_size = dist.lower_bound < dist.upper_bound
? dist.upper_bound - dist.lower_bound
: dist.lower_bound - dist.upper_bound;
auto const p = geometric_dist_p(range_size);
// Geometric distribution is approximated by a half-normal distribution
vuule marked this conversation as resolved.
Show resolved Hide resolved
// Doubling the standard deviation because the dist range only includes half of the (unfolded)
// normal distribution
auto const gauss_std_dev = std_dev_from_range(dist.lower_bound, dist.upper_bound) * 2;
auto const half_gauss_mean = gauss_std_dev * sqrt(2. / M_PI);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (dist.lower_bound < dist.upper_bound)
return dist.lower_bound + (1. / p);
return dist.lower_bound + half_gauss_mean;
else
return dist.lower_bound - (1. / p);
return dist.lower_bound - half_gauss_mean;
}
default: CUDF_FAIL("Unsupported distribution type.");
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ class data_profile {
std::map<cudf::type_id, distribution_params<double>> float_params;
distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
distribution_params<cudf::list_view> list_dist_desc{
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2};
distribution_params<cudf::struct_view> struct_dist_desc{
{cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2};
std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
Expand Down
44 changes: 28 additions & 16 deletions cpp/benchmarks/common/random_distribution_factory.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,15 +44,25 @@ using integral_to_realType =
T,
std::conditional_t<sizeof(T) * 8 <= 23, float, double>>;

// Standard deviation such that most samples fall within the given range.
// The range is treated as spanning six standard deviations (three on each side of the mean),
// which covers about 99.7% of the samples of a normal distribution.
// The bounds may be passed in either order; the result is never negative.
template <typename T>
constexpr double std_dev_from_range(T lower_bound, T upper_bound)
{
  // Number of standard deviations covered by the whole range (mean +/- 3 sigma)
  constexpr double k = 6.0;
  // Convert before subtracting so unsigned bounds cannot wrap around
  auto const lo = static_cast<double>(lower_bound);
  auto const hi = static_cast<double>(upper_bound);
  // Manual absolute difference: std::abs is not constexpr before C++23, which would make this
  // function unusable in constant expressions on conforming compilers
  auto const range_size = hi < lo ? lo - hi : hi - lo;
  return range_size / k;
}

/**
* @brief Generates a normal distribution whose samples mostly fall between lower_bound and upper_bound.
*/
template <typename T>
auto make_normal_dist(T lower_bound, T upper_bound)
{
using realT = integral_to_realType<T>;
T const mean = lower_bound + (upper_bound - lower_bound) / 2;
T const stddev = (upper_bound - lower_bound) / 6;
using realT = integral_to_realType<T>;
realT const mean = lower_bound / 2. + upper_bound / 2.;
realT const stddev = std_dev_from_range(lower_bound, upper_bound);
return thrust::random::normal_distribution<realT>(mean, stddev);
}

Expand All @@ -68,14 +78,6 @@ auto make_uniform_dist(T range_start, T range_end)
return thrust::uniform_real_distribution<T>(range_start, range_end);
}

// Success probability `p` of a geometric distribution chosen so that 99% of the samples land
// within `range_size` trials, i.e. 1 - (1 - p)^range_size == 0.99.
// Never returns zero: if the computed probability is zero, machine epsilon is returned instead.
template <typename T>
double geometric_dist_p(T range_size)
{
  constexpr double percentage_in_range = 0.99;
  // (1 - p), obtained as the range_size-th root of the out-of-range fraction
  auto const miss_probability = exp(log(1 - percentage_in_range) / range_size);
  double const p              = 1 - miss_probability;
  if (p) { return p; }
  return std::numeric_limits<double>::epsilon();
}

/**
* @brief Generates a geometric distribution between lower_bound and upper_bound.
* This distribution is an approximation generated using normal distribution.
Expand All @@ -89,10 +91,17 @@ class geometric_distribution : public thrust::random::normal_distribution<integr
T _lower_bound;
T _upper_bound;

// Builds the zero-centered normal distribution used to approximate the geometric distribution.
super_t make_approx_normal_dist(T lower_bound, T upper_bound) const
{
  // Non-negative width of the requested range, regardless of which bound is larger
  auto const span = std::abs(static_cast<realType>(upper_bound) - lower_bound);
  // Symmetric around zero; the sampled value is shifted by lower_bound afterwards
  auto const negated_span = -span;
  return make_normal_dist(negated_span, span);
}

public:
using result_type = T;
__host__ __device__ explicit geometric_distribution(T lower_bound, T upper_bound)
: super_t(0, std::labs(upper_bound - lower_bound) / 4.0),
explicit geometric_distribution(T lower_bound, T upper_bound)
: super_t(make_approx_normal_dist(lower_bound, upper_bound)),
_lower_bound(lower_bound),
_upper_bound(upper_bound)
{
Expand All @@ -101,8 +110,11 @@ class geometric_distribution : public thrust::random::normal_distribution<integr
template <typename UniformRandomNumberGenerator>
__host__ __device__ result_type operator()(UniformRandomNumberGenerator& urng)
{
return _lower_bound < _upper_bound ? std::abs(super_t::operator()(urng)) + _lower_bound
: _lower_bound - std::abs(super_t::operator()(urng));
// Distribution always biases towards lower_bound
realType const result = _lower_bound < _upper_bound
? std::abs(super_t::operator()(urng)) + _lower_bound
: _lower_bound - std::abs(super_t::operator()(urng));
return std::round(result);
}
};

Expand Down