Skip to content

Commit

Permalink
Add missing atomic operators, refactor atomic operators, move atomic …
Browse files Browse the repository at this point in the history
…operators to detail namespace. (#14962)

This PR does a thorough refactoring of `device_atomics.cuh`.

- I moved all atomic-related functions to `cudf::detail::` (making this an API-breaking change, but most likely a low-impact break)
- I added all missing operators for natively supported types to `atomicAdd`, `atomicMin`, `atomicMax`, etc. as discussed in #10149 and #14907.
  - This should prevent fallback to the `atomicCAS` path for types that are natively supported for those atomic operators, which we suspect as the root cause of the performance regression in #14886.
- I kept `atomicAdd` rather than `cudf::detail::atomic_add` in locations where a native CUDA overload exists, and the same for min/max/CAS operations. Aggregations are the only place where we use the special overloads. We were previously calling the native CUDA function rather than our special overloads in many cases, so I retained the previous behavior. This avoids including the additional headers that implement an unnecessary level of wrapping for natively supported overloads.
- I enabled native 2-byte CAS operations (on `unsigned short int`) that eliminate the do-while loop and extra alignment-checking logic
  - The CUDA docs don't state this, but some forum posts claim this is only supported by compute capability 7.0+. We now have 7.0 as a lower bound for RAPIDS so I'm not concerned by this as long as builds/tests pass.
- I improved/cleaned the documentation and moved around some code so that the operators were in a logical order.
- I assessed the existing tests and it looks like all the types are being covered. I'm not sure if there is a good way to enforce that certain types (like `uint64_t`) are passing through native `atomicAdd` calls.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Suraj Aralihalli (https://github.com/SurajAralihalli)

URL: #14962
  • Loading branch information
bdice committed Mar 12, 2024
1 parent d48b904 commit 155405b
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 140 deletions.
3 changes: 2 additions & 1 deletion cpp/benchmarks/join/generate_input_tables.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@

#pragma once

#include <cudf/detail/utilities/device_atomics.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/distance.h>
Expand Down
46 changes: 24 additions & 22 deletions cpp/include/cudf/detail/aggregation/aggregation.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -144,8 +144,8 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::MIN>;
atomicMin(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));
cudf::detail::atomic_min(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand All @@ -170,8 +170,8 @@ struct update_target_element<
using DeviceTarget = device_storage_type_t<Target>;
using DeviceSource = device_storage_type_t<Source>;

atomicMin(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
cudf::detail::atomic_min(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand All @@ -193,8 +193,8 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::MAX>;
atomicMax(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));
cudf::detail::atomic_max(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand All @@ -219,8 +219,8 @@ struct update_target_element<
using DeviceTarget = device_storage_type_t<Target>;
using DeviceSource = device_storage_type_t<Source>;

atomicMax(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
cudf::detail::atomic_max(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand All @@ -242,8 +242,8 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::SUM>;
atomicAdd(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));
cudf::detail::atomic_add(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand All @@ -268,8 +268,8 @@ struct update_target_element<
using DeviceTarget = device_storage_type_t<Target>;
using DeviceSource = device_storage_type_t<Source>;

atomicAdd(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));
cudf::detail::atomic_add(&target.element<DeviceTarget>(target_index),
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
Expand Down Expand Up @@ -368,7 +368,7 @@ struct update_target_element<Source,

using Target = target_type_t<Source, aggregation::SUM_OF_SQUARES>;
auto value = static_cast<Target>(source.element<Source>(source_index));
atomicAdd(&target.element<Target>(target_index), value * value);
cudf::detail::atomic_add(&target.element<Target>(target_index), value * value);
if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
};
Expand All @@ -387,8 +387,8 @@ struct update_target_element<Source,
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::PRODUCT>;
atomicMul(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));
cudf::detail::atomic_mul(&target.element<Target>(target_index),
static_cast<Target>(source.element<Source>(source_index)));
if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); }
}
};
Expand All @@ -408,7 +408,7 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::COUNT_VALID>;
atomicAdd(&target.element<Target>(target_index), Target{1});
cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});

// It is assumed the output for COUNT_VALID is initialized to be all valid
}
Expand All @@ -427,7 +427,7 @@ struct update_target_element<
size_type source_index) const noexcept
{
using Target = target_type_t<Source, aggregation::COUNT_ALL>;
atomicAdd(&target.element<Target>(target_index), Target{1});
cudf::detail::atomic_add(&target.element<Target>(target_index), Target{1});

// It is assumed the output for COUNT_ALL is initialized to be all valid
}
Expand All @@ -449,10 +449,11 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::ARGMAX>;
auto old = atomicCAS(&target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
auto old = cudf::detail::atomic_cas(
&target.element<Target>(target_index), ARGMAX_SENTINEL, source_index);
if (old != ARGMAX_SENTINEL) {
while (source.element<Source>(source_index) > source.element<Source>(old)) {
old = atomicCAS(&target.element<Target>(target_index), old, source_index);
old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
}
}

Expand All @@ -476,10 +477,11 @@ struct update_target_element<
if (source_has_nulls and source.is_null(source_index)) { return; }

using Target = target_type_t<Source, aggregation::ARGMIN>;
auto old = atomicCAS(&target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
auto old = cudf::detail::atomic_cas(
&target.element<Target>(target_index), ARGMIN_SENTINEL, source_index);
if (old != ARGMIN_SENTINEL) {
while (source.element<Source>(source_index) < source.element<Source>(old)) {
old = atomicCAS(&target.element<Target>(target_index), old, source_index);
old = cudf::detail::atomic_cas(&target.element<Target>(target_index), old, source_index);
}
}

Expand Down

0 comments on commit 155405b

Please sign in to comment.