Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Add unique_count algorithm #1619

Merged
merged 10 commits into from
May 7, 2022
125 changes: 125 additions & 0 deletions testing/cuda/unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -320,3 +320,128 @@ void TestUniqueCopyCudaStreamsNoSync()
}
DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync);


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
__global__
void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
{
*result = thrust::unique_count(exec, first, last);
}


template<typename ExecutionPolicy, typename Iterator1, typename BinaryPredicate, typename Iterator2>
__global__
void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result)
{
*result = thrust::unique_count(exec, first, last, pred);
}


template<typename ExecutionPolicy>
void TestUniqueCountDevice(ExecutionPolicy exec)
{
typedef thrust::device_vector<int> Vector;
typedef Vector::value_type T;

Vector data(10);
data[0] = 11;
data[1] = 11;
data[2] = 12;
data[3] = 20;
data[4] = 29;
data[5] = 21;
data[6] = 21;
data[7] = 31;
data[8] = 31;
data[9] = 37;

Vector output(1, -1);

unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin());
{
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);
}

ASSERT_EQUAL(output[0], 7);

unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_equal_div_10_unique<T>(), output.begin());
{
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);
}

ASSERT_EQUAL(output[0], 3);
}


void TestUniqueCountDeviceSeq()
{
TestUniqueCountDevice(thrust::seq);
}
DECLARE_UNITTEST(TestUniqueCountDeviceSeq);


void TestUniqueCountDeviceDevice()
{
TestUniqueCountDevice(thrust::device);
}
DECLARE_UNITTEST(TestUniqueCountDeviceDevice);


void TestUniqueCountDeviceNoSync()
{
TestUniqueCountDevice(thrust::cuda::par_nosync);
}
DECLARE_UNITTEST(TestUniqueCountDeviceNoSync);


template<typename ExecutionPolicy>
void TestUniqueCountCudaStreams(ExecutionPolicy policy)
{
typedef thrust::device_vector<int> Vector;
typedef Vector::value_type T;

Vector data(10);
data[0] = 11;
data[1] = 11;
data[2] = 12;
data[3] = 20;
data[4] = 29;
data[5] = 21;
data[6] = 21;
data[7] = 31;
data[8] = 31;
data[9] = 37;

cudaStream_t s;
cudaStreamCreate(&s);

auto streampolicy = policy.on(s);

int result = thrust::unique_count(streampolicy, data.begin(), data.end());
cudaStreamSynchronize(s);

ASSERT_EQUAL(result, 7);

result = thrust::unique_count(streampolicy, data.begin(), data.end(), is_equal_div_10_unique<T>());
cudaStreamSynchronize(s);

ASSERT_EQUAL(result, 3);

cudaStreamDestroy(s);
}

void TestUniqueCountCudaStreamsSync()
{
TestUniqueCountCudaStreams(thrust::cuda::par);
}
DECLARE_UNITTEST(TestUniqueCountCudaStreamsSync);


void TestUniqueCountCudaStreamsNoSync()
{
TestUniqueCountCudaStreams(thrust::cuda::par_nosync);
}
DECLARE_UNITTEST(TestUniqueCountCudaStreamsNoSync);

107 changes: 102 additions & 5 deletions testing/unique.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <unittest/unittest.h>
#include <unittest/iterator_helpers.h>
#include <thrust/unique.h>
#include <thrust/functional.h>
#include <thrust/iterator/discard_iterator.h>
Expand Down Expand Up @@ -95,6 +96,50 @@ void TestUniqueCopyDispatchImplicit()
DECLARE_UNITTEST(TestUniqueCopyDispatchImplicit);


template <typename ForwardIterator>
typename thrust::iterator_traits<ForwardIterator>::difference_type
unique_count(my_system &system,
ForwardIterator,
ForwardIterator)
{
system.validate_dispatch();
return 0;
}

void TestUniqueCountDispatchExplicit()
{
thrust::device_vector<int> vec(1);

my_system sys(0);
thrust::unique_count(sys, vec.begin(), vec.begin());

ASSERT_EQUAL(true, sys.is_valid());
}
DECLARE_UNITTEST(TestUniqueCountDispatchExplicit);


template <typename ForwardIterator>
typename thrust::iterator_traits<ForwardIterator>::difference_type
unique_count(my_tag,
ForwardIterator,
ForwardIterator)
{
return 13;
}

void TestUniqueCountDispatchImplicit()
{
thrust::device_vector<int> vec(1);

auto result = thrust::unique_count(
thrust::retag<my_tag>(vec.begin()),
thrust::retag<my_tag>(vec.begin()));

ASSERT_EQUAL(13, result);
}
DECLARE_UNITTEST(TestUniqueCountDispatchImplicit);


template<typename T>
struct is_equal_div_10_unique
{
Expand All @@ -119,11 +164,13 @@ void TestUniqueSimple(void)
data[8] = 31;
data[9] = 37;

typename Vector::iterator new_last;
forward_iterator_wrapper<typename Vector::iterator> new_last;
const auto begin = make_forward_iterator_wrapper(data.begin());
const auto end = make_forward_iterator_wrapper(data.end());

new_last = thrust::unique(data.begin(), data.end());
new_last = thrust::unique(begin, end);

ASSERT_EQUAL(new_last - data.begin(), 7);
ASSERT_EQUAL(thrust::distance(begin, new_last), 7);
ASSERT_EQUAL(data[0], 11);
ASSERT_EQUAL(data[1], 12);
ASSERT_EQUAL(data[2], 20);
Expand All @@ -132,9 +179,9 @@ void TestUniqueSimple(void)
ASSERT_EQUAL(data[5], 31);
ASSERT_EQUAL(data[6], 37);

new_last = thrust::unique(data.begin(), new_last, is_equal_div_10_unique<T>());
new_last = thrust::unique(begin, new_last, is_equal_div_10_unique<T>());

ASSERT_EQUAL(new_last - data.begin(), 3);
ASSERT_EQUAL(thrust::distance(begin, new_last), 3);
ASSERT_EQUAL(data[0], 11);
ASSERT_EQUAL(data[1], 20);
ASSERT_EQUAL(data[2], 31);
Expand Down Expand Up @@ -266,3 +313,53 @@ struct TestUniqueCopyToDiscardIterator
VariableUnitTest<TestUniqueCopyToDiscardIterator, IntegralTypes> TestUniqueCopyToDiscardIteratorInstance;


template <typename Vector>
void TestUniqueCountSimple(void)
{
typedef typename Vector::value_type T;

Vector data(10);
data[0] = 11;
data[1] = 11;
data[2] = 12;
data[3] = 20;
data[4] = 29;
data[5] = 21;
data[6] = 21;
data[7] = 31;
data[8] = 31;
data[9] = 37;

int count = thrust::unique_count(
make_forward_iterator_wrapper(data.begin()),
make_forward_iterator_wrapper(data.end()));

ASSERT_EQUAL(count, 7);

int div_10_count = thrust::unique_count(
make_forward_iterator_wrapper(data.begin()),
make_forward_iterator_wrapper(data.end()),
is_equal_div_10_unique<T>());

ASSERT_EQUAL(div_10_count, 3);
}
DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCountSimple);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unique_count claims to work with forward iterators. There should be a test using forward iterators.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a nice way to wrap an iterator into a forward_iterator in Thrust? I wrote a small wrapper class and that seems to work and compile, but I suppose that problem has been solved elsewhere already?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ericniebler is right, we should be testing this. Unfortunately we lack robust testing for these sorts of things.

Thanks for adding the new testing infrastructure! Please include it in this PR, ideally in the testing framework so we can reuse it from other tests later 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a test to unique_copy and unique. I am not 100% sure it does what we would expect - due to the missing iterator tag, it gets executed sequentially on the CPU using device references for access.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean, forward iterators always dispatch to the CPU? @allisonvacanti can you comment on that? I mean, it seems reasonable to me.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounded odd to me, as I've never seen any logic in Thrust that would dispatch forward iterators to serial implementations. So I dug into it, and unfortunately this is due to a pretty nasty bug in Thrust's iterator traits.

The details are gory, but I've summarized them in a comment on #902.

Until that's fixed, I'm not comfortable merging this test using the forward_iterator_wrapper, since they only "do the right thing" because the iterator framework is broken.

I hate to churn on this PR even more, but I think we should remove the iterator wrappers for now and just test that the regular iterators work. We can re-introduce the wrapper tests as part of #55, after #902 is fixed and settled.

@ericniebler Can you review the two linked issues and see if you agree with my suggestion?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that forward iterators actually need to be dispatched to the sequential backend. They support multipass reading and should be usable in a parallel algorithm, so long as they're only copied and incremented. Is there something in the unique_count/count_if algorithms that would break them?

Copy link
Contributor Author

@upsj upsj May 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main issue I see with parallel execution on forward iterators is that they introduce an essentially linear dependency chain that means either every thread i starts from begin and increments it i times, or waits until one of its predecessors j is done and writes its iterator somewhere, to then increments it i - j times. Both don't really seem useful for parallel execution to me.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will require more increments, but if the work-per-element is expensive compared to the cost of the iterator increment, it may still make sense to parallelize. I'd rather let the user make that call, since they can opt-in to sequential execution by passing in the sequential exeuction policy (thrust::seq).

More importantly, the sequential implementation executes on CPU, and some types of device memory aren't accessible from the CPU's address space, so switching to seq really needs to be opt-in rather than default.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the clarification, that makes sense! I was only thinking of simple but massively parallel cases.


template <typename T>
struct TestUniqueCount
{
void operator()(const size_t n)
{
thrust::host_vector<T> h_data = unittest::random_integers<bool>(n);
thrust::device_vector<T> d_data = h_data;

int h_count{};
int d_count{};

h_count = thrust::unique_count(h_data.begin(), h_data.end());
d_count = thrust::unique_count(d_data.begin(), d_data.end());

ASSERT_EQUAL(h_count, d_count);
}
};
VariableUnitTest<TestUniqueCount, IntegralTypes> TestUniqueCountInstance;
71 changes: 71 additions & 0 deletions testing/unittest/iterator_helpers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#pragma once

#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/iterator_categories.h>
#include <type_traits>


// Wraps an existing iterator into a forward iterator,
// thus removing some of its functionality
template <typename Iterator>
struct forward_iterator_wrapper {
// LegacyIterator requirements
using iterator_system_tag = typename thrust::iterator_system<Iterator>::type;
using reference = typename thrust::iterator_traits<Iterator>::reference;
using pointer = typename thrust::iterator_traits<Iterator>::pointer;
using value_type = typename thrust::iterator_traits<Iterator>::value_type;
using difference_type = typename thrust::iterator_traits<Iterator>::difference_type;
using iterator_category = typename std::conditional<
std::is_convertible<iterator_system_tag, thrust::device_system_tag>::value,
thrust::forward_device_iterator_tag,
typename std::conditional<
std::is_convertible<iterator_system_tag, thrust::host_system_tag>::value,
thrust::forward_host_iterator_tag,
std::forward_iterator_tag>::type>::type;
using base_iterator_category = typename thrust::iterator_traits<Iterator>::iterator_category;
static_assert(
std::is_convertible<base_iterator_category, std::forward_iterator_tag>::value,
"Cannot create forward_iterator_wrapper around an iterator that is not itself at least a forward iterator");

upsj marked this conversation as resolved.
Show resolved Hide resolved
__host__ __device__ reference operator*() const {
return *wrapped;
}

__host__ __device__ forward_iterator_wrapper& operator++() {
++wrapped;
return *this;
}

// LegacyInputIterator
friend __host__ __device__ bool operator==(const forward_iterator_wrapper& a, const forward_iterator_wrapper& b) {
return a.wrapped == b.wrapped;
}

friend __host__ __device__ bool operator!=(const forward_iterator_wrapper& a, const forward_iterator_wrapper& b) {
return !(a == b);
}

__host__ __device__ forward_iterator_wrapper operator++(int) {
auto cpy = *this;
++(*this);
return cpy;
}

template <typename It = Iterator>
__host__ __device__ typename std::enable_if<std::is_pointer<It>::value, pointer>::type operator->() const {
return wrapped;
}

template <typename It = Iterator>
__host__ __device__ typename std::enable_if<!std::is_pointer<It>::value, pointer>::type operator->() const {
return wrapped.operator->();
upsj marked this conversation as resolved.
Show resolved Hide resolved
}

Iterator wrapped;
};


template <typename Iterator>
forward_iterator_wrapper<Iterator> make_forward_iterator_wrapper(Iterator it) {
return {it};
}
2 changes: 1 addition & 1 deletion thrust/count.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,4 @@ template <typename InputIterator, typename Predicate>

THRUST_NAMESPACE_END

#include <thrust/detail/count.inl>
#include <thrust/detail/count.h>
Loading