CUB

$define CUB_STDERR

#include <stdio.h>
#include <iostream>

#include <cub/cub.cuh>

using namespace cub;

bool g_verbose = false;
int g_iterations = 100;

// ---------
// Kernels
// ---------

template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
	int *d_in,
	int *d_out,
	clock_t *d_elapsed)
{
	typedef BlockScan<int, BLOCK_THREADS > BlockScanT;

	__shared__ typename BlockScanT::SmemStorage smem_storage;

	int data[ITEMS_PER_THREAD];
	BlockLoadVectorized(d_in, data);

	clock_t start = clock();

	int aggregate;
	BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);

	clock_t stop = clock();

	BlockStoreVectorized(d_out, data);

	if(threadIdx.x == 0)
	{
		*d_elapsed = (start > stop)? start - stop : stop - start;
		d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
	}
}

int Initialize(
	int *h_in,
	int *h_reference,
	int num_elements)
{
	int inclusive = 0;

	for (int i = 0; i< num_elements; ++i)
	{
		h_in&#91;i&#93; = i % 17;

		h_reference&#91;i&#93; = inclusive;
		inclusive += h_in&#91;i&#93;;
	}
	return inclusive;
}

template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
void Test()
{
	const int TILE_SIZE = BLOCJ_THREAD * ITEMS_PER_THREAD;

	int *h_in = new int[TILE_SIZE];
	int *h_reference = new int[TILE_SIZE];
	int *h_gpu = new int[TILE_SIZE + 1];

	int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

	int *d_in = NULL;
	int *d_out = NULL;
	clock_t *d_elapsed = NULL;
	cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE);
	cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
	cudaMalloc((void**)&d_elapsed, sizeof(clock_t));

	if (g_verbose)
	{
		printf("Input data: ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d, ", h_in&#91;i&#93;);
		printf("\n\n");
	}

	cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

	printf("BlockScan %d items (%d threads, %d items per thread):",
		TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);

	clock_t elapsed_scan_clocks = 0;
	for (int i = 0; i < g_iterations; ++i)
	{
		BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
			d_in,
			d_otu,
			d_elapsed);
		clock_t scan_clocks;
		cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
		cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
		elapsed_scan_clocks += scan_clocks;
	}

	bool correct = true;
	for (int i = 0; i < TILE_SIZE; i++)
	{
		if (h_gpu&#91;i&#93; != h_reference&#91;i&#93;)
		{
			printf("Incorrect result @ offset %d (%d != %d)\n",
				i, h_gpu&#91;i&#93;, h_reference&#91;i&#93;);
			correct = false;
			break;
		}
	}

	if (h_gpu&#91;TILE_SIZE&#93; != h_aggregate)
	{
		printf("Incorrect aggregate (%d != %d)\n", h_gpu&#91;TILE_SIZE&#93;, h_aggregate);
		correct = false;
	}
	if (correct) printf("Correct!\n");

	if (g_verbose)
	{
		printf("GPu output(reference output): ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d (%d), ", h_gpu&#91;i&#93;, h_reference&#91;i&#93;);
		printf("\n");
		printf("GPU aggregate (reference aggregate)", h_gpu&#91;TILE_SIZE&#93;, h_aggregate);
		printf("\n\n");
	}

	printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);

	if (h_in) delete&#91;&#93; h_in;
	if (h_reference) delete&#91;&#93; h_reference;
	if (h_gpu) delete&#91;&#93; h_gpu;
	if (d_in) cudaFree(d_in);
	if (d_out) cudaFree(d_out);
	if (d_elapsed) cudaFree(d_elapsed);
}

int main(int argc, char** argv)
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);
	printf("Using device %s\n", props.name);

	Test<1024, 1>();
	Test<512, 2>();
	Test<256, 4>();
	Test<128, 8>();
	Test<64, 16>();
	Test<32, 32>();
	Test<16, 64>();

	return 0;
}