$define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <cub/cub.cuh>
using namespace cub;
bool g_verbose = false;
int g_iterations = 100;
// ---------
// Kernels
// ---------
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
int *d_in,
int *d_out,
clock_t *d_elapsed)
{
typedef BlockScan<int, BLOCK_THREADS > BlockScanT;
__shared__ typename BlockScanT::SmemStorage smem_storage;
int data[ITEMS_PER_THREAD];
BlockLoadVectorized(d_in, data);
clock_t start = clock();
int aggregate;
BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
clock_t stop = clock();
BlockStoreVectorized(d_out, data);
if(threadIdx.x == 0)
{
*d_elapsed = (start > stop)? start - stop : stop - start;
d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
}
}
int Initialize(
int *h_in,
int *h_reference,
int num_elements)
{
int inclusive = 0;
for (int i = 0; i< num_elements; ++i)
{
h_in[i] = i % 17;
h_reference[i] = inclusive;
inclusive += h_in[i];
}
return inclusive;
}
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void Test()
{
const int TILE_SIZE = BLOCJ_THREAD * ITEMS_PER_THREAD;
int *h_in = new int[TILE_SIZE];
int *h_reference = new int[TILE_SIZE];
int *h_gpu = new int[TILE_SIZE + 1];
int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
int *d_in = NULL;
int *d_out = NULL;
clock_t *d_elapsed = NULL;
cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE);
cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
cudaMalloc((void**)&d_elapsed, sizeof(clock_t));
if (g_verbose)
{
printf("Input data: ");
for (int i = 0; i < TILE_SIZE; i++)
printf("%d, ", h_in[i]);
printf("\n\n");
}
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
printf("BlockScan %d items (%d threads, %d items per thread):",
TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);
clock_t elapsed_scan_clocks = 0;
for (int i = 0; i < g_iterations; ++i)
{
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
d_in,
d_otu,
d_elapsed);
clock_t scan_clocks;
cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
elapsed_scan_clocks += scan_clocks;
}
bool correct = true;
for (int i = 0; i < TILE_SIZE; i++)
{
if (h_gpu[i] != h_reference[i])
{
printf("Incorrect result @ offset %d (%d != %d)\n",
i, h_gpu[i], h_reference[i]);
correct = false;
break;
}
}
if (h_gpu[TILE_SIZE] != h_aggregate)
{
printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
correct = false;
}
if (correct) printf("Correct!\n");
if (g_verbose)
{
printf("GPu output(reference output): ");
for (int i = 0; i < TILE_SIZE; i++)
printf("%d (%d), ", h_gpu[i], h_reference[i]);
printf("\n");
printf("GPU aggregate (reference aggregate)", h_gpu[TILE_SIZE], h_aggregate);
printf("\n\n");
}
printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (h_gpu) delete[] h_gpu;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
if (d_elapsed) cudaFree(d_elapsed);
}
int main(int argc, char** argv)
{
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
printf("Using device %s\n", props.name);
Test<1024, 1>();
Test<512, 2>();
Test<256, 4>();
Test<128, 8>();
Test<64, 16>();
Test<32, 32>();
Test<16, 64>();
return 0;
}