$define CUB_STDERR #include <stdio.h> #include <iostream> #include <cub/cub.cuh> using namespace cub; bool g_verbose = false; int g_iterations = 100; // --------- // Kernels // --------- template < int BLOCK_THREADS, int ITEMS_PER_THREAD> __global__ void BlockPrefixSumKernel( int *d_in, int *d_out, clock_t *d_elapsed) { typedef BlockScan<int, BLOCK_THREADS > BlockScanT; __shared__ typename BlockScanT::SmemStorage smem_storage; int data[ITEMS_PER_THREAD]; BlockLoadVectorized(d_in, data); clock_t start = clock(); int aggregate; BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate); clock_t stop = clock(); BlockStoreVectorized(d_out, data); if(threadIdx.x == 0) { *d_elapsed = (start > stop)? start - stop : stop - start; d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; } } int Initialize( int *h_in, int *h_reference, int num_elements) { int inclusive = 0; for (int i = 0; i< num_elements; ++i) { h_in[i] = i % 17; h_reference[i] = inclusive; inclusive += h_in[i]; } return inclusive; } template < int BLOCK_THREADS, int ITEMS_PER_THREAD> void Test() { const int TILE_SIZE = BLOCJ_THREAD * ITEMS_PER_THREAD; int *h_in = new int[TILE_SIZE]; int *h_reference = new int[TILE_SIZE]; int *h_gpu = new int[TILE_SIZE + 1]; int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); int *d_in = NULL; int *d_out = NULL; clock_t *d_elapsed = NULL; cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE); cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) printf("%d, ", h_in[i]); printf("\n\n"); } cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf("BlockScan %d items (%d threads, %d items per thread):", TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD); clock_t elapsed_scan_clocks = 0; for (int i = 0; i < g_iterations; ++i) { BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>( d_in, d_otu, d_elapsed); clock_t scan_clocks; cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost); cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost); elapsed_scan_clocks += scan_clocks; } bool correct = true; for (int i = 0; i < TILE_SIZE; i++) { if (h_gpu[i] != h_reference[i]) { printf("Incorrect result @ offset %d (%d != %d)\n", i, h_gpu[i], h_reference[i]); correct = false; break; } } if (h_gpu[TILE_SIZE] != h_aggregate) { printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate); correct = false; } if (correct) printf("Correct!\n"); if (g_verbose) { printf("GPu output(reference output): "); for (int i = 0; i < TILE_SIZE; i++) printf("%d (%d), ", h_gpu[i], h_reference[i]); printf("\n"); printf("GPU aggregate (reference aggregate)", h_gpu[TILE_SIZE], h_aggregate); printf("\n\n"); } printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations); if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_gpu) delete[] h_gpu; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); if (d_elapsed) cudaFree(d_elapsed); } int main(int argc, char** argv) { cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); printf("Using device %s\n", props.name); Test<1024, 1>(); Test<512, 2>(); Test<256, 4>(); Test<128, 8>(); Test<64, 16>(); Test<32, 32>(); Test<16, 64>(); return 0; }