// Increments one element of g per thread using a plain (non-atomic)
// read-modify-write.
//
// The global thread index is wrapped with % ARRAY_SIZE, so when more threads
// are launched than there are array elements, several threads target the same
// element. Those updates are unsynchronized, so increments can be lost.
//
// g: array of at least ARRAY_SIZE ints; must be dereferenceable from device code.
__global__ void increment_naive(int *g)
{
    // Flat global index of this thread (1D launch).
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Wrap the thread index onto the array so every thread has a valid slot.
    i = i % ARRAY_SIZE;

    // Separate load and store: not atomic with respect to other threads.
    g[i] = g[i] + 1;
}

// Same indexing as increment_naive, but the increment is done with atomicAdd,
// so concurrent updates to the same element are all applied.
//
// g: array of at least ARRAY_SIZE ints; must be dereferenceable from device code.
__global__ void increment_atomic(int *g)
{
    // Flat global index of this thread (1D launch).
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Wrap the thread index onto the array so every thread has a valid slot.
    i = i % ARRAY_SIZE;

    // Hardware read-modify-write: no update is lost under contention.
    atomicAdd(&g[i], 1);
}

// Host entry point: reports the launch configuration and sets up the host-side
// array and its size in bytes.
int main(int argc, char **argv)
{
    // NOTE(review): PpuTimer is declared outside this view; presumably a timing
    // helper (possibly a typo for GpuTimer) -- confirm against its header.
    PpuTimer timer;

    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // Host-side array and its size in bytes.
    int array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
}
Strategies for efficient CUDA programming:
1. Maximize arithmetic intensity (ratio of math operations to memory accesses)
-minimize time spent on memory
-put data in faster memory
local > shared > global (ordered from fastest to slowest)
-use coalesced global memory access