// Adds 1 to one element of g with a plain, unsynchronized read-modify-write.
//
// Each thread builds its global index from the x components of the block and
// thread indices, wraps that index with a modulo by ARRAY_SIZE, and then
// loads, increments and stores the selected element. Threads whose global
// indices are congruent modulo ARRAY_SIZE select the same element, and nothing
// in this kernel orders their loads and stores, so concurrent increments can
// overwrite one another.
//
// g: int array dereferenced inside the kernel; it is indexed with the wrapped
//    index, so it must be valid for elements 0 .. ARRAY_SIZE - 1.
__global__ void increment_naive(int *g)
{
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int slot = global_tid % ARRAY_SIZE;  // many threads share one slot
    g[slot] += 1;                              // separate load, add and store
}
// Adds 1 to one element of g using atomicAdd.
//
// Each thread builds its global index from the x components of the block and
// thread indices and wraps it with a modulo by ARRAY_SIZE, so threads whose
// indices are congruent modulo ARRAY_SIZE select the same element. The
// increment is issued through atomicAdd, which performs the read-modify-write
// on that address as one indivisible operation.
//
// g: int array dereferenced inside the kernel; it is indexed with the wrapped
//    index, so it must be valid for elements 0 .. ARRAY_SIZE - 1.
__global__ void increment_atomic(int *g)
{
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int slot = global_tid % ARRAY_SIZE;  // many threads share one slot
    atomicAdd(g + slot, 1);
}
// Program entry point: prints the launch geometry and sets up the host-side
// array and its byte size.
//
// argc, argv: command-line arguments; not read in this excerpt.
//
// NOTE(review): timer, array and ARRAY_BYTES are declared but not used within
// the lines visible here — presumably the rest of the program (allocation,
// kernel launch, copy-back) follows in the full source; confirm.
int main(int argc, char **argv)
{
    // NOTE(review): PpuTimer is not declared in this excerpt — confirm the
    // type name against the project's timer header.
    PpuTimer timer;

    // Report total threads, number of blocks (NUM_THREADS / BLOCK_WIDTH) and
    // the number of array elements they write into.
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // Local array of ARRAY_SIZE ints (left uninitialized here).
    int array[ARRAY_SIZE];

    // Size of the array in bytes. The type must be spelled `const int`;
    // `const_int` is a single identifier, not a type, and does not compile.
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
}
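Strategies for efficient CUDA programming
1. High arithmetic intensity: maximize the ratio of math to memory operations (math/memory)
   - Minimize time spent on memory
   - Put data in faster memory:
     local > shared > global (fastest to slowest)
   - Use coalesced global memory access (adjacent threads access adjacent addresses)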