Atomic memory operations

// Adds 1 to one element of g per thread using a plain (non-atomic)
// load / add / store sequence.
//
// The thread's flat 1D index is wrapped modulo ARRAY_SIZE, so when more
// threads run than there are elements, several threads address the same
// element and their unsynchronized updates can overwrite one another.
//
// g: array holding at least ARRAY_SIZE ints (presumably device memory;
//    the allocation is not shown alongside this kernel).
__global__ void increment_naive(int *g)
{
	// Flat thread index across the 1D grid.
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;
	// Fold the index back into the valid element range.
	const int slot = tid % ARRAY_SIZE;

	// Read, bump, write back: three separate steps with no atomicity.
	const int bumped = g[slot] + 1;
	g[slot] = bumped;
}

// Adds 1 to one element of g per thread using atomicAdd, so increments
// from different threads that land on the same element are all applied.
//
// The thread's flat 1D index is wrapped modulo ARRAY_SIZE to select the
// element, which means multiple threads may share a target element.
//
// g: array holding at least ARRAY_SIZE ints (presumably device memory;
//    the allocation is not shown alongside this kernel).
__global__ void increment_atomic(int *g)
{
	// Flat thread index across the 1D grid.
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Address of the element this thread is responsible for.
	int *target = g + (tid % ARRAY_SIZE);
	atomicAdd(target, 1);
}

// Host entry point for the atomic-increment demo. Only the setup portion
// is present here: it prints the thread/block/array configuration and
// declares the host array together with its size in bytes.
//
// argc, argv: standard command-line arguments; not used by the code shown.
// Returns 0.
int main(int argc, char **argv)
{
	// NOTE(review): PpuTimer is not declared in the visible code; it may be a
	// misspelling of a GPU timer helper type. Confirm the intended name.
	PpuTimer timer;
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

	// Host-side array with one int per element the kernels write to.
	int array[ARRAY_SIZE];
	// Size of the array in bytes ("const_int" was a typo for "const int").
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

	return 0;
}

Strategies for efficient CUDA programming
1. High arithmetic intensity: maximize the ratio of math to memory operations (math/memory)
- Minimize time spent on memory accesses
- Put frequently used data in faster memory
  (speed: local > shared > global)
- Use coalesced global memory accesses