// shared memory

// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
// Demonstrates __shared__ memory: each thread stages one element of the
// global-memory `array` into a block-shared buffer, then averages the
// elements below its own index and clamps its own element to that average.
//
// Expects blockDim.x <= 128 (capacity of sh_arr) and `array` pointing to
// at least blockDim.x floats in device global memory.
//
// Fixes vs. original: parameter was `float in` while the body read/wrote
// `array` (did not compile); `[`/`]` were garbled as HTML entities; the
// final store used a double literal (3.14) instead of 3.14f.
__global__ void use_local_memory_GPU(float *array)
{
	int i, index = threadIdx.x;
	float average, sum = 0.0f;

	// Block-shared staging buffer; one slot per thread in this block.
	__shared__ float sh_arr[128];
	sh_arr[index] = array[index];

	// Barrier: every slot must be written before any thread reads others'.
	__syncthreads();

	for (i = 0; i < index; i++) { sum += sh_arr[i]; }
	average = sum / (index + 1.0f);
	if (array[index] > average) { array[index] = average; }

	// Dead store: shared memory's lifetime ends with the block, so this
	// write is never observable outside the kernel — kept to illustrate
	// that shared memory does not persist.
	sh_arr[index] = 3.14f;
}
// Illustrates a mix of global-memory reads and writes through `g`.
// Assumes `g` has room for the largest index touched here
// (BLOCK_WIDTH/2 + blockDim.x elements at minimum — confirm at the launch
// site; BLOCK_WIDTH must be #defined before this function).
//
// NOTE(review): adjacent threads write overlapping slots (e.g. g[i*2] and
// g[BLOCK_WIDTH-1-i]), so results depend on thread scheduling — this looks
// like an access-pattern demo, not production code.
//
// Fixes vs. original: typo BLOCK_wIDTH -> BLOCK_WIDTH; 3.14 (double
// literal, silent promotion) -> 3.14f.
__global__ void foo(float *g)
{
	float a = 3.14f;
	int i = threadIdx.x;

	g[i] = a;
	g[i * 2] = a;
	a = g[i];
	a = g[BLOCK_WIDTH / 2 + i];
	g[i] = a * g[BLOCK_WIDTH / 2 + i];
	g[BLOCK_WIDTH - 1 - i] = a;
}
#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE 10

#define BLOCK_WIDTH 1000

// Forward declaration (definition not visible in this chunk).
// Fix vs. original: the missing ';' made the next kernel a parse error.
void print_array(int *array, int size);

// Deliberately racy increment: many threads perform a non-atomic
// read-modify-write on the same slots of `g`, so lost updates are expected
// and the final counts are nondeterministic. Kept non-atomic on purpose —
// this is the "naive" baseline for an atomics comparison (use atomicAdd
// for the correct version).
__global__ void increment_naive(int *g)
{
	// Flat global thread id, folded into the small output array.
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	int slot = tid % ARRAY_SIZE;

	// Read-modify-write with no atomicity: a data race across threads.
	g[slot] = g[slot] + 1;
}
// Entry point: reports the launch shape for the increment demo
// (NUM_THREADS threads in NUM_THREADS/BLOCK_WIDTH blocks hitting an
// ARRAY_SIZE-element array).
// NOTE(review): `timer` is declared but never used in the visible code,
// and no kernel is launched here — presumably allocation, launch, and
// timing live in a part of the file not shown; confirm against the full
// source before shipping.
int main(int argc, char **argv)
{
	GpuTimer timer;
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
}