// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
// Demonstrates __shared__ (per-block) memory in CUDA.
// NOTE(review): despite the name "use_local_memory_GPU", the body is the
// classic *shared*-memory example. The original parameter was `float in`,
// but every statement reads/writes `array`, so the parameter was clearly
// meant to be the global-memory pointer `float *array` (fixed here —
// the kernel did not compile otherwise).
//
// Precondition: launch with blockDim.x == 128 and exactly one element of
// `array` per thread (index comes straight from threadIdx.x, unguarded).
__global__ void use_local_memory_GPU(float *array)
{
    int i, index = threadIdx.x;
    float average, sum = 0.0f;

    // Per-block scratchpad: visible to all threads in this block only.
    __shared__ float sh_arr[128];

    // Stage this thread's element into shared memory, then barrier so
    // every slot is written before any thread reads its neighbors.
    sh_arr[index] = array[index];
    __syncthreads();

    // Average of elements 0..index (inclusive) — each thread scans a
    // different-length prefix of the shared array.
    for (i = 0; i < index; i++) { sum += sh_arr[i]; }
    average = sum / (index + 1.0f);

    // Clamp this thread's global element down to the prefix average.
    if (array[index] > average) { array[index] = average; }

    // Deliberately pointless write: shared memory is discarded when the
    // block exits, so this value is never observable. (Float literal
    // 3.14f, not 3.14 — avoids a silent double-precision round trip.)
    sh_arr[index] = 3.14f;
}
// Demonstrates local (register) vs. global memory accesses in a kernel.
// Each thread reads and writes several elements of the global array `g`;
// overlapping indices across threads mean the final contents are
// order-dependent (this is a memory-space demo, not a correctness demo).
//
// Precondition: g must hold at least 2 * blockDim.x elements (g[i*2] and
// g[BLOCK_WIDTH/2 + i] are touched) — TODO confirm against the launch site.
// NOTE(review): BLOCK_WIDTH is #defined *below* this function in the file;
// the macro definition must precede this point for the file to compile.
__global__ void foo(float *g)
{
    float a = 3.14f;           // local variable, lives in a register
    int i = threadIdx.x;

    g[i] = a;                  // store to global memory
    g[i*2] = a;                // a second, overlapping store
    a = g[i];                  // load from global memory into a register
    a = g[BLOCK_WIDTH/2 + i];  // strided load
    g[i] = a * g[BLOCK_WIDTH/2 + i];
    g[BLOCK_WIDTH-1 - i] = a;  // fixed typo: was BLOCK_wIDTH (undefined)
}
#include <stdio.h>
#include "gputimer.h"
#define NUM_THREADS 1000000
#define ARRAY_SIZE 10
#define BLOCK_WIDTH 1000
// Forward declaration of the host-side debug printer (defined elsewhere
// or below the visible chunk). Fixed: the original line was missing its
// terminating semicolon, which fused it onto the next kernel definition.
void print_array(int *array, int size);
// Each launched thread bumps one slot of g. With far more threads than
// ARRAY_SIZE slots, many threads map onto the same slot; the read-modify-
// write below is NOT atomic, so threads race and the result is
// nondeterministic — "naive" on purpose (the atomics demo's foil).
__global__ void increment_naive(int *g)
{
    // Flat global thread id, folded onto [0, ARRAY_SIZE).
    int tid  = blockIdx.x * blockDim.x + threadIdx.x;
    int slot = tid % ARRAY_SIZE;
    g[slot] += 1;  // racy increment (intentionally not atomicAdd)
}
// Host driver: reports the launch configuration for the increment demo.
// (Only the banner is visible in this chunk; the timer is declared but
// not started here — presumably used further down. TODO confirm.)
int main(int argc, char **argv)
{
    GpuTimer timer;

    const int num_blocks = NUM_THREADS / BLOCK_WIDTH;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, num_blocks, ARRAY_SIZE);

    return 0;
}