// Using different memory spaces in CUDA.
// A __device__ or __global__ function runs on the GPU.
//
// NOTE(review): despite the name, this kernel demonstrates __shared__
// (block-scoped) memory, not per-thread local memory; the name is kept to
// preserve the original entry point.
//
// Expects a single-block launch with blockDim.x <= 128; `array` must point
// to device memory with at least blockDim.x floats.
// Fix vs. original: the parameter was declared `float in` (unused) while the
// body indexed an undefined identifier `array` — the kernel did not compile.
__global__ void use_local_memory_GPU(float *array)
{
    int i, index = threadIdx.x;
    float average, sum = 0.0f;

    // Block-shared scratch buffer: one slot per thread, up to 128 threads.
    __shared__ float sh_arr[128];

    // Stage this thread's element into shared memory, then barrier so every
    // thread's write is visible before any thread reads neighboring slots.
    sh_arr[index] = array[index];
    __syncthreads();

    // Sum elements 0..index-1 from shared memory, then divide by (index+1).
    // NOTE(review): summing `index` elements but dividing by index+1 looks
    // like an off-by-one in the original teaching example — kept as-is to
    // preserve behavior; confirm intent before "fixing".
    for (i = 0; i < index; i++) {
        sum += sh_arr[i];
    }
    average = sum / (index + 1.0f);

    // Clamp this element down to the computed average when it exceeds it.
    if (array[index] > average) {
        array[index] = average;
    }

    // Dead store: sh_arr's contents vanish when the block exits, so this
    // write is never observable (retained from the original example).
    // `3.14f` instead of `3.14` avoids a double literal in a float kernel.
    sh_arr[index] = 3.14f;
}
// Demonstrates global-memory access patterns from a kernel.
// `g` must point to device memory with at least BLOCK_WIDTH elements (and
// at least 2*blockDim.x for the strided write); expects a 1D launch.
// Fix vs. original: `BLOCK_wIDTH` (typo, undefined identifier) corrected to
// `BLOCK_WIDTH`, and the double literal `3.14` made a float literal.
__global__ void foo(float *g)
{
    float a = 3.14f;
    int i = threadIdx.x;

    g[i] = a;                           // adjacent threads, adjacent words
    g[i * 2] = a;                       // stride-2 write: touches twice the cache lines
    a = g[i];                           // contiguous read
    a = g[BLOCK_WIDTH / 2 + i];         // contiguous read at a fixed offset
    g[i] = a * g[BLOCK_WIDTH / 2 + i];  // contiguous read + contiguous write
    g[BLOCK_WIDTH - 1 - i] = a;         // reversed order, same memory segment
}
#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  10
#define BLOCK_WIDTH 1000

// Forward declaration; definition expected elsewhere in this file.
// Fix vs. original: the prototype was missing its trailing ';', which made
// the following __global__ declaration a syntax error.
void print_array(int *array, int size);

// Naive (deliberately racy) increment: NUM_THREADS threads fold their global
// index into ARRAY_SIZE slots and do a non-atomic read-modify-write, so
// concurrent updates to the same slot are lost. Intentional for the lesson;
// an atomicAdd(&g[i], 1) would make it correct.
__global__ void increment_naive(int *g)
{
    // Flat 1D global thread index, wrapped onto the small output array.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;
    g[i] = g[i] + 1;  // non-atomic RMW: data race across threads
}

int main(int argc, char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    return 0;
}