#include <stdio.h>
#include "gputimer.h"

const int N = 1024;

// serial CPU transpose: out(j,i) = in(i,j)
void transpose_CPU(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];
}
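A GPU version of the same one-to-one transpose, as a minimal sketch (the kernel name, the tile size K, and the launch configuration are illustrative, not from these notes): one thread per element, arranged in K×K blocks covering K×K tiles of the matrix.

// one thread per output element; launch with dim3 blocks(N/K, N/K), dim3 threads(K, K)
__global__ void transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // column of the input
    int j = blockIdx.y * blockDim.y + threadIdx.y;   // row of the input
    out[j + i*N] = in[i + j*N];
}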
APOD
Systematic Optimization
Analyze -> Parallelize -> Optimize -> Deploy
Analyze: profile the whole application
– where can it benefit?
– by how much?
Parallelize: pick an approach
Optimize: profile-driven optimization
Deploy: don't optimize in a vacuum!
serial implementation
#include <stdio.h>

int main(int argc, char **argv)
{
    const int ARRAY_SIZE = 10;
    int acc = 0;
    int out[ARRAY_SIZE];
    int element[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

    // exclusive running sum: out[i] holds the sum of element[0..i-1]
    for (int i = 0; i < ARRAY_SIZE; i++) {
        out[i] = acc;
        acc = acc + element[i];
    }
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%i ", out[i]);
    }
    printf("\n");
    return 0;
}
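The loop above is an exclusive scan (prefix sum), and every iteration depends on the previous one, so it cannot be parallelized as written. A minimal sketch of a single-block Hillis-Steele scan in CUDA (inclusive rather than exclusive; the kernel name and launch parameters are mine, not from these notes):

__global__ void hillis_steele_scan(int *d_out, const int *d_in, int n)
{
    extern __shared__ int temp[];          // one int per thread, sized at launch
    int tid = threadIdx.x;
    temp[tid] = d_in[tid];
    __syncthreads();

    // at each step, every element adds in the value "offset" positions to its left
    for (int offset = 1; offset < n; offset <<= 1)
    {
        int addend = (tid >= offset) ? temp[tid - offset] : 0;
        __syncthreads();                   // finish all reads before any writes
        temp[tid] += addend;
        __syncthreads();                   // finish all writes before the next step
    }
    d_out[tid] = temp[tid];
}

// launch for n elements in a single block of n threads:
// hillis_steele_scan<<<1, n, n * sizeof(int)>>>(d_out, d_in, n);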
// reverse the low "bits" bits of w
int bit_reverse(int w, int bits)
{
    int r = 0;
    for (int i = 0; i < bits; i++)
    {
        int bit = (w & (1 << i)) >> i;
        r |= bit << (bits - i - 1);
    }
    return r;
}
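A hedged example of how a bit-reversal like this is typically used here: reversing the bits of the index produces a deterministic but scrambled input array for the histogram kernels below (the array size and bit count are assumptions, not from these notes).

const int ARRAY_SIZE = 65536;              // assumed power of two, so the reverse is a permutation
const int ARRAY_BITS = 16;                 // log2(ARRAY_SIZE)
int h_in[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
    h_in[i] = bit_reverse(i, ARRAY_BITS);  // scrambled histogram input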
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    d_bins[myBin]++;   // unsynchronized read-modify-write: concurrent threads lose increments
}
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    atomicAdd(&(d_bins[myBin]), 1);   // atomic increment avoids the race above
}
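A possible host-side wrapper for the atomic version (the helper name, the block size of 64, and the assumption that the input length divides evenly are mine):

void run_histogram(int *d_bins, const int *d_in, int array_size, int bin_count)
{
    cudaMemset(d_bins, 0, bin_count * sizeof(int));                  // bins must start at zero
    simple_histo<<<array_size / 64, 64>>>(d_bins, d_in, bin_count);  // one thread per input element
    cudaDeviceSynchronize();
}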
reducing
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // tree-based reduction in global memory: halve the active range each step
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();   // make sure all adds at this stage are done
    }
    // only thread 0 writes this block's partial sum back to global memory
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}
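The kernel above does every add in global memory. A sketch of the shared-memory variant ("put data in faster memory"): each block first copies its elements into shared memory, reduces there, and writes one partial sum back; launch it with blockDim.x * sizeof(float) bytes of dynamic shared memory.

__global__ void shmem_reduce_kernel(float *d_out, const float *d_in)
{
    extern __shared__ float sdata[];                  // sized at launch: blockDim.x floats
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    sdata[tid] = d_in[myId];                          // one element per thread into shared memory
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}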
thread divergence
pre-loop code
…
for (int i = 0; i <= threadIdx.x; ++i)
{
    ...some loop code...
}
...
post-loop code
Threads in a warp execute the pre-loop code together, diverge inside the loop (thread k runs k+1 iterations), and only reconverge for the post-loop code once the slowest thread in the warp has finished its last iteration.
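Branches diverge the same way as loops. A small illustrative kernel (names are mine): within one warp, half the threads take each side of the branch, so the hardware serializes the two sides instead of running them in parallel.

__global__ void divergent_branch(float *out)
{
    int i = threadIdx.x;
    if (i % 2 == 0)
        out[i] = sinf((float)i);   // even threads take this path
    else
        out[i] = cosf((float)i);   // odd threads take this one; the warp runs the two paths serially
}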
Fundamental GPU algorithms
– reduce, scan, histogram
Atomic memory
__global__ void increment_naive(int *g)
{
    // each thread maps onto one of ARRAY_SIZE array elements
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;
    g[i] = g[i] + 1;   // unsynchronized read-modify-write: concurrent increments get lost
}
__global__ void increment_atomic(int *g)
{
    // same mapping, but the read-modify-write is done atomically
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;
    atomicAdd(&g[i], 1);
}
int main(int argc, char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
    int *d_array;                                  // device copy, zeroed before the kernel runs
    cudaMalloc((void **)&d_array, ARRAY_BYTES);
    cudaMemset(d_array, 0, ARRAY_BYTES);
    timer.Start();
    increment_atomic<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    printf("Time elapsed = %g ms\n", timer.Elapsed());
    cudaFree(d_array);
    return 0;
}
Strategies for efficient CUDA programming
1. Maximize arithmetic intensity = math / memory
– minimize time spent on memory accesses
– put data in faster memory: local > shared > global (fastest to slowest)
– use coalesced global memory access
shared memory
// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
__global__ void use_shared_memory_GPU(float *array)
{
    // local variables, private to each thread
    int i, index = threadIdx.x;
    float average, sum = 0.0f;

    // __shared__ variables are visible to every thread in the block
    // and live only as long as the thread block
    __shared__ float sh_arr[128];

    // each thread copies one element from global memory into shared memory
    sh_arr[index] = array[index];
    __syncthreads();   // make sure every write to sh_arr has finished

    // average of the elements before this thread's index
    for (i = 0; i < index; i++) { sum += sh_arr[i]; }
    average = sum / (index + 1.0f);

    // array[] is in global memory, so this change is visible to the host
    if (array[index] > average) { array[index] = average; }

    // this write has no lasting effect: shared memory disappears with the block
    sh_arr[index] = 3.14;
}
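A possible host-side launch for the kernel above (assuming d_arr points to 128 floats of device memory, filled with real data first; one block of 128 threads so every thread owns one slot of sh_arr):

float h_arr[128];                                     // host data to be averaged in place
float *d_arr;
cudaMalloc((void **)&d_arr, sizeof(float) * 128);
cudaMemcpy(d_arr, h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
use_shared_memory_GPU<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
cudaFree(d_arr);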
__global__ void foo(float *g)
{
    float a = 3.14;
    int i = threadIdx.x;

    g[i] = a;                          // coalesced: adjacent threads write adjacent locations
    g[i*2] = a;                        // strided: adjacent threads are 2 elements apart, not coalesced
    a = g[i];                          // coalesced read
    a = g[BLOCK_WIDTH/2 + i];          // coalesced: contiguous, just offset
    g[i] = a * g[BLOCK_WIDTH/2 + i];   // coalesced read and coalesced write
    g[BLOCK_WIDTH-1 - i] = a;          // reversed order, but still one contiguous chunk, so it coalesces
}
#include <stdio.h>
#include "gputimer.h"
#define NUM_THREADS 1000000
#define ARRAY_SIZE 10
#define BLOCK_WIDTH 1000
void print_array(int *array, int size)
{
    printf("{ ");
    for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
    printf("}\n");
}
__global__ void increment_naive(int *g)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;
    g[i] = g[i] + 1;   // racy increment, for comparison with increment_atomic
}
int main(int argc, char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
    int *d_array;
    cudaMalloc((void **)&d_array, ARRAY_BYTES);
    cudaMemset(d_array, 0, ARRAY_BYTES);
    timer.Start();
    increment_naive<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    print_array(h_array, ARRAY_SIZE);
    printf("Time elapsed = %g ms\n", timer.Elapsed());
    cudaFree(d_array);
    return 0;
}
Global memory
// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
__global__ void use_local_memory_GPU(float in)
{
    float f;   // "f" and the parameter "in" live in local memory, private to each thread
    f = in;
}

int main(int argc, char **argv)
{
    // local memory: pass a value by copy; each thread gets its own private version
    use_local_memory_GPU<<<1, 128>>>(2.0f);

    // global memory: h_ prefix = host array, d_ prefix = device array in GPU global memory
    float h_arr[128];
    float *d_arr;
    cudaMalloc((void **)&d_arr, sizeof(float) * 128);
    cudaMemcpy(d_arr, h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
    // ... launch a kernel that takes d_arr here, then copy the result back ...
    cudaMemcpy(h_arr, d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
    cudaFree(d_arr);
    return 0;
}
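The main above allocates d_arr but the snippet stops before the global-memory kernel itself. A minimal sketch of what such a kernel looks like (the name use_global_memory_GPU and its body are illustrative, not code from these notes):

// "array" is a pointer into global memory on the device
__global__ void use_global_memory_GPU(float *array)
{
    array[threadIdx.x] = 2.0f * (float) threadIdx.x;
}

// launched in place of the "..." comment in main:
// use_global_memory_GPU<<<1, 128>>>(d_arr);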
The need for barriers
int idx = threadIdx.x;
__shared__ int array[128];
array[idx] = threadIdx.x;
if (idx < 127)
    array[idx] = array[idx + 1];
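Without a barrier, thread idx may read array[idx + 1] before the neighboring thread has written it. A sketch of the same code with the barriers it needs (barriers are placed outside the branch so that every thread in the block reaches them; the temporary separates reading the old neighbor value from writing the new one):

int idx = threadIdx.x;
__shared__ int array[128];
array[idx] = threadIdx.x;
__syncthreads();                                        // all writes done before any neighbor reads
int temp = (idx < 127) ? array[idx + 1] : array[idx];   // thread 127 just keeps its own value
__syncthreads();                                        // all reads done before any writes
array[idx] = temp;
__syncthreads();                                        // all shifts done before later code uses array[]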
Thread, thread block
CUDA defines a hierarchy of
– computation (thread, thread block, kernel)
– memory spaces (local, shared, global)
– synchronization (barriers such as __syncthreads)
Writing Efficient Programs
High-level strategy
1. Maximize arithmetic intensity = math / memory
Parallel communication patterns
map: one-to-one (see the sketch after this list)
transpose: one-to-one
gather: many-to-one
scatter: one-to-many
stencil: several-to-one
reduce: all-to-one
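A minimal map kernel as a sketch of the one-to-one pattern (the kernel name and the squaring operation are just an example): each thread reads exactly one input element and writes exactly one output element.

__global__ void square_map(float *d_out, const float *d_in)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    float f = d_in[i];
    d_out[i] = f * f;   // one input element -> one output element
}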
Summary of the programming model
kernels – C/C++ functions (e.g. kernel foo(), kernel bar())
thread blocks: groups of threads that cooperate to solve a (sub)problem
thread blocks are scheduled onto streaming multiprocessors (SMs)
CUDA makes few guarantees about when and where thread blocks will run.
Advantages
– hardware can run things efficiently
– no waiting on slowpokes
– scalability: from cell phones to supercomputers, from current to future GPUs
#include <stdio.h>
#define NUM_BLOCKS 16
#define BLOCK_WIDTH 1
__global__ void hello()
{
printf("Hello world! I'm a thread block %d\n", blockIdx.x);
}
int main(int argc, char **argv)
{
// launch the kernel
hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
cudaDeviceSynchronize();   // wait for the kernel to finish so its printf output appears
printf("That's all!\n");
return 0;
}