N objects × (N − 1) force pairs per object ⇒ O(N²) brute force
O(N log N): Tree method (Barnes–Hut)
O(N): Fast Multipole Method
stream
// Overlap copy and compute using non-default streams.
// NOTE(review): the original note created s1/s2 but never used them, called a
// nonexistent `cudaMemory` with made-up enums (cudaH2D/cudaD2H), and passed
// &d_arr/&h_arr (pointer-to-pointer) instead of the pointers themselves.
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
// Async copies require pinned host memory (cudaMallocHost) to actually overlap.
cudaMemcpyAsync(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice, s1);
A<<<1, 128, 0, s1>>>(d_arr);                     // kernel queued on s1, after the H2D copy
cudaMemcpyAsync(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost, s1);
cudaStreamSynchronize(s1);                       // block host until s1's work is done
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
APOD
– measure & improve memory bandwidth
– assure sufficient occupancy
– minimize thread divergence
– within warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sin(), __cos(), etc
– use double precision on purpose
Measuring Memory
// Three transpose variants over an N x N row-major matrix: out(j,i) = in(i,j).
// Expects N and tile size K as compile-time constants.

// One thread does the whole transpose — baseline only.
__global__ void transpose_serial(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];
}

// One thread per row; launch <<<1, N>>>.
__global__ void transpose_parallel_per_row(float in[], float out[])
{
    int i = threadIdx.x;                 // row handled by this thread
    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}

// One thread per element; launch <<<dim3(N/K, N/K), dim3(K, K)>>>.
// Fix: the second parameter was declared as an unnamed `float[]`
// while the body used `out` — that did not compile.
__global__ void transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * K + threadIdx.x;   // global column of this thread
    int j = blockIdx.y * K + threadIdx.y;   // global row of this thread
    out[j + i*N] = in[i + j*N];
}
transpose code
#include <stdio.h>
#include "gputimer.h"

const int N = 1024;   // matrix is N x N, row-major

// CPU reference transpose: out(j,i) = in(i,j).
// NOTE(review): the note's inner loop was truncated mid-statement;
// the body is reconstructed from the identical loop in transpose_serial.
void transpose_CPU(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];
}
APOD
Systematic Optimization
Analyze -> parallelize -> optimize -> deploy
analyze: profile whole application
– where can it benefit?
– by how much
Parallelize: pick an approach
Optimize: Profile-driven optimize
Deploy: don’t optimize in a vacuum!
serial implementation
#include <stdio.h>

// Serial exclusive prefix sum (scan) over a fixed 10-element array:
// out[i] = sum of element[0..i-1], so out = {0, 1, 3, 6, 10, 15, 21, 28, 36, 45}.
int main(int argc, char **argv)
{
    const int ARRAY_SIZE = 10;
    int acc = 0;                          // running sum of elements seen so far
    int out[ARRAY_SIZE];
    int element[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    for (int i = 0; i < ARRAY_SIZE; i++) {
        out[i] = acc;                     // exclusive: write BEFORE adding element[i]
        acc = acc + element[i];
    }
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%i ", out[i]);            // separator added; "%i%i..." ran digits together
    }
    printf("\n");
    return 0;
}
// Reverse the low `bits` bits of w.
// NOTE(review): the original header line was lost in extraction — the signature
// below is reconstructed from the body; confirm name/qualifiers against the course code.
__host__ __device__ int reverse_bits(int w, int bits)
{
    int r = 0;
    for (int i = 0; i < bits; i++) {
        int bit = (w & (1 << i)) >> i;    // extract bit i
        r |= bit << (bits - i - 1);       // place it mirrored
    }
    return r;
}

// Deliberately WRONG histogram: d_bins[myBin]++ is a read-modify-write race
// when many threads hit the same bin — kept as the course's counterexample.
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    d_bins[myBin]++;                      // RACE: lost updates under contention
}

// Correct histogram using an atomic increment per item.
// Fixes: unnamed `const int` parameter, `d.in` typo, and the missing body.
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    atomicAdd(&(d_bins[myBin]), 1);       // serialized per bin, but correct
}
reducing
// Per-block tree reduction (sum) operating entirely in global memory.
// Destroys d_in: each block folds its blockDim.x elements in place, then
// thread 0 writes the block's partial sum to d_out[blockIdx.x].
// Assumes blockDim.x is a power of two and d_in has gridDim.x*blockDim.x elements.
// Fixes: `threadId.x` -> `threadIdx.x`, `d_id[myId]` -> `d_in[myId]`,
// and removed the stray `[/c]` forum-markup residue after the closing brace.
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;   // global index
    int tid  = threadIdx.x;                             // index within block

    // Halve the active stride each step: s = blockDim.x/2, /4, ..., 1.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // all adds at stride s finish before stride s/2
    }
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];                 // block's partial sum
    }
}
thread divergence
pre-loop code
…
for (int i=0; i &lt;= threadIdx.x; ++i)
{
...some loop code...
}
...
post-loop code
pre-loop, loop code, post-loop
fundamental GPU algorithm
-reduce, scan, histogram
Atomic memory
// Many threads (NUM_THREADS) increment a small array (ARRAY_SIZE slots),
// so every slot is hit by many threads concurrently.

// WRONG on purpose: g[i] = g[i] + 1 is a non-atomic read-modify-write,
// so concurrent increments to the same slot are lost.
__global__ void increment_naive(int *g)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;        // map millions of threads onto a few slots
    g[i] = g[i] + 1;           // RACE
}

// Correct: atomicAdd serializes the increments per address.
__global__ void increment_atomic(int *g)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;
    atomicAdd(&g[i], 1);
}

int main(int argc, char **argv)
{
    GpuTimer timer;            // fixed `PpuTimer` typo (class is in gputimer.h)
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    int array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);   // fixed `const_int` typo
    // NOTE(review): the note is truncated here — the original continues with
    // cudaMalloc/cudaMemset, a timed kernel launch, and a copy-back.
}
strategies for efficient cuda programming
1. high arithmetic intensity math/memory
-minimize time spent on memory
-put data in faster memory
local > shared > global
-use coalesced global memory access
shared memory
// Using different memory spaces in CUDA: this kernel demonstrates SHARED memory.
// One block of <= 128 threads; each thread i averages array[0..i] via a shared copy.
// NOTE(review): the note's version was garbled — it took `float in` but the body
// used an undefined `array`, and was labeled "local memory". Reconstructed to the
// shared-memory demo the body clearly implements; confirm the name against callers.
__global__ void use_shared_memory_GPU(float *array)
{
    int i, index = threadIdx.x;
    float average, sum = 0.0f;

    // Visible to all threads of the block; lifetime = lifetime of the block.
    __shared__ float sh_arr[128];

    sh_arr[index] = array[index];    // stage global data into fast shared memory
    __syncthreads();                 // all writes done before anyone reads

    // Average of elements 0..index, read from shared memory.
    for (i = 0; i < index; i++) { sum += sh_arr[i]; }
    average = sum / (index + 1.0f);

    // Clamp this thread's global element down to that running average.
    if (array[index] > average) { array[index] = average; }

    // Has NO observable effect: shared memory evaporates when the block exits.
    sh_arr[index] = 3.14;
}
// Global-memory access-pattern demo: which of these reads/writes coalesce?
// Assumes a single block of BLOCK_WIDTH threads.
// Fix: `BLOCK_wIDTH` typo (undefined identifier) -> BLOCK_WIDTH.
__global__ void foo(float *g)
{
    float a = 3.14f;                     // float literal; plain 3.14 is a double
    int i = threadIdx.x;

    g[i] = a;                            // coalesced: adjacent threads, adjacent addresses
    g[i*2] = a;                          // strided by 2: half the bandwidth wasted
    a = g[i];                            // coalesced read
    a = g[BLOCK_WIDTH/2 + i];            // coalesced read at an offset
    g[i] = a * g[BLOCK_WIDTH/2 + i];     // coalesced read + coalesced write
    g[BLOCK_WIDTH-1 - i] = a;            // reversed but still one contiguous segment
}
#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  10
#define BLOCK_WIDTH 1000

// Forward declaration — the note was missing the terminating ';',
// which made the following __global__ definition a parse error.
void print_array(int *array, int size);

// WRONG on purpose: non-atomic read-modify-write loses concurrent increments.
__global__ void increment_naive(int *g)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i % ARRAY_SIZE;    // funnel NUM_THREADS threads onto ARRAY_SIZE slots
    g[i] = g[i] + 1;       // RACE
}

int main(int argc, char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    // NOTE(review): truncated in the note — the original continues with
    // device allocation, a timed launch, copy-back, and print_array().
}