Streams

// Streams example: copy input to device, run kernel A, copy result back.
// (All three calls below use the default stream; s1/s2 are created but not
// yet used in this fragment.)
cudaStream_t s1, s2;
cudaStreamCreate(&s1); cudaStreamCreate(&s2);

// Fixed API names: the copy function is cudaMemcpy (not "cudaMemory") and the
// direction enums are cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost.
// Pass the pointers themselves, not their addresses ("&h_ahh" was a typo for h_arr).
cudaMemcpy(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice);
A<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost);

APOD
– measure & improve memory bandwidth
– assure sufficient occupancy
– minimize thread divergence
– within warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sinf(), __cosf(), etc.
– use double precision on purpose

Measuring Memory

// Transposes an N x N row-major matrix using a single thread.
// Launch as transpose_serial<<<1,1>>>(...); useful only as a timing baseline.
__global__ void
transpose_serial(float in[], float out[])
{
	for (int row = 0; row < N; row++) {
		for (int col = 0; col < N; col++) {
			out[row + col * N] = in[col + row * N];
		}
	}
}

// Transposes an N x N matrix with one thread per row: thread `row` walks
// row `row` of `in` and scatters it into column `row` of `out`.
// Launch as transpose_parallel_per_row<<<1, N>>>(...).
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	const int row = threadIdx.x;

	for (int col = 0; col < N; col++) {
		out[col + row * N] = in[row + col * N];
	}
}

// Transposes an N x N matrix with one thread per element, tiled K x K:
// launch with grid = dim3(N/K, N/K), block = dim3(K, K).
// Fix: the second parameter had lost its name ("float[]") although the body
// writes through `out` — restored as `float out[]`.
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	out[j + i*N] = in[i + j*N];
}

APOD

Systematic Optimization

Analyze -> parallelize -> optimize -> deploy

analyze: profile whole application
– where can it benefit?
– by how much
Parallelize: Pick an approach
Optimize: Profile-driven optimize
Deploy: Don’t optimize in a vacuum!

serial implementation

#include <stdio.h>

// Serial exclusive prefix-sum (scan) of `element` into `out`:
// out[i] = element[0] + ... + element[i-1], with out[0] == 0.
// This is the CPU reference the parallel scan lessons build on.
int main(int argc, char **argv)
{
	const int ARRAY_SIZE = 10;

	int acc = 0;
	int out[ARRAY_SIZE];
	int element[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

	for (int i = 0; i < ARRAY_SIZE; i++) {
		out[i] = acc;           // store the running total BEFORE adding -> exclusive scan
		acc = acc + element[i];
	}

	for (int i = 0; i < ARRAY_SIZE; i++) {
		printf("%i ", out[i]);  // was "%i" with no separator: digits ran together
	}
	printf("\n");

	return 0;
}
/* Reverses the low `bits` bits of `w` and returns the result,
 * e.g. bit_reverse(0x1, 4) == 0x8.
 * NOTE(review): the function's signature line was lost when these notes were
 * pasted — the body reads `w` and `bits` and returns an integer, so this
 * header is reconstructed accordingly; confirm the original name.
 */
unsigned int bit_reverse(unsigned int w, int bits)
{
	int r = 0;
	for (int i = 0; i < bits; i++)
	{
		int bit = (w & (1 << i)) >> i;  /* extract bit i of w */
		r |= bit << (bits - i - 1);     /* deposit it at the mirrored position */
	}
	return r;
}
// Histogram where each thread bins exactly one input element.
// NOTE(review): d_bins[myBin]++ is an unguarded read-modify-write, so threads
// that hit the same bin concurrently lose increments. This is the deliberately
// broken "naive" baseline; the atomic version follows as simple_histo.
// Assumes the launch exactly covers d_in (no bounds check on myId) — confirm
// at the call site.
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int myItem = d_in[myId];
	int myBin = myItem % BIN_COUNT;
	d_bins[myBin]++;	// data race: load, add, store are separate steps
}
// Histogram with atomicAdd, so concurrent increments to the same bin are not
// lost (contrast naive_histo above).
// Fixes vs. the note as transcribed: the third parameter had lost its name
// (BIN_COUNT), "d.in" was a typo for d_in, and the binning + atomic update
// lines were missing entirely.
// Assumes the launch exactly covers d_in (no bounds check on myId).
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int myItem = d_in[myId];
	int myBin = myItem % BIN_COUNT;
	atomicAdd(&(d_bins[myBin]), 1);
}

reducing

// Tree reduction (sum) entirely in global memory: each block sums its
// blockDim.x-element slice of d_in in-place, halving the active threads each
// step, then thread 0 writes the block's partial sum to d_out[blockIdx.x].
// Requires blockDim.x to be a power of two; d_in is DESTROYED (used as scratch).
// Fixes vs. the note as transcribed: "threadId.x" -> threadIdx.x,
// "d_id" -> d_in, body un-squashed from one line, stray "[/c]" markup removed.
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int tid = threadIdx.x;

	for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
	{
		if (tid < s)
		{
			d_in[myId] += d_in[myId + s];
		}
		__syncthreads();	// all threads reach this (it is outside the if)
	}

	// Only thread 0 holds the block total after the loop.
	if (tid == 0)
	{
		d_out[blockIdx.x] = d_in[myId];
	}
}

Atomic memory

// Deliberately racy counter increment: many threads read-modify-write the
// same g[i] without atomics, so most increments are lost. Kept as the broken
// baseline to contrast with increment_atomic below.
__global__ void increment_naive(int *g)
{
	// which thread is this?
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	// fold all threads onto ARRAY_SIZE counters -> heavy contention by design
	i = i % ARRAY_SIZE;
	g[i] = g[i] + 1;	// race: load, add, store are separate instructions
}

// Correct counterpart of increment_naive: atomicAdd makes each
// read-modify-write indivisible, so no increments are lost even though all
// threads contend on only ARRAY_SIZE addresses.
__global__ void increment_atomic(int *g)
{
	// Fold this thread's global id onto one of the ARRAY_SIZE counters.
	int idx = (blockIdx.x * blockDim.x + threadIdx.x) % ARRAY_SIZE;

	atomicAdd(&g[idx], 1);
}

int main(int argc, char **argv)
{
	PpuTimer timer;
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

	int array[ARRAY_SIZE];
	const_int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
}

strategies for efficient cuda programming
1. high arithmetic intensity math/memory
-minimize time spent on memory
-put data in faster memory
local > shared > global
-use coalesced global memory access

shared memory

// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
//
// Demonstrates __shared__ memory: each thread stages one element of the
// device-global input into a block-shared array, then averages the shared
// copies of elements [0..index-1].
// Fixes vs. the note as transcribed: the body indexes `array`, so the
// parameter must be `float *array` (it was "float in"); the name is restored
// to use_shared_memory_GPU since the body is the shared-memory demo, not the
// local-memory one — confirm against the original notes; HTML-escaped
// brackets restored; 3.14 written as a float literal.
// Assumes a 1-D launch with blockDim.x <= 128 (size of sh_arr).
__global__ void use_shared_memory_GPU(float *array)
{
	int i, index = threadIdx.x;
	float average, sum = 0.0f;

	// Shared: one copy per block, visible to all its threads, block lifetime.
	__shared__ float sh_arr[128];
	sh_arr[index] = array[index];

	__syncthreads();	// all staging writes must land before any thread reads

	for (i = 0; i < index; i++) { sum += sh_arr[i]; }
	average = sum / (index + 1.0f);
	if (array[index] > average) { array[index] = average; }

	// Pointless write: sh_arr vanishes when the block exits — kept from the
	// original notes to illustrate shared-memory lifetime.
	sh_arr[index] = 3.14f;
}
// Global-memory access pattern demo: which of these warp-wide accesses
// coalesce into few transactions and which stride apart.
// Fix: "BLOCK_wIDTH" -> BLOCK_WIDTH (the typo'd name would not compile);
// 3.14 written as a float literal to avoid a double intermediate.
__global__ void foo(float *g)
{
	float a = 3.14f;
	int i = threadIdx.x;

	g[i] = a;				// adjacent threads -> adjacent addresses
	g[i*2] = a;				// stride-2: addresses spread twice as far apart
	a = g[i];				// adjacent read
	a = g[BLOCK_WIDTH/2 + i];		// adjacent read at a fixed offset
	g[i] = a * g[BLOCK_WIDTH/2 + i];	// adjacent read plus adjacent write
	g[BLOCK_WIDTH-1 - i] = a;		// reversed order across the warp
}
#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE 10

#define BLOCK_WIDTH 1000

// Forward declaration (definition not shown in this excerpt); the transcribed
// note was missing the terminating semicolon.
void print_array(int *array, int size);

// Deliberately racy counter increment (duplicate of the version earlier in
// these notes): threads read-modify-write g[i] without atomics, so
// concurrent increments to the same element are lost.
__global__ void increment_naive(int *g)
{
	// which thread is this?
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	// fold all threads onto ARRAY_SIZE counters -> heavy contention by design
	i = i % ARRAY_SIZE;
	g[i] = g[i] +1;	// race: load, add, store are separate instructions
}
// Host driver fragment: reports the launch shape. In the complete course code
// this also allocates the array, launches the increment kernels under the
// timer, and prints results — those lines are missing from this excerpt.
int main(int argc, char **argv)
{
	GpuTimer timer;	// presumably the CUDA-event timer from gputimer.h — unused in this fragment
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
}