Thread per row

__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x +
	threadIdx.x;
	if (row < num_rows){
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start;
			jj < row_end ; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
	)

Using on P thread

__device__ float3
title_calculation(Params myPrams, float3 force){
	int i;
	extern __shared__ Params[] sourceParams;
	for (i = 0; i < blockDim.x; i++){
		force += bodyBodyInteraction(myParams, sourceParams[i]);
	}
	return force;
}

stream

cudaStream_t s1, s2;
cudaStreamCreate(&s1); cudaStreamCreate(&s2);

cudaMemory(&d_arr, &h_arr, numbytes, cudaH2D);
A<<<1, 128>>>(d_arr);
cudaMemcpy(&h_ahh, &d_arr, numbytes, cudaD2H);

APOD
– measure & improve memory bandwidth
– assure sufficient occupacy
– minimize thread divergence
– within warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sin(), __cos(), etc
– use double prcision on purpose

Measuring Memory

__global__ void
transpose_serial(float in[], float out[])
{
	for(int j=0; j < N; j++)
		for(int i=0; i < N; i++)
			out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	int i = threadIdx.x;

	for(int j=0; j < N; j++)
		out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_element(float in[], float[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	out[j + i*N] = in[i + j*N];
}

APOD

Systematic Optimization

Analyze -> parallelize -> optimize -> deploy

analyze: profile whole application
– where can it benefit?
– by how much
Parallize: Pick an approach
Optimize: Profile-driven optimize
Deploy:Don’t optimize in vaccum!

serial implementation

#include 

int main(int argc, char **argv)
{
	const int ARRAY_SIZE = 10;

	int acc = 0;
	int out[ARRAY_SIZE];
	int element[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

	for(int i = 0; i < ARRAY_SIZE; i++){
		out[i] = acc;
		acc = acc + element[i];
	}

	for(int i = 0; i < ARRAY_SIZE; i++){
		printf("%i", out[i]);
	}

	return 0;
}
{
	int r = 0;
	for (int i = 0; i < bits; i++)
	{
		int bit = (w & (1 << i)) >> i;
		r |= bit << (bits - i - 1);
	}
	return r;
}
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int myItem = d_in[myId];
	int myBin = myItem % BIN_COUNT;
	d_bins[myBin]++;
}
__global__ void simple_histo(int *d_bins, const int *d_in, const int)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int myItem = d.in[myId];
}

reducing


__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
int myId = threadId.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;

for (unsigned int s = blockDim.x / 2; s>0; s >>= 1)
{
if (tid < s) { d_in[myId] += d_in[myId + s]; } __syncthreads(); } if (tid == 0) { d_out[blockIdx.x] = d_id[myId]; } }[/c]