Do we have warry about race condition

while (!h_done){
	bfs(edges, vertices)
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaDeviceToHost);
}

while(!h_done){
	cudaMemcpy(&d_done, &h_true, sizeof(bool), cudaHostToDevice);
	bfs(edges, vertices)
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaDeviceToHost);
}

	if((vfirst != -1) && (vsecond == -1)){
		vertices[vsecond] = vfirst + 1;
		done = false;
	}
	if ((vfirst == -1) && (vsecond != -1)){
		vertices[vfirst] = vsecond + 1;
		done = false;
	}

map operation

__global__ void
bfs( const Edge * edges,
	Vertex * vertices,
	int current_depth )
{
	int e = blockDim.x * blockIdx.x + threadIdx.x;
	int vfirst = edges[e].first;
	int dfirst = vertices[vfirst];
	int vsecond = edges[e].second;
	int dsecond = vertices[vsecond];
	if ((dfirst == curent_depth) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
	}
	if (
		)
}

The BFS code

__global__ void
initialize_vertices( Vertex * vertices,
					int starting_vertex,
					int num_vertices )
{
	int v = blockDim.x * blockIdx.x + threadIdx.x;
	if (v == starting_vertex) vertices[v] = 0 else vertices[v] = -1;
}

Thread per row

__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x +
	threadIdx.x;
	if (row < num_rows){
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start;
			jj < row_end ; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
	)

Using on P thread

__device__ float3
title_calculation(Params myPrams, float3 force){
	int i;
	extern __shared__ Params[] sourceParams;
	for (i = 0; i < blockDim.x; i++){
		force += bodyBodyInteraction(myParams, sourceParams[i]);
	}
	return force;
}

stream

cudaStream_t s1, s2;
cudaStreamCreate(&s1); cudaStreamCreate(&s2);

cudaMemory(&d_arr, &h_arr, numbytes, cudaH2D);
A<<<1, 128>>>(d_arr);
cudaMemcpy(&h_ahh, &d_arr, numbytes, cudaD2H);

APOD
– measure & improve memory bandwidth
– assure sufficient occupacy
– minimize thread divergence
– within warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sin(), __cos(), etc
– use double prcision on purpose

Measuring Memory

__global__ void
transpose_serial(float in[], float out[])
{
	for(int j=0; j < N; j++)
		for(int i=0; i < N; i++)
			out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	int i = threadIdx.x;

	for(int j=0; j < N; j++)
		out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_element(float in[], float[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	out[j + i*N] = in[i + j*N];
}

APOD

Systematic Optimization

Analyze -> parallelize -> optimize -> deploy

analyze: profile whole application
– where can it benefit?
– by how much
Parallize: Pick an approach
Optimize: Profile-driven optimize
Deploy:Don’t optimize in vaccum!