Time sort in Trust

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>

#include "gputimer.h"

int main(void)
{
	// generate N random numbers serially
	int N = 1000000;
	thrust::host_vector<float> h_vec(N);
	std::generate(h_vec.begin(), h_vec.end(), rand);

	// transfert data to the device
	thrust::device_vector<float> d_vec = h_vec;

	// sort data on the device (846M keys per second on GeForce GTX 480)
	GpuTimer timer;
	timer.Start();
	thrust::sort(d_vec.begin(), d_vec.end());
	timer.Stop();

	// transfer data back to host
	thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

	printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
	return 0;
}

Quadratic GPU vs Serial CPU

N2 GPU: N2 work visit every edge many times but only sets depth once
CPU: N work maintains frontier to minimize visits / node

int N == << 20;
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void*)&d_y);

cublasSetVector(N, sizeof(x[0]), x, y, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);

saxpy(N, 2.0, d_x, 1, y, 1);

cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);

cublasFree(d_x);
cublasFree(d_y);
cublasShutdown(),

Do we have warry about race condition

while (!h_done){
	bfs(edges, vertices)
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaDeviceToHost);
}

while(!h_done){
	cudaMemcpy(&d_done, &h_true, sizeof(bool), cudaHostToDevice);
	bfs(edges, vertices)
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaDeviceToHost);
}

	if((vfirst != -1) && (vsecond == -1)){
		vertices[vsecond] = vfirst + 1;
		done = false;
	}
	if ((vfirst == -1) && (vsecond != -1)){
		vertices[vfirst] = vsecond + 1;
		done = false;
	}

map operation

__global__ void
bfs( const Edge * edges,
	Vertex * vertices,
	int current_depth )
{
	int e = blockDim.x * blockIdx.x + threadIdx.x;
	int vfirst = edges[e].first;
	int dfirst = vertices[vfirst];
	int vsecond = edges[e].second;
	int dsecond = vertices[vsecond];
	if ((dfirst == curent_depth) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
	}
	if (
		)
}

The BFS code

__global__ void
initialize_vertices( Vertex * vertices,
					int starting_vertex,
					int num_vertices )
{
	int v = blockDim.x * blockIdx.x + threadIdx.x;
	if (v == starting_vertex) vertices[v] = 0 else vertices[v] = -1;
}

Thread per row

__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x +
	threadIdx.x;
	if (row < num_rows){
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start;
			jj < row_end ; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
	)

Using on P thread

__device__ float3
title_calculation(Params myPrams, float3 force){
	int i;
	extern __shared__ Params[] sourceParams;
	for (i = 0; i < blockDim.x; i++){
		force += bodyBodyInteraction(myParams, sourceParams[i]);
	}
	return force;
}

stream

cudaStream_t s1, s2;
cudaStreamCreate(&s1); cudaStreamCreate(&s2);

cudaMemory(&d_arr, &h_arr, numbytes, cudaH2D);
A<<<1, 128>>>(d_arr);
cudaMemcpy(&h_ahh, &d_arr, numbytes, cudaD2H);

APOD
– measure & improve memory bandwidth
– assure sufficient occupacy
– minimize thread divergence
– within warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sin(), __cos(), etc
– use double prcision on purpose

Measuring Memory

__global__ void
transpose_serial(float in[], float out[])
{
	for(int j=0; j < N; j++)
		for(int i=0; i < N; i++)
			out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	int i = threadIdx.x;

	for(int j=0; j < N; j++)
		out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_element(float in[], float[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	out[j + i*N] = in[i + j*N];
}