#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include "gputimer.h"
int main(void)
{
// generate N random numbers serially
int N = 1000000;
thrust::host_vector<float> h_vec(N);
std::generate(h_vec.begin(), h_vec.end(), rand);
// transfer data to the device
thrust::device_vector<float> d_vec = h_vec;
// sort data on the device (846M keys per second on GeForce GTX 480)
GpuTimer timer;
timer.Start();
thrust::sort(d_vec.begin(), d_vec.end());
timer.Stop();
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
return 0;
}
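A minimal build line, assuming the file is saved as thrust_sort.cu and gputimer.h is on the include path:

nvcc -O2 thrust_sort.cu -o thrust_sort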
Quadratic GPU vs Serial CPU
GPU: O(N²) work; visits every edge on every pass, but sets each vertex's depth only once.
CPU: O(N) work; maintains a frontier to minimize visits per node.
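For contrast, a minimal serial frontier-based BFS sketch (a hypothetical adjacency-list representation, not from the lecture):

#include <vector>
#include <queue>

// Serial BFS: each vertex enters the frontier at most once,
// so total work is O(V + E).
std::vector<int> bfs_cpu(const std::vector<std::vector<int>>& adj, int start)
{
    std::vector<int> depth(adj.size(), -1);
    std::queue<int> frontier;
    depth[start] = 0;
    frontier.push(start);
    while (!frontier.empty()) {
        int v = frontier.front(); frontier.pop();
        for (int w : adj[v]) {
            if (depth[w] == -1) {         // visit each vertex only once
                depth[w] = depth[v] + 1;
                frontier.push(w);
            }
        }
    }
    return depth;
}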
int N = 1 << 20;
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);
cublasSaxpy(N, 2.0, d_x, 1, d_y, 1);
cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);
cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();
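For reference, what cublasSaxpy computes (y = a*x + y), written as a minimal hand-rolled kernel sketch (the names are illustrative, not from the lecture):

__global__ void saxpy_kernel(int n, float a, const float* x, float* y)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];    // y = a*x + y, one element per thread
}

// launch: saxpy_kernel<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);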
Do we have to worry about race conditions?
// Buggy: d_done is never reset to true, so after the first pass that
// changes a vertex, h_done stays false and the loop never terminates.
// (Launch configuration is illustrative.)
while (!h_done){
    bfs<<<num_blocks, num_threads>>>(edges, vertices);
    cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}
// Fixed: reset d_done to true before each pass; the kernel clears it
// whenever it updates a vertex.
while (!h_done){
    cudaMemcpy(d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
    bfs<<<num_blocks, num_threads>>>(edges, vertices);
    cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}
if ((dfirst != -1) && (dsecond == -1)){
    vertices[vsecond] = dfirst + 1;   // first endpoint visited, second not: set its depth
    done = false;
}
if ((dfirst == -1) && (dsecond != -1)){
    vertices[vfirst] = dsecond + 1;   // symmetric case
    done = false;
}
The bfs kernel is a map operation over the edges:
__global__ void
bfs( const Edge * edges,
     Vertex * vertices,
     int current_depth )
{
    int e = blockDim.x * blockIdx.x + threadIdx.x;
    int vfirst = edges[e].first;
    int dfirst = vertices[vfirst];
    int vsecond = edges[e].second;
    int dsecond = vertices[vsecond];
    if ((dfirst == current_depth) && (dsecond == -1)){
        vertices[vsecond] = dfirst + 1;   // expand the frontier across the edge
    }
    if ((dsecond == current_depth) && (dfirst == -1)){
        vertices[vfirst] = dsecond + 1;   // symmetric case: edge traversed the other way
    }
}
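A host-side driver sketch for this kernel, one thread per edge (the names d_edges, d_vertices, num_edges, and max_depth are assumptions; the kernel has no bounds check, so this assumes num_edges is a multiple of the block size):

int num_blocks = (num_edges + 255) / 256;
for (int depth = 0; depth < max_depth; ++depth) {
    // one pass per BFS level; each pass maps over all edges
    bfs<<<num_blocks, 256>>>(d_edges, d_vertices, depth);
    cudaDeviceSynchronize();
}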
The BFS code
__global__ void
initialize_vertices( Vertex * vertices,
int starting_vertex,
int num_vertices )
{
int v = blockDim.x * blockIdx.x + threadIdx.x;
if (v >= num_vertices) return;   // guard against stray threads in the last block
if (v == starting_vertex) vertices[v] = 0; else vertices[v] = -1;
}
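A launch sketch (grid sizing and the names d_vertices, starting_vertex are illustrative):

initialize_vertices<<<(num_vertices + 255) / 256, 256>>>(d_vertices, starting_vertex, num_vertices);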
Thread per row
__global__ void
spmv_csr_scalar_kernel(
    const int num_rows, const int * rowptr,
    const int * index, const float * value,
    const float * x, float * y)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < num_rows){
        float dot = 0;                    // accumulator (was missing a declaration)
        int row_start = rowptr[row];
        int row_end = rowptr[row+1];
        for (int jj = row_start; jj < row_end; jj++)
            dot += value[jj] * x[index[jj]];
        y[row] += dot;
    }
}
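A small worked CSR example for this kernel (a hypothetical 3x3 matrix, not from the lecture):

const int num_rows = 3;
// [10  0 20]
// [ 0 30  0]
// [40  0 50]
float value[]  = {10, 20, 30, 40, 50};   // nonzeros, row by row
int   index[]  = { 0,  2,  1,  0,  2};   // column of each nonzero
int   rowptr[] = { 0,  2,  3,  5};       // where each row starts in value/index
// after copying these arrays to the device:
// spmv_csr_scalar_kernel<<<(num_rows + 255) / 256, 256>>>(
//     num_rows, d_rowptr, d_index, d_value, d_x, d_y);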
Using one thread per body
__device__ float3
tile_calculation(Params myParams, float3 force){
    int i;
    extern __shared__ Params sourceParams[];   // one tile of source bodies, staged in shared memory
    for (i = 0; i < blockDim.x; i++){
        force += bodyBodyInteraction(myParams, sourceParams[i]);
    }
    return force;
}
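A sketch of the enclosing kernel that tiles the all-pairs interactions through shared memory (the structure follows the classic all-pairs formulation; the names calculate_forces, globalParams, and forces are assumptions):

__global__ void
calculate_forces(const Params* globalParams, float3* forces)
{
    extern __shared__ Params sourceParams[];
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    Params myParams = globalParams[gtid];
    float3 force = {0.0f, 0.0f, 0.0f};
    // walk over all tiles, staging one tile of source bodies at a time
    for (int tile = 0; tile < gridDim.x; tile++) {
        sourceParams[threadIdx.x] = globalParams[tile * blockDim.x + threadIdx.x];
        __syncthreads();                        // tile fully loaded before use
        force = tile_calculation(myParams, force);
        __syncthreads();                        // don't overwrite while others still read
    }
    forces[gtid] = force;
}

// launch with p threads per tile and shared memory for one tile:
// calculate_forces<<<N / p, p, p * sizeof(Params)>>>(d_params, d_forces);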
All Pairs N-body
O(N²): N objects × (N − 1) forces per object
O(N log N): tree methods (Barnes-Hut)
O(N): fast multipole method
Streams
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
cudaMemcpy(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice);
A<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost);
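To actually overlap transfers with kernels, the copies must be asynchronous and bound to streams. A minimal sketch (assuming the host arrays are pinned via cudaMallocHost, and that kernels A and B and the d_arr1/d_arr2 buffers exist):

cudaMemcpyAsync(d_arr1, h_arr1, numbytes, cudaMemcpyHostToDevice, s1);
A<<<1, 128, 0, s1>>>(d_arr1);               // runs in s1, overlaps with s2's copy
cudaMemcpyAsync(d_arr2, h_arr2, numbytes, cudaMemcpyHostToDevice, s2);
B<<<1, 128, 0, s2>>>(d_arr2);               // runs in s2
cudaStreamSynchronize(s1);
cudaStreamSynchronize(s2);
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);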
APOD
– measure & improve memory bandwidth
– assure sufficient occupancy
– minimize thread divergence
– within a warp
– avoid branchy code
– avoid thread workload imbalance
– don’t freak out
– consider fast math
– intrinsics __sinf(), __cosf(), etc. (see the sketch after this list)
– use double precision on purpose
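A quick illustration of the fast-math intrinsics (a sketch; __sinf/__cosf trade accuracy for speed versus the precise sinf/cosf):

__global__ void wave(float* out, const float* in, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        // fast, lower-accuracy hardware intrinsics
        out[i] = __sinf(in[i]) + __cosf(in[i]);
        // precise alternative: sinf(in[i]) + cosf(in[i])
    }
}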
Measuring Memory Bandwidth
// N (matrix dimension) and K (tile size) are assumed to be global
// compile-time constants, e.g. const int N = 1024, K = 16;
__global__ void
transpose_serial(float in[], float out[])
{
    for(int j=0; j < N; j++)
        for(int i=0; i < N; i++)
            out[j + i*N] = in[i + j*N];   // out(j,i) = in(i,j)
}
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
int i = threadIdx.x;
for(int j=0; j < N; j++)
out[j + i*N] = in[i + j*N];
}
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
int i = blockIdx.x * K + threadIdx.x;
int j = blockIdx.y * K + threadIdx.y;
out[j + i*N] = in[i + j*N];
}
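A launch sketch for comparing the three versions with the same GpuTimer used above (grid shapes are the usual ones for these kernels; this assumes N <= 1024 so one block can hold a thread per row):

GpuTimer timer;
dim3 blocks(N/K, N/K);
dim3 threads(K, K);

timer.Start();
transpose_serial<<<1, 1>>>(d_in, d_out);            // one thread does all N*N elements
timer.Stop();
printf("serial: %g ms\n", timer.Elapsed());

timer.Start();
transpose_parallel_per_row<<<1, N>>>(d_in, d_out);  // N threads, one row each
timer.Stop();
printf("per row: %g ms\n", timer.Elapsed());

timer.Start();
transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);  // N*N threads
timer.Stop();
printf("per element: %g ms\n", timer.Elapsed());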