__global__ void
bfs( const Edge * edges,
     Vertex * vertices,
     int current_depth )
{
    // One thread per edge: relax the edge in both directions.
    int e = blockDim.x * blockIdx.x + threadIdx.x;
    int vfirst  = edges[e].first;
    int dfirst  = vertices[vfirst];
    int vsecond = edges[e].second;
    int dsecond = vertices[vsecond];
    if ((dfirst == current_depth) && (dsecond == -1)) {
        vertices[vsecond] = dfirst + 1;
    }
    // Symmetric case for the undirected edge: the second endpoint may be
    // on the frontier while the first is still unvisited.
    if ((dsecond == current_depth) && (dfirst == -1)) {
        vertices[vfirst] = dsecond + 1;
    }
}
The BFS code
__global__ void
initialize_vertices( Vertex * vertices,
                     int starting_vertex,
                     int num_vertices )
{
    int v = blockDim.x * blockIdx.x + threadIdx.x;
    if (v >= num_vertices) return;               // guard stray threads
    // Depth 0 for the source vertex, -1 (unvisited) for everyone else.
    if (v == starting_vertex) vertices[v] = 0;
    else                      vertices[v] = -1;
}
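A host-side driver ties the two kernels together: initialize once, then launch bfs once per depth level. A minimal sketch, assuming a run_bfs helper, already-populated device pointers, and 256 threads per block (none of these names are in the notes):

void run_bfs(Edge * d_edges, Vertex * d_vertices, int starting_vertex,
             int num_edges, int num_vertices)
{
    const int T = 256;   // assumed block size
    initialize_vertices<<<(num_vertices + T - 1) / T, T>>>(
        d_vertices, starting_vertex, num_vertices);
    // In the worst case a path touches every vertex, so num_vertices levels
    // always suffice; a real driver would stop once a level changes nothing.
    for (int depth = 0; depth < num_vertices; depth++) {
        bfs<<<(num_edges + T - 1) / T, T>>>(d_edges, d_vertices, depth);
    }
    cudaDeviceSynchronize();
}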
Thread per row
__global__ void
spmv_csr_scalar_kernel( const int num_rows, const int * rowptr,
                        const int * index, const float * value,
                        const float * x, float * y )
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < num_rows) {
        float dot = 0;                     // accumulator for this row
        int row_start = rowptr[row];
        int row_end   = rowptr[row+1];
        for (int jj = row_start; jj < row_end; jj++)
            dot += value[jj] * x[index[jj]];
        y[row] += dot;
    }
}
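Usage sketch (the device arrays and block size are assumptions): one thread per row, so round the grid up to cover num_rows. Note y accumulates with +=, so it must be zeroed (or hold a valid partial result) beforehand.

int threads = 128;                                  // assumed block size
int blocks  = (num_rows + threads - 1) / threads;   // round up to cover all rows
spmv_csr_scalar_kernel<<<blocks, threads>>>(num_rows, d_rowptr, d_index,
                                            d_value, d_x, d_y);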
Using p threads per tile
__device__ float3
tile_calculation(Params myParams, float3 force)
{
    int i;
    // Dynamically sized shared array, filled by the calling kernel.
    extern __shared__ Params sourceParams[];
    for (i = 0; i < blockDim.x; i++) {
        // Assumes float3 has an overloaded += (e.g. from a helper header).
        force += bodyBodyInteraction(myParams, sourceParams[i]);
    }
    return force;
}
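Because sourceParams is declared extern __shared__, the calling kernel must pass the shared-memory size as the third launch parameter. A sketch, assuming a calculate_forces kernel and tile size p (both are my names, not from the notes):

int p = 256;   // threads per block = bodies per tile (assumed)
calculate_forces<<<num_bodies / p, p, p * sizeof(Params)>>>(d_params, d_forces);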
All Pairs N-body
N objects × (N − 1) forces/object ≈ N² force calculations (e.g. N = 1024 bodies is already about a million interactions)
N log N: tree methods (Barnes–Hut)
N: fast multipole method
Streams
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
cudaMemcpy(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice);
A<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost);
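As written, s1 and s2 are created but never used; streams only pay off once the copies become cudaMemcpyAsync on pinned host memory, so transfers and kernels in different streams can overlap. A sketch under those assumptions (the half-and-half split is made up):

// d_arr and h_arr assumed float*; h_arr allocated with cudaMallocHost (pinned).
int n = numbytes / sizeof(float);
cudaMemcpyAsync(d_arr,       h_arr,       numbytes / 2, cudaMemcpyHostToDevice, s1);
cudaMemcpyAsync(d_arr + n/2, h_arr + n/2, numbytes / 2, cudaMemcpyHostToDevice, s2);
A<<<1, 128, 0, s1>>>(d_arr);         // fourth launch parameter picks the stream
A<<<1, 128, 0, s2>>>(d_arr + n/2);
cudaStreamSynchronize(s1);
cudaStreamSynchronize(s2);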
APOD
– measure & improve memory bandwidth
– ensure sufficient occupancy
– minimize thread divergence
  – within a warp
  – avoid branchy code
  – avoid thread workload imbalance
  – don’t freak out
– consider fast math (see the sketch after this list)
  – intrinsics __sinf(), __cosf(), etc.
– use double precision on purpose
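A quick illustration of the fast-math point (the kernel is my own sketch): __sinf() runs on the special-function units and is much faster than sinf(), at reduced accuracy, so opt in deliberately.

__global__ void fast_vs_accurate(const float * in, float * out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;
    float accurate = sinf(in[i]);      // full-precision library call
    float fast     = __sinf(in[i]);    // SFU intrinsic: faster, less accurate
    out[i] = fast - accurate;          // inspect the error before committing
}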
Measuring Memory
__global__ void
transpose_serial(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_row(float in[], float out[])
{
    int i = threadIdx.x;
    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}

__global__ void
transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * K + threadIdx.x;   // K is the tile width
    int j = blockIdx.y * K + threadIdx.y;
    out[j + i*N] = in[i + j*N];
}
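Each version needs a different launch. A sketch of the three launches, assuming K = 16 (any K that divides N works) and the N = 1024 defined below:

const int K = 16;   // tile width (assumed)

transpose_serial<<<1, 1>>>(d_in, d_out);              // one thread total
transpose_parallel_per_row<<<1, N>>>(d_in, d_out);    // one thread per row
dim3 blocks(N/K, N/K);                                // one block per KxK tile
dim3 threads(K, K);
transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);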
transpose code
#include <stdio.h>
#include "gputimer.h"

const int N = 1024;

void transpose_CPU(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];   // out(j,i) = in(i,j)
}
APOD
Systematic Optimization
Analyze -> Parallelize -> Optimize -> Deploy
Analyze: profile the whole application
– where can it benefit?
– by how much?
Parallelize: pick an approach
Optimize: profile-driven optimization
Deploy: don’t optimize in a vacuum!
serial implementation
#include <stdio.h>

int main(int argc, char **argv)
{
    const int ARRAY_SIZE = 10;
    int acc = 0;
    int out[ARRAY_SIZE];
    int element[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    // Exclusive scan: out[i] holds the sum of all elements before i.
    for (int i = 0; i < ARRAY_SIZE; i++) {
        out[i] = acc;
        acc = acc + element[i];
    }
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%i ", out[i]);
    }
    return 0;
}
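The loop above is an exclusive scan (prefix sum). One standard way to parallelize it is a Hillis–Steele scan; the single-block kernel below is my sketch, not code from the notes, and assumes the array fits in one block:

__global__ void scan_hillis_steele(int * d_out, const int * d_in, int n)
{
    extern __shared__ int tmp[];                // n ints, passed at launch
    int tid = threadIdx.x;
    tmp[tid] = (tid > 0) ? d_in[tid - 1] : 0;   // shift right => exclusive scan
    __syncthreads();
    for (int stride = 1; stride < n; stride *= 2) {
        int val = (tid >= stride) ? tmp[tid - stride] : 0;
        __syncthreads();                        // all reads before any writes
        tmp[tid] += val;
        __syncthreads();                        // all writes before next reads
    }
    d_out[tid] = tmp[tid];
}
// Launch: scan_hillis_steele<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(int)>>>(d_out, d_in, ARRAY_SIZE);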
// Reverses the low `bits` bits of w (signature reconstructed from the body).
int bit_reverse(int w, int bits)
{
    int r = 0;
    for (int i = 0; i < bits; i++)
    {
        int bit = (w & (1 << i)) >> i;     // extract bit i
        r |= bit << (bits - i - 1);        // place it mirrored
    }
    return r;
}
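Presumably this scrambles the histogram input so consecutive threads land in scattered bins; a usage sketch (ARRAY_SIZE and its bit width are assumptions):

const int ARRAY_SIZE = 1024;   // assumed; any power of two
const int ARRAY_BITS = 10;     // log2(ARRAY_SIZE)
int h_in[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++)
    h_in[i] = bit_reverse(i, ARRAY_BITS);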
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    d_bins[myBin]++;   // RACE: unsynchronized read-modify-write on shared bins
}
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int myItem = d_in[myId];
    int myBin = myItem % BIN_COUNT;
    atomicAdd(&(d_bins[myBin]), 1);   // atomic increment removes the race
}
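Even simple_histo serializes on popular bins, since every thread issues a global atomic. A common refinement (my sketch, not from the notes) is a per-block histogram in shared memory, merged at the end:

__global__ void shared_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
    extern __shared__ int s_bins[];                   // BIN_COUNT ints per block
    for (int b = threadIdx.x; b < BIN_COUNT; b += blockDim.x)
        s_bins[b] = 0;                                // zero the private bins
    __syncthreads();
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    atomicAdd(&s_bins[d_in[myId] % BIN_COUNT], 1);    // cheap shared-memory atomic
    __syncthreads();
    for (int b = threadIdx.x; b < BIN_COUNT; b += blockDim.x)
        atomicAdd(&d_bins[b], s_bins[b]);             // one global atomic per bin
}
// Launch with BIN_COUNT * sizeof(int) bytes of shared memory per block.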