#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include "gputimer.h"

int main(void)
{
    // generate N random numbers serially on the host
    int N = 1000000;
    thrust::host_vector<float> h_vec(N);
    std::generate(h_vec.begin(), h_vec.end(), rand);

    // transfer data to the device
    thrust::device_vector<float> d_vec = h_vec;

    // sort data on the device (846M keys per second on GeForce GTX 480)
    GpuTimer timer;
    timer.Start();
    thrust::sort(d_vec.begin(), d_vec.end());
    timer.Stop();

    // transfer data back to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
    return 0;
}
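To check a figure like the 846M keys/s in the comment on your own GPU, the rate follows directly from the timer. A minimal addition just before the return, assuming GpuTimer::Elapsed() reports milliseconds as the printf above suggests:

    float ms = timer.Elapsed();
    double mkeys = N / (ms / 1000.0) / 1e6;   // keys per second, in millions
    printf("Sort rate: %g Mkeys/s\n", mkeys);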
Quadratic GPU vs Serial CPU
N² GPU: N² work; the kernel visits every edge many times, but sets each vertex's depth only once.
CPU: N work; maintains a frontier to minimize visits per node.
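For contrast, a minimal sketch of the frontier-based serial CPU approach (the adjacency-list layout and names are my own, not from the course code):

    #include <vector>
    #include <queue>

    // Serial BFS: each vertex enters the frontier at most once,
    // so total work is O(V + E) rather than O(depth * E).
    std::vector<int> bfs_cpu(const std::vector<std::vector<int>>& adj, int source)
    {
        std::vector<int> depth(adj.size(), -1);
        std::queue<int> frontier;
        depth[source] = 0;
        frontier.push(source);
        while (!frontier.empty()) {
            int v = frontier.front();
            frontier.pop();
            for (int w : adj[v]) {
                if (depth[w] == -1) {          // visit each vertex once
                    depth[w] = depth[v] + 1;
                    frontier.push(w);
                }
            }
        }
        return depth;
    }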
// SAXPY with the legacy cuBLAS API: y = 2.0*x + y
int N = 1 << 20;
float *d_x, *d_y;   // device copies of the host arrays x and y

cublasInit();

cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);

cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);

cublasSaxpy(N, 2.0, d_x, 1, d_y, 1);

cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);

cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();
Do we have to worry about race conditions?
// relaunch the kernel until no vertex changes in a pass
while (!h_done) {
    bfs<<<blocks, threads>>>(edges, vertices);
    cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

// better: reset the device flag to true before every launch
while (!h_done) {
    cudaMemcpy(d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
    bfs<<<blocks, threads>>>(edges, vertices);
    cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

// inside the kernel: propagate the frontier across the edge in either
// direction, and clear the done flag whenever a depth is set
if ((dfirst != -1) && (dsecond == -1)) {
    vertices[vsecond] = dfirst + 1;
    done = false;
}
if ((dfirst == -1) && (dsecond != -1)) {
    vertices[vfirst] = dsecond + 1;
    done = false;
}
map operation
__global__ void bfs(const Edge* edges, Vertex* vertices, int current_depth)
{
    int e = blockDim.x * blockIdx.x + threadIdx.x;

    int vfirst = edges[e].first;
    int dfirst = vertices[vfirst];
    int vsecond = edges[e].second;
    int dsecond = vertices[vsecond];

    // advance the frontier across this edge, in either direction
    if ((dfirst == current_depth) && (dsecond == -1)) {
        vertices[vsecond] = dfirst + 1;
    }
    if ((dsecond == current_depth) && (dfirst == -1)) {
        vertices[vfirst] = dsecond + 1;
    }
}
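To the race-condition question above: two edges can try to set the same vertex's depth in the same launch, but every writer writes the same value, current_depth + 1, so the race is benign. If you want the update explicitly race-free anyway, here is a hedged sketch using atomicCAS, assuming the vertex array holds plain int depths as the -1 sentinel suggests (the num_edges guard is my addition):

    __global__ void bfs_atomic(const Edge* edges, int* vertices,
                               int current_depth, int num_edges)
    {
        int e = blockDim.x * blockIdx.x + threadIdx.x;
        if (e >= num_edges) return;

        int vfirst = edges[e].first;
        int vsecond = edges[e].second;

        if (vertices[vfirst] == current_depth) {
            // only one thread can swap -1 -> current_depth + 1
            atomicCAS(&vertices[vsecond], -1, current_depth + 1);
        }
        if (vertices[vsecond] == current_depth) {
            atomicCAS(&vertices[vfirst], -1, current_depth + 1);
        }
    }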
The BFS code
__global__ void initialize_vertices(Vertex* vertices, int starting_vertex, int num_vertices)
{
    int v = blockDim.x * blockIdx.x + threadIdx.x;
    if (v >= num_vertices) return;   // guard stray threads in the last block
    if (v == starting_vertex)
        vertices[v] = 0;
    else
        vertices[v] = -1;
}
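A plausible launch for this kernel; the block size, the ceil-division grid calculation, and the d_vertices name are my choices, not from the notes:

    const int BLOCK = 256;
    int grid = (num_vertices + BLOCK - 1) / BLOCK;   // round up so every vertex gets a thread
    initialize_vertices<<<grid, BLOCK>>>(d_vertices, starting_vertex, num_vertices);
    cudaDeviceSynchronize();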
Thread per row
__global__ void spmv_csr_scalar_kernel(const int num_rows, const int* rowptr,
                                       const int* index, const float* value,
                                       const float* x, float* y)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < num_rows) {
        float dot = 0;   // was used uninitialized in the original snippet
        int row_start = rowptr[row];
        int row_end = rowptr[row + 1];
        for (int jj = row_start; jj < row_end; jj++)
            dot += value[jj] * x[index[jj]];
        y[row] += dot;
    }
}
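As a concrete reminder of the CSR layout this kernel walks, here is a toy matrix of my own (not from the notes):

    // 3x3 matrix:            CSR arrays:
    //  [ 5 0 1 ]             value  = {5, 1, 2, 3, 4}
    //  [ 0 2 0 ]             index  = {0, 2, 1, 0, 2}   // column of each value
    //  [ 3 0 4 ]             rowptr = {0, 2, 3, 5}      // row i spans [rowptr[i], rowptr[i+1])
    float value[]  = {5, 1, 2, 3, 4};
    int   index[]  = {0, 2, 1, 0, 2};
    int   rowptr[] = {0, 2, 3, 5};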
Using P threads per tile
__device__ float3 tile_calculation(Params myParams, float3 force)
{
    extern __shared__ Params sourceParams[];
    // accumulate the interactions with every body staged in this tile
    for (int i = 0; i < blockDim.x; i++) {
        force += bodyBodyInteraction(myParams, sourceParams[i]);
    }
    return force;
}
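A sketch of the kernel that would drive tile_calculation, staging one tile of bodies into shared memory per iteration. The structure follows the classic GPU Gems 3 N-body pattern; every name beyond those already above is mine, and it assumes num_bodies is a multiple of blockDim.x:

    __global__ void calculate_forces(const Params* bodies, float3* forces, int num_bodies)
    {
        extern __shared__ Params sourceParams[];
        int gtid = blockIdx.x * blockDim.x + threadIdx.x;
        Params myParams = bodies[gtid];
        float3 force = {0.0f, 0.0f, 0.0f};

        for (int tile = 0; tile * blockDim.x < num_bodies; tile++) {
            // each thread stages one body of the current tile
            sourceParams[threadIdx.x] = bodies[tile * blockDim.x + threadIdx.x];
            __syncthreads();
            force = tile_calculation(myParams, force);
            __syncthreads();   // don't overwrite the tile while others still read it
        }
        forces[gtid] = force;
    }

    // launched with dynamic shared memory for one tile of bodies, e.g.:
    // calculate_forces<<<num_bodies/256, 256, 256 * sizeof(Params)>>>(d_bodies, d_forces, num_bodies);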
All Pairs N-body
N²: N objects × (N − 1) forces/object ≈ N² interactions (for N = 100,000 bodies, about 10¹⁰ pair computations)
N log N: tree methods (Barnes–Hut)
N: fast multipole method
stream
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);

cudaMemcpy(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice);
A<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost);

cudaStreamDestroy(s1);
cudaStreamDestroy(s2);
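The snippet above creates streams but never uses them, so everything still runs in the default stream. A hedged sketch of what the streams are for, splitting the work in half and overlapping copies with kernels across s1 and s2; it assumes d_arr and h_arr are float arrays of N elements with numbytes = N * sizeof(float), and that h_arr is pinned:

    // h_arr must be pinned (cudaMallocHost) for async copies to overlap
    int half = numbytes / 2;
    cudaMemcpyAsync(d_arr,         h_arr,         half, cudaMemcpyHostToDevice, s1);
    cudaMemcpyAsync(d_arr + N/2,   h_arr + N/2,   half, cudaMemcpyHostToDevice, s2);
    A<<<1, 128, 0, s1>>>(d_arr);
    A<<<1, 128, 0, s2>>>(d_arr + N/2);
    cudaMemcpyAsync(h_arr,         d_arr,         half, cudaMemcpyDeviceToHost, s1);
    cudaMemcpyAsync(h_arr + N/2,   d_arr + N/2,   half, cudaMemcpyDeviceToHost, s2);
    cudaStreamSynchronize(s1);
    cudaStreamSynchronize(s2);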
APOD (Analyze, Parallelize, Optimize, Deploy)
– measure & improve memory bandwidth
– assure sufficient occupancy
– minimize thread divergence
  – within a warp
  – avoid branchy code
  – avoid thread workload imbalance
– don't freak out
– consider fast math (see the sketch after this list)
  – intrinsics __sinf(), __cosf(), etc.
  – use double precision on purpose
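On the fast-math point, a small illustration of the tradeoff (the kernel and its names are mine): __sinf() maps to the special function units and is much faster than sinf(), at reduced accuracy, and compiling with --use_fast_math makes the swap globally.

    __global__ void sines(const float* x, float* accurate, float* fast, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            accurate[i] = sinf(x[i]);     // software implementation, full accuracy
            fast[i]     = __sinf(x[i]);   // hardware intrinsic: faster, less accurate
        }
    }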
Measuring Memory
const int N = 1024;   // matrix dimension; assumed, the notes use N without defining it
const int K = 16;     // tile size for the per-element version; likewise assumed

__global__ void transpose_serial(float in[], float out[])
{
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            out[j + i*N] = in[i + j*N];
}

__global__ void transpose_parallel_per_row(float in[], float out[])
{
    int i = threadIdx.x;
    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}

__global__ void transpose_parallel_per_element(float in[], float out[])
{
    int i = blockIdx.x * K + threadIdx.x;
    int j = blockIdx.y * K + threadIdx.y;
    out[j + i*N] = in[i + j*N];
}
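To actually measure memory bandwidth as the heading says, time a kernel and convert to GB/s; a minimal sketch reusing the GpuTimer from the Thrust example above, with d_in and d_out as hypothetical device arrays (the formula counts one read and one write per element):

    GpuTimer timer;
    timer.Start();
    transpose_parallel_per_element<<<dim3(N/K, N/K), dim3(K, K)>>>(d_in, d_out);
    timer.Stop();

    float ms = timer.Elapsed();
    // 2 transfers (read + write) of N*N floats
    double gb = 2.0 * N * N * sizeof(float) / 1e9;
    printf("transpose: %g ms, %g GB/s\n", ms, gb / (ms / 1000.0));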