#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include "gputimer.h"
int main(void)
{
	// generate N random numbers serially
	int N = 1000000;
	thrust::host_vector<float> h_vec(N);
	std::generate(h_vec.begin(), h_vec.end(), rand);
	// transfer data to the device
	thrust::device_vector<float> d_vec = h_vec;
	// sort data on the device (846M keys per second on GeForce GTX 480)
	GpuTimer timer;
	timer.Start();
	thrust::sort(d_vec.begin(), d_vec.end());
	timer.Stop();
	// transfer data back to host
	thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
	printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
	return 0;
}
	Quadratic GPU vs Serial CPU
GPU (quadratic): N^2 work, visits every edge many times but sets each vertex's depth only once
CPU (serial): N work, maintains a frontier to minimize visits per node
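For contrast with the quadratic GPU kernel further below, a minimal serial CPU BFS sketch that keeps an explicit frontier; the adjacency-list layout here is an assumption for illustration, not code from the lecture.
#include <vector>
#include <queue>
// Serial BFS: O(V + E) work because only the current frontier is expanded,
// and each vertex's depth is written exactly once.
void bfs_cpu(const std::vector<std::vector<int>>& adj, int start, std::vector<int>& depth)
{
	depth.assign(adj.size(), -1);          // -1 marks "not yet visited"
	std::queue<int> frontier;
	depth[start] = 0;
	frontier.push(start);
	while (!frontier.empty()) {
		int v = frontier.front();
		frontier.pop();
		for (int w : adj[v]) {
			if (depth[w] == -1) {          // set the depth once, then enqueue
				depth[w] = depth[v] + 1;
				frontier.push(w);
			}
		}
	}
}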
int N = 1 << 20;
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);
cublasSaxpy(N, 2.0f, d_x, 1, d_y, 1);
cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);
cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();
Do we have to worry about a race condition?
while (!h_done){
	bfs<<<blocks, threads>>>(edges, vertices);            // grid/block sizes are placeholders
	cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}
while (!h_done){
	cudaMemcpy(d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
	bfs<<<blocks, threads>>>(edges, vertices);
	cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}
	if ((dfirst != -1) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
		done = false;
	}
	if ((dfirst == -1) && (dsecond != -1)){
		vertices[vfirst] = dsecond + 1;
		done = false;
	}
	map operation
__global__ void
bfs( const Edge * edges,
	Vertex * vertices,
	int current_depth )
{
	int e = blockDim.x * blockIdx.x + threadIdx.x;
	int vfirst = edges[e].first;
	int dfirst = vertices[vfirst];
	int vsecond = edges[e].second;
	int dsecond = vertices[vsecond];
	if ((dfirst == current_depth) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
	}
	if ((dsecond == current_depth) && (dfirst == -1)){
		vertices[vfirst] = dsecond + 1;
	}
}
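To tie the kernel to the host loops above, a hedged host-side sketch that launches one pass per BFS level; it assumes the kernel also clears a device-side done flag when it updates a vertex, and that num_edges, d_edges, d_vertices, and d_done are already set up.
bool h_done = false;
const bool h_true = true;
for (int depth = 0; !h_done; depth++){
	cudaMemcpy(d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);     // reset the flag
	bfs<<<(num_edges + 255) / 256, 256>>>(d_edges, d_vertices, depth);     // one thread per edge
	cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyDeviceToHost);     // did anything change?
}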
	The BFS code
__global__ void
initialize_vertices( Vertex * vertices,
					int starting_vertex,
					int num_vertices )
{
	int v = blockDim.x * blockIdx.x + threadIdx.x;
	if (v == starting_vertex) vertices[v] = 0; else vertices[v] = -1;
}
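A hedged launch sketch for the kernel above; the block size of 256 and the rounded-up grid are assumptions.
// One thread per vertex; if num_vertices is not a multiple of the block size,
// the kernel would also need an `if (v < num_vertices)` guard.
const int threads = 256;
const int blocks  = (num_vertices + threads - 1) / threads;
initialize_vertices<<<blocks, threads>>>(d_vertices, starting_vertex, num_vertices);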
	Thread per row
__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x +
	threadIdx.x;
	if (row < num_rows){
		float dot = 0;
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start; jj < row_end; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
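To make the CSR layout concrete, a small worked example; the matrix values are made up, and the commented-out launch line assumes device copies of the arrays already exist.
// 3x4 sparse matrix stored in CSR form:
//   [ 10  0 20  0 ]
//   [  0 30  0 40 ]
//   [  0  0 50  0 ]
const int   num_rows = 3;
const int   rowptr[] = { 0, 2, 4, 5 };          // row i owns value[rowptr[i]..rowptr[i+1])
const int   index[]  = { 0, 2, 1, 3, 2 };       // column of each stored nonzero
const float value[]  = { 10, 20, 30, 40, 50 };  // the stored nonzeros
// One thread per row, as in the kernel above:
// spmv_csr_scalar_kernel<<<(num_rows + 255) / 256, 256>>>(num_rows, d_rowptr, d_index, d_value, d_x, d_y);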
	Using P threads, one per body
__device__ float3
tile_calculation(Params myParams, float3 force){
	int i;
	extern __shared__ Params sourceParams[];
	for (i = 0; i < blockDim.x; i++){
		force += bodyBodyInteraction(myParams, sourceParams[i]);
	}
	return force;
}
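For context, a hedged sketch of the kernel that would call tile_calculation, following the all-pairs tiling scheme; the name calculate_forces, the Params layout, and the launch details are assumptions.
// Each thread owns one body and walks the other bodies one shared-memory tile
// (blockDim.x bodies) at a time; shared memory is sized at launch as
// blockDim.x * sizeof(Params).
__global__ void
calculate_forces(const Params * params, float3 * accel, int num_bodies)
{
	extern __shared__ Params sourceParams[];              // one tile of source bodies
	int gtid = blockIdx.x * blockDim.x + threadIdx.x;
	Params myParams = params[gtid];
	float3 force = make_float3(0.0f, 0.0f, 0.0f);
	for (int tile = 0; tile * blockDim.x < num_bodies; tile++){
		sourceParams[threadIdx.x] = params[tile * blockDim.x + threadIdx.x];   // stage one body
		__syncthreads();
		force = tile_calculation(myParams, force);        // all interactions within this tile
		__syncthreads();
	}
	accel[gtid] = force;
}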
	All Pairs N-body
N objects * (N - 1) forces/object = O(N^2)
O(N log N): tree methods (Barnes-Hut)
O(N): fast multipole method
Streams
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
cudaMemcpy(d_arr, h_arr, numbytes, cudaMemcpyHostToDevice);
A<<<1, 128>>>(d_arr);
cudaMemcpy(h_arr, d_arr, numbytes, cudaMemcpyDeviceToHost);
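The snippet above creates two streams but then issues ordinary blocking copies; a hedged sketch of what the streams are for, splitting the array so the transfers and kernel of one half overlap with the other half. It assumes h_arr was allocated with cudaMallocHost (pinned) and that the kernel A can process half the data independently.
size_t half   = numbytes / 2;
int    n_half = half / sizeof(float);                    // elements per half (assumes float data)
cudaMemcpyAsync(d_arr,          h_arr,          half, cudaMemcpyHostToDevice, s1);
cudaMemcpyAsync(d_arr + n_half, h_arr + n_half, half, cudaMemcpyHostToDevice, s2);
A<<<1, 128, 0, s1>>>(d_arr);
A<<<1, 128, 0, s2>>>(d_arr + n_half);
cudaMemcpyAsync(h_arr,          d_arr,          half, cudaMemcpyDeviceToHost, s1);
cudaMemcpyAsync(h_arr + n_half, d_arr + n_half, half, cudaMemcpyDeviceToHost, s2);
cudaDeviceSynchronize();                                 // wait for both streams to finish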
APOD (Analyze, Parallelize, Optimize, Deploy)
– measure & improve memory bandwidth
 – assure sufficient occupancy
– minimize thread divergence
 – within a warp
 – avoid branchy code
 – avoid thread workload imbalance
 – don't freak out
– consider fast math (sketch below)
 – intrinsics __sinf(), __cosf(), etc.
 – use double precision on purpose
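A minimal sketch of the fast-math trade-off mentioned above; the kernel name and the data it operates on are made up for illustration.
// __sinf()/__cosf() use the hardware special-function units: much faster than
// sinf()/cosf(), but less accurate, so the trade-off should be deliberate.
__global__ void fast_vs_accurate(const float* in, float* out_fast, float* out_accurate, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) {
		out_fast[i]     = __sinf(in[i]) + __cosf(in[i]);   // fast, approximate
		out_accurate[i] = sinf(in[i])   + cosf(in[i]);     // slower, more accurate
	}
}
Compiling with nvcc's --use_fast_math flag applies the same substitution to every such call.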
Measuring Memory Bandwidth
__global__ void
transpose_serial(float in[], float out[])
{
	for(int j=0; j < N; j++)
		for(int i=0; i < N; i++)
			out[j + i*N] = in[i + j*N];
}
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	int i = threadIdx.x;
	for(int j=0; j < N; j++)
		out[j + i*N] = in[i + j*N];
}
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;
	out[j + i*N] = in[i + j*N];
}
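A hedged launch-and-measure sketch for the per-element version, reusing the GpuTimer from the Thrust example above; N, K, and the tile shape are assumptions consistent with the kernels (e.g. N = 1024, K = 16, with N a multiple of K).
dim3 blocks(N / K, N / K);                               // one K x K thread block per tile
dim3 threads(K, K);
GpuTimer timer;
timer.Start();
transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);
timer.Stop();
// each element is read once and written once: 2 * N * N * sizeof(float) bytes moved
double gb = 2.0 * N * N * sizeof(float) / 1e9;
printf("transpose: %g ms, %g GB/s\n", timer.Elapsed(), gb / (timer.Elapsed() / 1000.0));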