GPUメーカー

大手GPUメーカー
– Nvidia
– Intel
– AMD

まずNvidia

やべーなこれ、日本は全く相手にされてないな。

What we need to see:

BS or MS in Computer Science or equivalent program from an accredited University / College
10+ years of hands-on experiences building software and/or scalable cloud services
Strong self-initiative, passion, interpersonal skills, and agility working with new technology
Hands-on development of high quality distributed system features and/or cloud scale services, and RESTful web services
Experience with cloud system infrastructure, cloud-scale software, Continuous Integration and Continuous Delivery (CI/CD)
Demonstrated skills in wide variety of languages including: Java and Python
Deep understanding of cloud design in the areas of virtualization and global infrastructure, distributed systems, load balancing and security
Track record of crafting well-designed solutions and delivering high-quality software on time

Intel

AMD
REQUIREMENTS:

Advanced programming skills in C for operating system kernel & systems development
Experience with the GNU toolchain
Proficient use of git
Experience building and submitting patches on a mailing list and in general collaborative open source development
Excellent understanding of operating systems concepts, data structures, the x86-64 architecture, and virtualization
Experience with low level debug tools as well as emulators and simulators
Experience with open source software development
Experience working with external software partners
Experience running, analyzing, and tuning system performance benchmarks
Strong analysis and problem solving skills
Proven interpersonal skills, technical and team leadership and collaboration
Excellent written and verbal communication skills
Programming skills with Python and Bash
Preferred experience with Jenkins for CI/CT.

やべーなこれ。。

CUB

// Enable CUB's stderr error reporting (must be defined before including cub.cuh).
#define CUB_STDERR   // fixed: was "$define" — '$' is not a valid preprocessor token

#include <stdio.h>
#include <iostream>

#include <cub/cub.cuh>

using namespace cub;

bool g_verbose = false;   // when true, dump per-element input/output
int g_iterations = 100;   // timing iterations per Test<> instantiation

// ---------
// Kernels
// ---------

// Computes an exclusive prefix sum over a single tile of input using
// cub::BlockScan, and records how many device clocks the scan itself took.
//
// Launch expectation: one block of BLOCK_THREADS threads; each thread owns
// ITEMS_PER_THREAD items, so the tile is BLOCK_THREADS * ITEMS_PER_THREAD ints.
// d_out must have room for one extra int: the block-wide aggregate is written
// at index BLOCK_THREADS * ITEMS_PER_THREAD.
//
// NOTE(review): this targets an old CUB API (SmemStorage, free-standing
// BlockLoadVectorized/BlockStoreVectorized). Modern CUB spells these
// TempStorage and cub::BlockLoad/BlockStore — confirm the CUB version in use.
template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
	int *d_in,            // tile of input items
	int *d_out,           // tile of scanned items, plus one slot for the aggregate
	clock_t *d_elapsed)   // out: elapsed device clocks for the scan only
{
	// Block-wide scan primitive specialized for this block size.
	typedef BlockScan<int, BLOCK_THREADS > BlockScanT;

	// Shared memory required by BlockScan.
	__shared__ typename BlockScanT::SmemStorage smem_storage;

	// Each thread loads its ITEMS_PER_THREAD consecutive items.
	int data[ITEMS_PER_THREAD];
	BlockLoadVectorized(d_in, data);

	// Time only the scan, not the load/store.
	clock_t start = clock();

	int aggregate;
	BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);

	clock_t stop = clock();

	BlockStoreVectorized(d_out, data);

	// Thread 0 publishes the timing and the block-wide total.
	if(threadIdx.x == 0)
	{
		// clock() may wrap between samples; store the absolute difference.
		*d_elapsed = (start > stop)? start - stop : stop - start;
		d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
	}
}

// Fills h_in with a deterministic pattern (i % 17) and h_reference with its
// exclusive prefix sum. Returns the total of all inputs (the aggregate the
// GPU scan should also produce).
//
// h_in / h_reference must each hold num_elements ints. num_elements == 0 is
// fine: nothing is written and 0 is returned.
// (Fixed: the array subscripts had been corrupted into HTML entities.)
int Initialize(
	int *h_in,
	int *h_reference,
	int num_elements)
{
	int inclusive = 0;   // running sum == exclusive prefix for the next slot

	for (int i = 0; i < num_elements; ++i)
	{
		h_in[i] = i % 17;

		h_reference[i] = inclusive;
		inclusive += h_in[i];
	}
	return inclusive;
}

// Runs the block-scan kernel on one tile of BLOCK_THREADS * ITEMS_PER_THREAD
// ints, validates the result against a CPU reference, and reports the average
// clocks per scanned element over g_iterations launches.
template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
void Test()
{
	// Fixed: was "BLOCJ_THREAD", an undeclared identifier.
	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

	int *h_in = new int[TILE_SIZE];
	int *h_reference = new int[TILE_SIZE];
	int *h_gpu = new int[TILE_SIZE + 1];   // +1 slot for the aggregate

	int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

	int *d_in = NULL;
	int *d_out = NULL;
	clock_t *d_elapsed = NULL;
	cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE);
	cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
	cudaMalloc((void**)&d_elapsed, sizeof(clock_t));

	if (g_verbose)
	{
		printf("Input data: ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d, ", h_in[i]);
		printf("\n\n");
	}

	cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

	printf("BlockScan %d items (%d threads, %d items per thread):",
		TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);

	// Accumulate device clocks across iterations; the blocking cudaMemcpy
	// below also serializes each launch.
	clock_t elapsed_scan_clocks = 0;
	for (int i = 0; i < g_iterations; ++i)
	{
		// Fixed: was "d_otu", an undeclared identifier.
		BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
			d_in,
			d_out,
			d_elapsed);
		clock_t scan_clocks;
		cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
		cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
		elapsed_scan_clocks += scan_clocks;
	}

	// Validate the last iteration's output against the CPU reference.
	bool correct = true;
	for (int i = 0; i < TILE_SIZE; i++)
	{
		if (h_gpu[i] != h_reference[i])
		{
			printf("Incorrect result @ offset %d (%d != %d)\n",
				i, h_gpu[i], h_reference[i]);
			correct = false;
			break;
		}
	}

	if (h_gpu[TILE_SIZE] != h_aggregate)
	{
		printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
		correct = false;
	}
	if (correct) printf("Correct!\n");

	if (g_verbose)
	{
		printf("GPU output (reference output): ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d (%d), ", h_gpu[i], h_reference[i]);
		printf("\n");
		// Fixed: the arguments previously had no format specifiers at all,
		// which is undefined behavior and printed nothing useful.
		printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate);
		printf("\n\n");
	}

	printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);

	if (h_in) delete[] h_in;
	if (h_reference) delete[] h_reference;
	if (h_gpu) delete[] h_gpu;
	if (d_in) cudaFree(d_in);
	if (d_out) cudaFree(d_out);
	if (d_elapsed) cudaFree(d_elapsed);
}

// Entry point: prints the device name, then sweeps (threads, items-per-thread)
// configurations that all cover the same 1024-element tile.
int main(int argc, char** argv)
{
	cudaDeviceProp props;
	// Fixed: the return code was ignored; on failure props.name is garbage.
	cudaError_t err = cudaGetDeviceProperties(&props, 0);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
		return 1;
	}
	printf("Using device %s\n", props.name);

	Test<1024, 1>();
	Test<512, 2>();
	Test<256, 4>();
	Test<128, 8>();
	Test<64, 16>();
	Test<32, 32>();
	Test<16, 64>();

	return 0;
}

Time sort in Thrust

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>

#include "gputimer.h"

// Benchmarks thrust::sort on N random floats: generate on the host, copy to
// the device, time the device sort, and copy the results back.
int main(void)
{
	// Generate N random numbers serially on the host.
	int N = 1000000;
	thrust::host_vector<float> h_vec(N);
	std::generate(h_vec.begin(), h_vec.end(), rand);

	// Transfer data to the device (single H2D copy via the constructor).
	thrust::device_vector<float> d_vec = h_vec;

	// Sort on the device; time only the sort itself.
	// (~846M keys per second on a GeForce GTX 480.)
	GpuTimer timer;
	timer.Start();
	thrust::sort(d_vec.begin(), d_vec.end());
	timer.Stop();

	// Transfer the sorted data back to the host.
	thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

	// printf requires <cstdio>; added to the include block above.
	printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
	return 0;
}

Quadratic GPU vs Serial CPU

N² GPU: N² work — visits every edge many times, but sets each depth only once
CPU: N work — maintains a frontier to minimize visits per node

// SAXPY (y = 2*x + y) via the legacy cuBLAS v1 API.
// NOTE(review): assumes host arrays x, y and device pointers d_x, d_y are
// declared elsewhere — this is a fragment, confirm against the full listing.
int N = 1 << 20;   // fixed: was "int N == << 20;", which does not parse
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);   // fixed: was (void*)&d_y

// Fixed: the x call passed y where the source increment belongs.
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);

// Fixed: the library routine is cublasSaxpy, and it must operate on the
// device vector d_y, not the host pointer y.
cublasSaxpy(N, 2.0f, d_x, 1, d_y, 1);

cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);

cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();   // fixed: statement ended with ',' instead of ';'

Do we have to worry about a race condition?

// Broken version: d_done is never reset between iterations, so once the
// kernel leaves it false the loop can exit early / never make progress.
// NOTE(review): bfs launch configuration (<<<...>>>) is omitted in these
// lecture fragments — confirm against the full listing.
while (!h_done){
	bfs(edges, vertices);
	// Fixed: enum is cudaMemcpyDeviceToHost (cudaDeviceToHost does not exist).
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

// Corrected version: reset d_done to true before each pass; the kernel
// clears it whenever it still finds work to do.
while (!h_done){
	// Fixed: enum is cudaMemcpyHostToDevice (cudaHostToDevice does not exist).
	cudaMemcpy(&d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
	bfs(edges, vertices);
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

	// NOTE(review): this fragment looks inconsistent with the bfs kernel below,
	// where vfirst/vsecond are vertex *indices* and dfirst/dsecond are *depths*.
	// Here the guard tests vsecond == -1 and then indexes vertices[vsecond],
	// i.e. vertices[-1] — almost certainly depth/index confusion. Likewise the
	// second branch writes vertices[vfirst] when vfirst == -1. Left unchanged
	// because the intended variables are not visible in this fragment — confirm
	// against the original lecture listing.
	if((vfirst != -1) && (vsecond == -1)){
		vertices[vsecond] = vfirst + 1;
		done = false;
	}
	if ((vfirst == -1) && (vsecond != -1)){
		vertices[vfirst] = vsecond + 1;
		done = false;
	}

map operation

// One BFS relaxation pass: one thread per edge. An edge whose first endpoint
// sits on the current frontier (depth == current_depth) and whose second
// endpoint is unvisited (depth == -1) labels that endpoint with depth + 1.
//
// NOTE(review): there is no bounds check on e — this assumes the launch covers
// exactly num_edges threads; confirm the launch configuration.
__global__ void
bfs( const Edge * edges,
	Vertex * vertices,
	int current_depth )
{
	int e = blockDim.x * blockIdx.x + threadIdx.x;
	int vfirst = edges[e].first;
	int dfirst = vertices[vfirst];
	int vsecond = edges[e].second;
	int dsecond = vertices[vsecond];
	// Fixed: was "curent_depth", an undeclared identifier.
	if ((dfirst == current_depth) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
	}
	// Fixed: the original ended with an empty, unterminated "if (" — completed
	// here as the symmetric case (edge traversed from its second endpoint),
	// matching the two-branch sketch earlier in these notes; confirm intent.
	if ((dsecond == current_depth) && (dfirst == -1)){
		vertices[vfirst] = dsecond + 1;
	}
}

The BFS code

// Initializes BFS depths: the starting vertex gets depth 0, every other
// vertex gets -1 (unvisited). One thread per vertex.
__global__ void
initialize_vertices( Vertex * vertices,
					int starting_vertex,
					int num_vertices )
{
	int v = blockDim.x * blockIdx.x + threadIdx.x;
	// Fixed: "vertices[v] = 0 else ..." is not valid C, and the trailing
	// threads of the last block must not write past num_vertices.
	if (v < num_vertices)
		vertices[v] = (v == starting_vertex) ? 0 : -1;
}

Thread per row

// CSR sparse matrix-vector multiply, scalar flavor: one thread per row.
// Accumulates y[row] += A[row, :] * x using the rowptr/index/value CSR arrays.
__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x + threadIdx.x;
	if (row < num_rows){
		float dot = 0.0f;   // fixed: "dot" was never declared
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start; jj < row_end; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
// (fixed: a stray ")" followed the closing brace in the original)

Tile calculation using p threads (one thread per body)

// Accumulates the force on one body from every body in the shared-memory tile
// (N-body tile computation; blockDim.x bodies per tile).
// NOTE(review): "title" is likely a typo for "tile"; name kept unchanged so
// existing callers still link.
__device__ float3
title_calculation(Params myParams, float3 force){
	// Fixed: the parameter was spelled "myPrams" while the body used
	// "myParams" (undeclared identifier).
	// Fixed: "extern __shared__ Params[] sourceParams;" is invalid declarator
	// syntax — the brackets belong after the name.
	extern __shared__ Params sourceParams[];
	for (int i = 0; i < blockDim.x; i++){
		force += bodyBodyInteraction(myParams, sourceParams[i]);
	}
	return force;
}