GPU

ビデオカード、グラフィックカード、グラフィックボードと呼び方は多数
ビデオチップをGPUと呼ぶ。ビデオメモリもある

macだと、グラフィックスに、Intel Iris Plus Graphics 640 1536 MB と記載がある
これはCPU一体型のGPUなので、独立型より性能は劣る

GPU
グラフィック演算処理、並列演算処理(スーパースカラ)が得意
CPUが3〜5の並列処理のところ、GPUは512も
コア数もCPUよりGPUの方が多い、 eg.5120
消費電力が高い
500MHz~650MHzあれば綺麗な画像で楽しめる

VRAM(ビデオメモリ)
 映像データを一時的に保存
 RAMと同様、容量、速度が重要。128〜256MB, 1GHz~1.8GHzあれば十分
メモリバス幅は128〜256bit

ビデオカードスロットとバスには 1) AGP(転送速度が2.128GB/S)と 2) PCIエクスプレスX16(主流、4GB/S) の2タイプがある

ビデオカードには、DVI-I端子(デジタル・アナログ)、D-sub15ピンのVGA端子(アナログ)、DVI-D(デジタル)などの端子がある
TV出力端子(S端子)はテレビに表示

グラフィックボードのチップメーカーはNVIDIA、AMDなど
NVIDIAはよく聞きますね。あれ、AMDって、CPUだけでなく、GPUもやってるの?

MSI GeForce GTX 1660Ti Gaming X
MSI GeForce GTX 1060 GAMING X 6G
Nvidia GeForce RTX 2080 8GB GDDR6 – Founders Edition
 L VRAMの容量が大きいほど描画が綺麗になりやすい
 L GeForceはゲームを目的とすると、動作保証が取れているということ

GPUメーカー

大手GPUメーカー
– Nvidia
– Intel
– AMD

まずNvidia

やべーなこれ、日本は全く相手にされてないな。

What we need to see:

BS or MS in Computer Science or equivalent program from an accredited University / College
10+ years of hands-on experiences building software and/or scalable cloud services
Strong self-initiative, passion, interpersonal skills, and agility working with new technology
Hands-on development of high quality distributed system features and/or cloud scale services, and RESTful web services
Experience with cloud system infrastructure, cloud-scale software, Continuous Integration and Continuous Delivery (CI/CD)
Demonstrated skills in wide variety of languages including: Java and Python
Deep understanding of cloud design in the areas of virtualization and global infrastructure, distributed systems, load balancing and security
Track record of crafting well-designed solutions and delivering high-quality software on time

Intel

AMD
REQUIREMENTS:

Advanced programming skills in C for operating system kernel & systems development
Experience with the GNU toolchain
Proficient use of git
Experience building and submitting patches on a mailing list and in general collaborative open source development
Excellent understanding of operating systems concepts, data structures, the x86-64 architecture, and virtualization
Experience with low level debug tools as well as emulators and simulators
Experience with open source software development
Experience working with external software partners
Experience running, analyzing, and tuning system performance benchmarks
Strong analysis and problem solving skills
Proven interpersonal skills, technical and team leadership and collaboration
Excellent written and verbal communication skills
Programming skills with Python and Bash
Preferred experience with Jenkins for CI/CT.

やべーなこれ。。

CUB

// Configuration/globals for the CUB block-scan benchmark below.
#define CUB_STDERR   // was "$define" — invalid preprocessor syntax

#include <stdio.h>
#include <iostream>

#include <cub/cub.cuh>

using namespace cub;

bool g_verbose = false;   // dump per-element input/output when true
int g_iterations = 100;   // timing repetitions per configuration

// ---------
// Kernels
// ---------

// Block-wide exclusive prefix sum over one tile of ints, timed with clock().
//
// Launch layout: <<<1, BLOCK_THREADS>>>; each thread owns ITEMS_PER_THREAD
// items, so one block scans BLOCK_THREADS * ITEMS_PER_THREAD elements.
//
// d_in      : device tile of input integers
// d_out     : device output; exclusive prefix sums, plus the tile aggregate
//             stored one slot past the end (index BLOCK_THREADS * ITEMS_PER_THREAD)
// d_elapsed : device clock() delta spanning just the scan, written by thread 0
//
// NOTE(review): this is the legacy CUB 1.x interface (SmemStorage, free-function
// BlockLoadVectorized/BlockStoreVectorized); modern CUB spells these
// BlockScanT::TempStorage and cub::BlockLoad/BlockStore. Confirm the CUB
// version pinned by the build before modernizing.
template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
	int *d_in,
	int *d_out,
	clock_t *d_elapsed)
{
	// Specialize the block-scan primitive for this block size.
	typedef BlockScan<int, BLOCK_THREADS > BlockScanT;

	// Shared memory required by the block-wide scan.
	__shared__ typename BlockScanT::SmemStorage smem_storage;

	// Each thread loads its ITEMS_PER_THREAD consecutive items.
	int data[ITEMS_PER_THREAD];
	BlockLoadVectorized(d_in, data);

	clock_t start = clock();

	// In-place exclusive sum; `aggregate` receives the total of the tile.
	int aggregate;
	BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);

	clock_t stop = clock();

	BlockStoreVectorized(d_out, data);

	if(threadIdx.x == 0)
	{
		// Absolute difference guards against clock() wrap-around.
		*d_elapsed = (start > stop)? start - stop : stop - start;
		// Publish the tile aggregate one slot past the scanned outputs.
		d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
	}
}

// Fill h_in with the repeating pattern (i % 17) and h_reference with its
// exclusive prefix sum. Returns the total of all num_elements inputs,
// which callers use as the expected tile aggregate.
//
// h_in         : out — receives the generated input values
// h_reference  : out — receives the exclusive-scan reference
// num_elements : number of elements to generate
int Initialize(
	int *h_in,
	int *h_reference,
	int num_elements)
{
	int inclusive = 0;

	for (int i = 0; i < num_elements; ++i)
	{
		h_in[i] = i % 17;   // was HTML-escaped "&#91;i&#93;" — garbled brackets restored

		// Exclusive scan: each slot holds the sum of everything before it.
		h_reference[i] = inclusive;
		inclusive += h_in[i];
	}
	return inclusive;
}

// Host-side driver: builds one tile, runs BlockPrefixSumKernel g_iterations
// times, checks the scan and the aggregate against the CPU reference, and
// reports average clocks per scanned element.
template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
void Test()
{
	// was "BLOCJ_THREAD" — undefined identifier
	const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

	// Host buffers: input, CPU reference, GPU result (+1 slot for aggregate).
	int *h_in = new int[TILE_SIZE];
	int *h_reference = new int[TILE_SIZE];
	int *h_gpu = new int[TILE_SIZE + 1];

	int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

	// Device buffers.
	int *d_in = NULL;
	int *d_out = NULL;
	clock_t *d_elapsed = NULL;
	cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE);
	cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
	cudaMalloc((void**)&d_elapsed, sizeof(clock_t));

	if (g_verbose)
	{
		printf("Input data: ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d, ", h_in[i]);
		printf("\n\n");
	}

	cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

	printf("BlockScan %d items (%d threads, %d items per thread):",
		TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);

	// Run repeatedly, accumulating the kernel's self-reported clock() deltas.
	clock_t elapsed_scan_clocks = 0;
	for (int i = 0; i < g_iterations; ++i)
	{
		BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
			d_in,
			d_out,	// was "d_otu" — undefined identifier
			d_elapsed);
		// The blocking cudaMemcpy calls double as synchronization with the kernel.
		clock_t scan_clocks;
		cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
		cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
		elapsed_scan_clocks += scan_clocks;
	}

	// Element-wise verification against the CPU reference.
	bool correct = true;
	for (int i = 0; i < TILE_SIZE; i++)
	{
		if (h_gpu[i] != h_reference[i])
		{
			printf("Incorrect result @ offset %d (%d != %d)\n",
				i, h_gpu[i], h_reference[i]);
			correct = false;
			break;
		}
	}

	// The kernel stores the tile aggregate one slot past the scanned outputs.
	if (h_gpu[TILE_SIZE] != h_aggregate)
	{
		printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
		correct = false;
	}
	if (correct) printf("Correct!\n");

	if (g_verbose)
	{
		printf("GPU output (reference output): ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d (%d), ", h_gpu[i], h_reference[i]);
		printf("\n");
		// was missing "%d (%d)" — the two arguments were silently dropped
		printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate);
		printf("\n\n");
	}

	printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);

	// Release host and device resources.
	if (h_in) delete[] h_in;
	if (h_reference) delete[] h_reference;
	if (h_gpu) delete[] h_gpu;
	if (d_in) cudaFree(d_in);
	if (d_out) cudaFree(d_out);
	if (d_elapsed) cudaFree(d_elapsed);
}

// Entry point: print the device name, then sweep block-size / items-per-thread
// configurations that all scan the same 1024-element tile.
int main(int argc, char** argv)
{
	cudaDeviceProp props;
	// was unchecked — on failure props.name would be printed uninitialized
	cudaError_t err = cudaGetDeviceProperties(&props, 0);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
		return 1;
	}
	printf("Using device %s\n", props.name);

	Test<1024, 1>();
	Test<512, 2>();
	Test<256, 4>();
	Test<128, 8>();
	Test<64, 16>();
	Test<32, 32>();
	Test<16, 64>();

	return 0;
}

Timing sort in Thrust

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>

#include "gputimer.h"

// Benchmark: fill N floats on the host, sort them on the device with
// thrust::sort, and report the elapsed sort time (copies excluded).
int main(void)
{
	// Serially fill a host vector with N pseudo-random values.
	const int N = 1000000;
	thrust::host_vector<float> h_vec(N);
	std::generate(h_vec.begin(), h_vec.end(), rand);

	// Copy the data up to the GPU.
	thrust::device_vector<float> d_vec(h_vec);

	// Time just the device-side sort (846M keys per second on GeForce GTX 480).
	GpuTimer timer;
	timer.Start();
	thrust::sort(d_vec.begin(), d_vec.end());
	timer.Stop();

	// Bring the sorted keys back to the host.
	thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

	printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
	return 0;
}

Quadratic GPU vs Serial CPU

N2 GPU: N2 work visit every edge many times but only sets depth once
CPU: N work maintains frontier to minimize visits / node

// Canonical legacy-cuBLAS SAXPY example: y = 2*x + y on the device.
// NOTE(review): x and y are the host arrays, assumed allocated and filled
// elsewhere in the notes; declarations added so the fragment is readable.
float *x, *y;        // host vectors of length N
float *d_x, *d_y;    // device vectors

int N = 1 << 20;     // was "int N == << 20;" — broken syntax
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);   // was (void*) — wrong cast

// was "x, y, d_x" — one argument too many; signature is (n, elemSize, x, incx, y, incy)
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);

// was "saxpy(..., y, 1)" — the cuBLAS call operates on device pointers
cublasSaxpy(N, 2.0f, d_x, 1, d_y, 1);

cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);

cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();    // was terminated with a comma

Do we have to worry about a race condition?

// Host-side BFS driver loops (lecture fragments): relaunch the bfs kernel
// until a full pass makes no updates.
//
// Version 1 — never re-arms the done flag between passes:
while (!h_done){
	bfs(edges, vertices);   // NOTE(review): launch config <<<...>>> omitted in the notes
	// was "cudaDeviceToHost" — the enum is cudaMemcpyDeviceToHost
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

// Version 2 — resets d_done to true before each pass so the kernel can
// clear it whenever it performs an update (avoids reading a stale flag).
while(!h_done){
	// was "cudaHostToDevice" — the enum is cudaMemcpyHostToDevice
	cudaMemcpy(&d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
	bfs(edges, vertices);
	cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

	// NOTE(review): lecture fragment — both branches look inverted: each guard
	// establishes that the vertex used as the index is -1, so vertices[-1] is
	// written (out of bounds). Compare with the bfs kernel further down, which
	// keeps vertex ids (vfirst/vsecond) separate from depths (dfirst/dsecond).
	// `done` is presumably a flag cleared whenever any depth changes, so the
	// host loop knows another pass is needed — confirm against the full kernel.
	if((vfirst != -1) && (vsecond == -1)){
		vertices[vsecond] = vfirst + 1;
		done = false;
	}
	if ((vfirst == -1) && (vsecond != -1)){
		vertices[vfirst] = vsecond + 1;
		done = false;
	}

map operation

// Edge-parallel BFS step (a "map" over edges): one thread per edge. A vertex
// whose stored depth equals current_depth is on the frontier; its unvisited
// (-1) neighbor across the edge receives depth current_depth + 1.
// NOTE(review): assumes gridDim.x * blockDim.x == number of edges — an
// explicit bounds check on `e` is needed if the launch over-provisions.
__global__ void
bfs( const Edge * edges,
	Vertex * vertices,
	int current_depth )
{
	int e = blockDim.x * blockIdx.x + threadIdx.x;
	int vfirst = edges[e].first;
	int dfirst = vertices[vfirst];
	int vsecond = edges[e].second;
	int dsecond = vertices[vsecond];
	// Relax the edge in both directions. was "curent_depth" — typo.
	if ((dfirst == current_depth) && (dsecond == -1)){
		vertices[vsecond] = dfirst + 1;
	}
	// Symmetric case; the original trailed off with an empty "if (" here —
	// reconstructed to mirror the branch above. Confirm against lecture source.
	if ((dsecond == current_depth) && (dfirst == -1)){
		vertices[vfirst] = dsecond + 1;
	}
}

The BFS code

// Initialize BFS depths: the starting vertex gets depth 0, every other
// vertex gets -1 (unvisited). One thread per vertex.
__global__ void
initialize_vertices( Vertex * vertices,
					int starting_vertex,
					int num_vertices )
{
	int v = blockDim.x * blockIdx.x + threadIdx.x;
	// Guard the grid tail — num_vertices was previously unused, and the
	// original "= 0 else" was a syntax error.
	if (v < num_vertices)
		vertices[v] = (v == starting_vertex) ? 0 : -1;
}

Thread per row

// CSR sparse matrix-vector multiply, scalar flavor: one thread per row.
// y[row] += dot(A[row, :], x), where row `row` occupies value[rowptr[row]
// .. rowptr[row+1]) with column indices in index[].
// NOTE(review): rowptr must have num_rows + 1 entries; adjacent threads read
// different rows, so value/index accesses are uncoalesced (the classic
// motivation for the vector/warp-per-row variant).
__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x + threadIdx.x;
	if (row < num_rows){
		float dot = 0.0f;   // was never declared — accumulator for this row
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start; jj < row_end; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}