Parallel computing

Parallel computing: many threads solving a problem by working together.
Map: tasks read from and write to specific data elements
Gather: tasks compute where to read their input from
Scatter: tasks compute where to write their output
Stencil: tasks read input from a fixed neighborhood in an array
Transpose: array, matrix, image, data structure
-> tasks re-order data elements in memory

struct foo {
	float f;
	int i;
};
foo array[1000];
array of structures (AoS) -> structure of arrays (SoA)
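
A minimal sketch of the same data laid out as a structure of arrays (the fooSoA name is just for illustration):

// structure-of-arrays layout: each field gets its own contiguous array,
// so threads working on consecutive elements touch adjacent memory
struct fooSoA {
	float f[1000];
	int i[1000];
};
fooSoA soa;
// AoS access: array[k].f    SoA access: soa.f[k]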

float out[], in[];       // pseudocode: in and out are arrays of floats
int i = threadIdx.x;
int j = threadIdx.y;

const float pi = 3.1415f;

out[i] = pi * in[i];                // map: one input element -> one output element
out[i + j*128] = in[j + i*128];     // transpose: re-orders data elements in memory

if (i % 2){
	// scatter: each task computes where to write its output
	out[i-1] += pi * in[i]; out[i+1] += pi * in[i];
	// gather: each task computes where to read its inputs from
	out[i] = (in[i] + in[i-1] + in[i+1]) * pi / 3.0f;
}

Greyscale Conversion

A common way to represent color images is known as RGBA: the color is specified by how much Red, Green, and Blue is in it. The 'A' stands for Alpha and is used for transparency.

Each channel (Red, Green, Blue, and Alpha) is represented by one byte, so each channel has 256 possible values and each pixel takes 4 bytes.

Greyscale images are represented by a single intensity value per pixel
which is one byte in size.

To convert an image from color to greyscale, one simple method is to set the intensity to the average of the RGB channels. But we will use a more sophisticated method that takes into account how the eye perceives color and weights the channels unequally.

The eye responds most strongly to green followed by red and then blue.
The NTSC (National Television System Committee) recommends the following
formula for color to greyscale conversion:

I = .299f * R + .587f * G + .114f * B

#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
					unsigned char* const greyImage,
					int numRows, int numCols)

{

}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
	unsigned char* const d_greyImage, size_t numRows, size_t numCols)

{
	const dim3 blockSize(1, 1, 1);
	const dim3 gridSize( 1, 1, 1);
	rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());	
}
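
The kernel body above is left empty and the launch uses a single 1x1x1 block, so only one thread would run. A minimal sketch of one possible completion, with one thread per pixel and the NTSC weights (the _sketch suffix, the 16x16 block size, and the grid math are arbitrary choices, not part of the original code):

__global__
void rgba_to_greyscale_sketch(const uchar4* const rgbaImage,
					unsigned char* const greyImage,
					int numRows, int numCols)
{
	// one thread per pixel: recover the 2D pixel coordinate from block/thread indices
	int c = blockIdx.x * blockDim.x + threadIdx.x;
	int r = blockIdx.y * blockDim.y + threadIdx.y;
	if (c >= numCols || r >= numRows) return;   // skip threads that fall outside the image

	uchar4 rgba = rgbaImage[r * numCols + c];
	// I = .299f * R + .587f * G + .114f * B
	greyImage[r * numCols + c] = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
}

// matching launch configuration: enough 16x16 blocks to cover the whole image
const dim3 blockSize(16, 16, 1);
const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x,
					(numRows + blockSize.y - 1) / blockSize.y, 1);
rgba_to_greyscale_sketch<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);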

void referenceCalculation(const uchar4* const rgbaImage,
unsigned char *const greyImage,
size_t numRows,
size_t numCols)
{
	for (size_t r = 0; r < numRows; ++r){
		for (size_t c = 0; c < numCols; ++c){
			uchar4 rgba = rgbaImage[r * numCols + c];
			float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
			greyImage[r * numCols + c] = channelSum;
		}
	}
}

#include <iostream>
#include "timer.h"
#include "utils.h"
#include <string>
#include <stdio.h>

size_t numRows();
size_t numCols();

void preProcess(uchar4 **h_rgbaImage, unsigned char **h_greyImage,
uchar4 **d_rgbaImage, unsigned char **d_greyImage,
const std::string& filename);

void postProcess(const std::string& output_file);

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols);

#include "HW1.cpp"

int main(int argc, char **argv){
uchar4 *h_rgbaImage, *d_rgbaImage;
unsigned char *h_greyImage, *d_greyImage;

std::string input_file;
std::string output_file;
if (argc == 3){
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
}
else {
std::cerr << "Usage: ./hw input_file output_file" << std::endl; exit(1); } preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file); GpuTimer timer; timer.Start(); your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols()); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); printf("\n"); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0){ std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } postProcess(output_file); return 0; } [/c]

MAP

MAP
– set of elements to process
– function to run on each element

map(elements, function)

How Pixels are represented
red, green, blue

struct uchar4 {
	unsigned char x, y, z, w;
};

converting color to black and white
I = (r + g + b)/3
I = .299f * r + .587f * g + .114f * b
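
A serial sketch of map(elements, function) for this conversion, assuming CUDA's uchar4 type and a hypothetical map_to_grey helper:

// map: apply the same independent per-pixel function to every element
void map_to_grey(const uchar4 *pixels, unsigned char *grey, int numPixels)
{
	for (int k = 0; k < numPixels; ++k) {
		uchar4 p = pixels[k];                               // p.x = r, p.y = g, p.z = b, p.w = alpha
		grey[k] = .299f * p.x + .587f * p.y + .114f * p.z;  // the "function" in map(elements, function)
	}
}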

Squaring Numbers

#include <stdio.h>

__global__ void square(float * d_out, float * d_in){
	int idx = threadIdx.x;
	float f = d_in[idx];
	d_out[idx] = f * f;
}

int main(int argc, char ** argv){
	const int ARRAY_SIZE = 64;
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

	float h_in[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++){
		h_in[i] = float(i);
	}
	float h_out[ARRAY_SIZE];

	float * d_in;
	float * d_out;

	cudaMalloc((void **) &d_in, ARRAY_BYTES);
	cudaMalloc((void **) &d_out, ARRAY_BYTES);

	cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

	square<<<1, ARRAY_SIZE>>>(d_out, d_in);

	cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);

	for (int i = 0; i < ARRAY_SIZE; i++){
		printf("%f", h_out[i]);
		printf(((i % 4) != 3) ? "\t" : "\n");
	}

	cudaFree(d_in);
	cudaFree(d_out);

	return 0;
}

$ less square.cu
$ nvcc -o square square.cu
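
Running the binary should print all 64 squares, four per line; the first line would be:

$ ./square
0.000000	1.000000	4.000000	9.000000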

cudaMemcpyHostToDevice: copy from host (CPU) memory to device (GPU) memory
cudaMemcpyDeviceToHost: copy from device (GPU) memory back to host (CPU) memory

configuring the kernel launch
Kernel <<< Grid of Blocks, Block of threads >>> (...
-> 1, 2, or 3D -> 1, 2, or 3D
dim3(x, y, z)
dim3(w, 1, 1) == dim3(w) == w

square<<<1, 64>>> == square<<<dim3(1,1,1), dim3(64,1,1)>>>
square<<<dim3(bx,by,bz), dim3(tx,ty,tz), shmem>>>(...)   // optional third argument: shared memory bytes per block
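
For example, a hypothetical my_kernel run over a 128x128 image with one thread per pixel could be launched like this (the 16x16 block size is only illustrative):

dim3 blockSize(16, 16, 1);          // 16*16 = 256 threads per block
dim3 gridSize(128/16, 128/16, 1);   // 8*8 = 64 blocks cover all 128*128 pixels
my_kernel<<<gridSize, blockSize>>>(d_out, d_in);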

A CUDA program

CPU allocates storage on the GPU
CPU copies input data from CPU to GPU
CPU launches kernel(s) on the GPU to process the data
CPU copies results back from the GPU to the CPU

defining the gpu computation
BIG IDEA
kernels look like serial programs

for(i=0; i<64; i++){
	out[i] = in[i] * in[i];
}

64 multiplications in serial:
if one multiply takes 2 ns, executing all 64 takes 64 * 2 ns = 128 ns

a high-level view
CPU: allocates memory, copies data to/from the GPU, launches the kernel
GPU: expresses out = in * in
the CPU launches 64 threads, so the 64 multiplications run in parallel:
even if one multiply takes 10 ns, all 64 execute in 10 ns

Power-efficient

CPU: optimized for latency (time, e.g. seconds per task)
GPU: optimized for throughput (stuff per unit time, e.g. jobs/hour)

Latency vs Bandwidth

car:
latency 22.5 hours
throughput 0.089 people/hour
bus:
latency: 90 hours
throughput 0.45 people/hour

e.g. an 8-core CPU (Intel):
8-wide AVX vector operations / core
2 threads / core (hyperthreading)
-> 8 * 8 * 2 = 128-way parallelism

CUDA program
which is written in C with extensions, with code for the CPU ("host") and the GPU ("device")

GPU computing

CUDA toolkit documentation v8.0
http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/index.html#axzz4aQaLXxZw

CPUs are close to the limit of instruction-level parallelism per clock cycle
-> add more processors to run computations faster

modern GPU:
– thousands of ALUs
– hundreds of processors
– tens of thousands of concurrent threads

GPU
-smaller, faster, less power, more on chip

CPU
– complex control hardware
– flexibility + performance
– expensive in terms of power

GPU
– simpler control hardware
– more hardware for computation
– potentially more power efficient
– more restrictive programming model

Getting started with nginx

Install nginx on CentOS

# rpm -ivh http://nginx.org/packages/centos/6/noarch/RPMS/nginx-release-centos-6-0.el6.ngx.noarch.rpm

# yum -y install nginx

here we go!
[vagrant@localhost app]$ nginx -v
nginx version: nginx/1.10.3

[vagrant@localhost app]$ sudo ps aux | grep nginx
vagrant 14953 0.0 0.1 103328 908 pts/1 S+ 10:17 0:00 grep nginx
Note: the ps command lists the status of processes. If options are omitted, it lists only the processes started by the user who ran the ps command.
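
Since the grep above only matches the grep process itself, nginx is not started yet. A possible next step, assuming the init script installed by the nginx.org package:

[vagrant@localhost app]$ sudo service nginx start
[vagrant@localhost app]$ curl -I http://localhost/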