Reduce

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
	int myId = threadIdx.x + blockDim.x * blockIdx.x;
	int tid = threadIdx.x;

	// tree-based reduction in global memory
	for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
	{
		if (tid < s) { d_in[myId] += d_in[myId + s]; }
		__syncthreads();	// make sure all adds at this stage are done
	}
	// only thread 0 writes this block's result back to global memory
	if (tid == 0) { d_out[blockIdx.x] = d_in[myId]; }
}
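
How this kernel might be driven from the host (a minimal sketch, not the course's exact harness): it assumes d_in holds size elements, size is divisible by the block width, and d_intermediate is a scratch buffer; the kernel runs once to produce one partial sum per block and then once more to combine those partial sums.

// hypothetical host-side driver for the two-pass reduce;
// assumes size % maxThreadsPerBlock == 0 and size / maxThreadsPerBlock
// is itself small enough to fit in a single block
void reduce(float * d_out, float * d_intermediate, float * d_in, int size)
{
	const int maxThreadsPerBlock = 1024;
	int threads = maxThreadsPerBlock;
	int blocks = size / maxThreadsPerBlock;
	global_reduce_kernel<<<blocks, threads>>>(d_intermediate, d_in);

	// now only `blocks` partial sums remain; reduce them with one block
	threads = blocks;
	blocks = 1;
	global_reduce_kernel<<<blocks, threads>>>(d_out, d_intermediate);
}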

Atomic memory

__global__ void increment_naive(int *g)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	i = i % ARRAY_SIZE;
	g[i] = g[i] + 1;
}

__global__ void increment_atomic(int *g)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	i = i % ARRAY_SIZE;
	atomicAdd(& g[i], 1);
}

int main(int argc, char **argv)
{
	GpuTimer timer;
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

	int array[ARRAY_SIZE];
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
}

strategies for efficient CUDA programming
1. high arithmetic intensity (math / memory ops)
- minimize time spent on memory
- put data in faster memory: local > shared > global
- use coalesced global memory access (see the sketch below)
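
A minimal illustration of the coalescing point (hypothetical kernels, not from the notes): adjacent threads should touch adjacent addresses so a warp's loads and stores combine into a few wide transactions.

// coalesced: thread i touches element i, so neighboring threads hit
// neighboring words and a warp's accesses merge into few transactions
__global__ void copy_coalesced(float *out, const float *in)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	out[i] = in[i];
}

// strided: thread i touches element i*stride, so the threads of a warp
// land in different memory segments and many more transactions issue
__global__ void copy_strided(float *out, const float *in, int stride)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	out[i] = in[i * stride];
}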

shared memory

// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
__global__ void use_shared_memory_GPU(float *array)
{
	// local variables, private to each thread
	int i, index = threadIdx.x;
	float average, sum = 0.0f;

	// __shared__ variables are visible to all threads in the block
	// and live only as long as the thread block
	__shared__ float sh_arr[128];

	// each thread copies one element from global to shared memory
	sh_arr[index] = array[index];

	__syncthreads();	// ensure all writes to shared memory have finished

	// average of all elements before this thread's index
	for (i = 0; i < index; i++) { sum += sh_arr[i]; }
	average = sum / (index + 1.0f);

	// array[] is in global memory, so this change is visible to the host
	if (array[index] > average) { array[index] = average; }

	// this write has NO EFFECT: shared memory vanishes when the block ends
	sh_arr[index] = 3.14;
}
// which accesses are coalesced? (BLOCK_WIDTH threads per block)
__global__ void foo(float *g)
{
	float a = 3.14;
	int i = threadIdx.x;
	g[i] = a;				// coalesced: adjacent threads, adjacent locations
	g[i*2] = a;				// strided: adjacent threads write every other location
	a = g[i];				// coalesced read
	a = g[BLOCK_WIDTH/2 + i];		// coalesced read (contiguous, just offset)
	g[i] = a * g[BLOCK_WIDTH/2 + i];	// coalesced read and write
	g[BLOCK_WIDTH-1 - i] = a;		// coalesced: contiguous locations, reversed order
}
#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE 10

#define BLOCK_WIDTH 1000

void print_array(int *array, int size);	// helper that prints the result array

__global__ void increment_naive(int *g)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	i = i % ARRAY_SIZE;
	g[i] = g[i] +1;
}
int main(int argc, char **argv)
{
	GpuTimer timer;
	printf("%d total threads in %d blocks writing into %d array elements\n",
		NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
}
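
The notes cut main() off after the printf; a sketch of how it likely continues inside main (allocate and zero a device array, launch one of the kernels above, copy back, report the time); the exact original body may differ:

	// declare, allocate, and zero the GPU memory
	int h_array[ARRAY_SIZE];
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
	int * d_array;
	cudaMalloc((void **) &d_array, ARRAY_BYTES);
	cudaMemset((void *) d_array, 0, ARRAY_BYTES);

	// launch the kernel: increment_naive races, increment_atomic does not
	timer.Start();
	increment_atomic<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
	timer.Stop();

	// copy back the result and report
	cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
	for (int i = 0; i < ARRAY_SIZE; i++) { printf("%d ", h_array[i]); }
	printf("\nTime elapsed = %g ms\n", timer.Elapsed());

	cudaFree(d_array);
	return 0;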

Global memory

// using different memory spaces in CUDA
// a __device__ or __global__ function runs on the GPU
__global__ void use_local_memory_GPU(float in)
{
	float f;	// "f" is in local memory, private to each thread
	f = in;		// the parameter "in" is also in local memory
}

int main(int argc, char **argv)
{
	// a kernel that uses only local (per-thread) memory
	use_local_memory_GPU<<<1, 128>>>(2.0f);

	float h_arr[128];	// h_arr lives on the host
	float *d_arr;

	// allocate global memory on the device, visible to all GPU threads
	cudaMalloc((void **)&d_arr, sizeof(float)*128);
}
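
The notes stop at the cudaMalloc; under the "Global memory" heading the natural continuation is a kernel that reads and writes that device array. A sketch (the kernel name and launch details are illustrative):

// "array" points at memory allocated with cudaMalloc, so every thread
// in every block can see it, and the host can copy it back afterwards
__global__ void use_global_memory_GPU(float *array)
{
	array[threadIdx.x] = 2.0f * (float) threadIdx.x;
}

// inside main(), after the cudaMalloc above:
//	cudaMemcpy(d_arr, h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
//	use_global_memory_GPU<<<1, 128>>>(d_arr);
//	cudaMemcpy(h_arr, d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);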

The need for barriers

int idx = threadIdx.x;
__shared__ int array[128];
array[idx] = threadIdx.x;
if (idx < 127)
	array[idx] = array[idx+1];	// race: may read a neighbor that has not been written yet
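
With barriers between the writes and the reads the race goes away. A corrected sketch (the kernel name and the extra temporary are mine; the second barrier keeps any thread from overwriting an element a neighbor has not read yet):

__global__ void shift_left(void)
{
	int idx = threadIdx.x;
	__shared__ int array[128];

	array[idx] = threadIdx.x;
	__syncthreads();	// every element is written before anyone reads

	int temp = (idx < 127) ? array[idx + 1] : array[idx];
	__syncthreads();	// every element is read before anyone overwrites

	array[idx] = temp;
}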

thread, thread block

CUDA: a hierarchy of
- computation (threads, thread blocks, kernels)
- memory spaces (local, shared, global)
- synchronization (barriers within a block)

Writing Efficient Programs
High-level strategy
1. maximize arithmetic intensity (math / memory ops)

Parallel communication patterns

map: one-to-one
transpose: one-to-one
gather: many-to-one
scatter: one-to-many
stencil: several-to-one
reduce: all-to-one

summary of programming model
kernels – C/C++ functions

kernel foo()
thread blocks: groups of threads that cooperate to solve a (sub)problem

kernel bar()

streaming multiprocessors (SMs) run the thread blocks

CUDA makes few guarantees about when and where thread blocks will run.

Advantages
– hardware can run things efficiently
– no waiting on slowpokes
– scalability!
from cell phones to supercomputers
from current to future GPUs

#include <stdio.h>

#define NUM_BLOCKS 16
#define BLOCK_WIDTH 1

__global__ void hello()
{
	printf("Hello world! I'm a thread block %d\n", blockIdx.x);
}

int main(int argc, char **argv)
{
	// launch the kernel
	hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();

	cudaDeviceSynchronize();
	printf("That's all!\n");

	return 0;
}

Parallel computing

Parallel computing: many threads solving a problem by working together.
Map: tasks read from and write to specific data elements
Gather: tasks compute where to read input from
Scatter: tasks compute where to write output
Stencil: tasks read input from a fixed neighborhood in an array
Transpose: array, matrix, image, data structure
-> tasks re-order data elements in memory

struct foo {
	float f;
	int i;
};
foo array[1000];	// array of structures (AoS)
array of structures -> structure of arrays (SoA)
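
What the structure-of-arrays form looks like (a sketch with hypothetical names): each field gets its own array, so adjacent threads reading the same field touch adjacent memory and the accesses coalesce.

// structure of arrays (SoA): each field stored contiguously
struct foo_soa {
	float f[1000];
	int   i[1000];
};

// AoS: thread t reads array[t].f  -> stride of sizeof(foo) bytes between threads
// SoA: thread t reads soa.f[t]    -> adjacent floats, coalesced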

// quiz: identify the communication pattern of each statement
float out[], in[];
int i = threadIdx.x;
int j = threadIdx.y;

const float pi = 3.1415;

out[i] = pi * in[i];			// map: one-to-one
out[i + j*128] = in[j + i*128];		// transpose: one-to-one re-ordering

if (i % 2) {
	out[i-1] += pi * in[i]; out[i+1] += pi * in[i];	// scatter: thread computes where to write
	out[i] = (in[i] + in[i-1] + in[i+1]) * pi/3.0f;	// gather: thread reads several inputs
}

Greyscale Conversion

A common way to represent color images is known as RGBA – the color is specified by how much Red, Green, and Blue is in it. The ‘A’ stands for Alpha and is used for transparency.

Each channel (Red, Green, Blue, and Alpha) is represented by one byte.
Since we use one byte per channel, there are 256 possible values for each channel. This means we use 4 bytes per pixel.

Greyscale images are represented by a single intensity value per pixel
which is one byte in size.

To convert an image from color to grayscale one simple method is to set the intensity to the average of the RGB channels. But we will use a more sophisticated method that takes into account how the eye perceives color and weights the channels unequally.

The eye responds most strongly to green followed by red and then blue.
The NTSC (National Television System Committee) recommends the following
formula for color to greyscale conversion:

I = .299f * R + .587f * G + .114f * B

#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
					unsigned char* const greyImage,
					int numRows, int numCols)

{
	// TODO: fill in the kernel that maps each RGBA pixel to a grey value
	// (see the sketch after your_rgba_to_greyscale below)
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
	unsigned char* const d_greyImage, size_t numRows, size_t numCols)

{
	const dim3 blockSize(1, 1, 1);
	const dim3 gridSize(1, 1, 1);
	rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

	cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());	
}
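
A possible way to fill in the stub (a sketch, not the official solution): launch one thread per pixel with a 2D grid and apply the NTSC weights from above; the 16x16 block size below is just an illustrative choice.

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
					unsigned char* const greyImage,
					int numRows, int numCols)
{
	int c = blockIdx.x * blockDim.x + threadIdx.x;	// column of this thread's pixel
	int r = blockIdx.y * blockDim.y + threadIdx.y;	// row of this thread's pixel
	if (c >= numCols || r >= numRows) return;	// guard against partial blocks

	uchar4 rgba = rgbaImage[r * numCols + c];
	float I = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
	greyImage[r * numCols + c] = (unsigned char) I;
}

// with a matching launch configuration, e.g.
//	const dim3 blockSize(16, 16, 1);
//	const dim3 gridSize((numCols + 15) / 16, (numRows + 15) / 16, 1);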

void referenceCalculation(const uchar4* const rgbaImage,
unsigned char *const greyImage,
size_t numRows,
size_t numCols)
{
	for (size_t r = 0; r < numRows; ++r) {
		for (size_t c = 0; c < numCols; ++c) {
			uchar4 rgba = rgbaImage[r * numCols + c];
			float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
			greyImage[r * numCols + c] = channelSum;
		}
	}
}

#include <iostream>
#include "timer.h"
#include "utils.h"
#include <string>
#include <stdio.h>

size_t numRows();
size_t numCols();

void preProcess(uchar4 **h_rgbaImage, unsigned char **h_greyImage,
uchar4 **d_rgbaImage, unsigned char **d_greyImage,
const std::string& filename);

void postProcess(const std::string& output_file);

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols);

#include "HW1.cpp"

int main(int argc, char **argv){
uchar4 *h_rgbaImage, *d_rgbaImage;
unsigned char *h_greyImage, *d_greyImage;

std::string input_file;
std::string output_file;
if (argc == 3){
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
}
else {
std::cerr << "Usage: ./hw input_file output_file" << std::endl; exit(1); } preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file); GpuTimer timer; timer.Start(); your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols()); timer.Stop(); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); printf("\n"); int err = printf("%f msecs.\n", timer.Elapsed()); if (err < 0){ std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; exit(1); } postProcess(output_file); return 0; } [/c]

MAP

– set of elements to process
– function to run on each element

map(elements, function)
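
A minimal map kernel in this spirit (names are made up for illustration): one thread per element, each applying the same function to its own element.

// the "function to run on each element"
__device__ float square(float x) { return x * x; }

// the map itself: thread i reads element i, writes element i
__global__ void map_kernel(float *d_out, const float *d_in)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	d_out[i] = square(d_in[i]);
}

// launched as, e.g., map_kernel<<<numElems / 64, 64>>>(d_out, d_in);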

How Pixels are represented
red, green, blue

struct uchar4 {
	unsigned char x, y, z, w;
};

converting color to black and white
I = (r + g + b)/3
I = .299f * r + .587f * g + .114f * b