pre-loop code
…
for (int i=0; i <= threadIdx; ++i)
{
...some loop code...
}
...
post-loop code
pre-loop, loop code, post-loop
fundamental GPU algorithm
-reduce, scan, histogram
Atomic memory
__global__ void increment_naive(int *g) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = i % ARRAY_SIZE; g[i] = g[i] + 1; } __global__ void increment_atomic(int *g) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = i % ARRAY_SIZE; atomicAdd(& g[i], 1); } int main(int argc, char **argv) { PpuTimer timer; printf("%d total threads in %d blocks writing into %d array elements\n", NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE); int array[ARRAY_SIZE]; const_int ARRAY_BYTES = ARRAY_SIZE * sizeof(int); }
strategies for efficient cuda programming
1. high arithmetic intensity math/memory
-minimize time spent on memory
-put data in faster memory
local > shared > global
-use coalesced global memory access
shared memory
// using different memory spaces in CUDA // a __device__ or __global__ function runs on the GPU __global__ void use_local_memory_GPU(float in) { int i, index = threadIdx.x; float average, sum = 0.0f; __shared__ float sh_arr[128]; sh_arr[index] = array[index]; __syncthreads(); for (i=0; i<index; i++){sum+= sh_arr[i]; } average = sum / (index + 1.0f); if (array[index] > average) { array[index] = average; } sh_arr[index] = 3.14; }
__global__ void foo(float *g) { float a = 3.14; int i = threadIdx.x; g[i] = a; g[i*2] = a; a = g[i]; a = g[BLOCK_WIDTH/2 + i]; g[i] = a * g[BLOCK_WIDTH/2 + i]; g[BLOCK_wIDTH-1 - i] = a; }
#include <stdio.h> #include "gputimer.h" #define NUM_THREADS 1000000 #define ARRAY_SIZE 10 #define BLOCK_WIDTH 1000 void print_array(int *array, int size) __global__ void increment_naive(int *g) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = i % ARRAY_SIZE; g[i] = g[i] +1; } int main(int argc, char **argv) { GpuTimer timer; printf("%d total threads in %d blocks writing into %d array elements\n", NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE); }
Global memory
// using different memory spaces in CUDA // a __device__ or __global__ function runs on the GPU __global__ void use_local_memory_GPU(float in) { float f; f = in; } int main(int argc, char **argv) { use_local_memory_GPU<<<1, 128>>>(2.0f); float h_arr[128]; float *d_arr; cudaMalloc((void **)&d_arr, sizeof(float)*128); }
The need for barriers
int idx = threadIdx.x; --shared-- int array[128]; array[idx]=threadIdx.x; if(idx < 127) array[idx] = array[idx+1]
thead, thread block
CUDA
a hierarchy of
-computation
-memory spaces
synchronization
Writing Efficient Program
High-level strategy
1.maximize arithmetic intensity math/memory
Parallel communication pattern
map: one-to-one
transpose: one-to-one
Gather many-to-one
scatter one-to-many
stencil several-to-one
reduce
summary of programming model
kernels – c / c++ functions
kernel foo(),
thread blocks: group of threads that cooperate to solve a (sub) problem
kernel bar()
streaming multiprocessors
SMs
CUDA makes few guarantees about when and where thread blocks will run.
Advantages
– hardware can run things efficiently
– no waiting on lowpokes
– scalability!
from cell phones to supercomputers
from current to future GPUs
#inculde <stdio.h> #define NUM_BLOCKS 16 #define BLOCK_WIDTH 1 __global__ void hello() { printf("Hello world! I'm a thread block %d\n", blockIdx.x); } int main(int argc, char **argv) { // launch the kernel hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>(); cudaDeviceSynchronize(); printf("That's all!\n"); return 0; }
Parallel computing
Parallel computing: many threads solving a problem by working together.
Map:Tasks read from and write to specific data elements
Gather,
Scatter: tasks compute where to write output
Stencil:tasks read input from a fixed neighborhood in an array:
Transpose:array, matrix, image, data structure
-> tasks re-order data elements in memory
struct foo {
float f;
int i;
};
foo array[1000];
array of structures -> structure of array
float out[], in[]; int i = threadIdx.x; int j = threadIdx.y; const float pi = 3.1415; out[i] = pi*in[i]; out[i + j*128] = in[j + i*128]; if (i%2){ out[i-1] += pi * in[i]; out[i+1] += pi * in[i]; out[i] = (in[i]+ in[i-1] + in[i+1]) * pi/3.0f; }
Greyscale Conversion
A common way to represent color images is known as RGBA – the color is specified by how much Red, Green, and Blue is in it. The ‘A’ stands for Alpha and is used for transparency;
Each channel Red, Blue, Green, and Alpha is represented by one byte.
Since we are using one byte for each color there are 256 different possible values for each color. This means we use 4 bytes per pixel.
Greyscale images are represented by a single intensity value per pixel
which is one byte in size.
To convert an image from color to grayscale one simple method is to set the intensity to the average of the RGB channels. But we will use a more sophisticated method that takes into account how the eye perceives color and weights the channels unequally.
The eye responds most strongly to green followed by red and then blue.
The NTSC (National Television System Committee) recommends the following
formula for color to greyscale conversion:
I = .299f * R + .587f * G + .114f * B
#include "reference_calc.cpp" #include "utils.h" #include <stdio.h> __global__ void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) { } void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage, unsigned char* const d_greyImage, size_t numRows, size_t numCols) { const dim3 blockSize(1, 1, 1): const dim3 gridSize( 1, 1, 1); rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); }
void referenceCalculation(const uchar4* const rgbaImage,
unsigned char *const greyImage,
size_t numRows,
size_t numCols)
{
for (size_t r = 0; r < umRows; ++r){
for (size_t c = 0; c < numCols; ++c){
uchar4 rgba = rgbaImage[r * numCols + c];
float channelSum = .299f * rgba.x + .587f * rgba.y + .144f * rgba.z;
greyImage[r * numCols + c] = channelSum;
}
}
}
[/c]
[c]
#include
#include “timer.h”
#include “utils.h”
#include
#include
size_t numRows();
size_t numCols();
void preProcess(uchar4 **h_rgbaImage, unsigned char **h_greyImage,
uchar4 **d_rgbaImage, unsigned char **d_greyImage,
const std::string& filename);
void postProcess(const std::string& output_file);
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const_greyImage, size_t numRows, size_t numCols);
#include “HW1.cpp”
int main(int argc, char **argv){
uchar4 *h_rgbaImage, *d_rgbaImage;
unsigned char *h_greyImage, *d_greyImage;
std::string input_file;
std::string output_file;
if (argc == 3){
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
}
else {
std::cerr << "Usage: ./hw input_file output_file" << std::endl;
exit(1);
}
preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);
GpuTimer timer;
timer.Start();
your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
timer.Stop();
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
printf("\n");
int err = printf("%f msecs.\n", timer.Elapsed());
if (err < 0){
std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
exit(1);
}
postProcess(output_file);
return 0;
}
[/c]
MAP
MAP
– set of elements to process
– function to run on each element
map(elements, function)
How Pixels are represented
red, green, blue
struct uchar4Σ
unsigned char x, y, z, w
converting color to black and white
I = (r + g + b)/3
I = .299f * r + .587f
Squaring Numbers
#include__global__ void square(float * d_out, float * d_in){ int idx = threadIdx.x; float f = d_in[idx]; d_out[idx] = f * f; } int main(int argc, char ** argv){ const int ARRAY_SIZE = 64; const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float); float h_in[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++){ h_in[i] = float(i); } float h_out[ARRAY_SIZE]; float * d_in; float * d_out; cudaMalloc((void **) &d_in, ARRAY_BYTES); cudaMalloc((void **) &d_out, ARRAY_BYTES); cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpy???); square<<<1, ARRAY_SIZE>>>(d_out, d_in); cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpy???); for (int i = 0; i < ARRAY_SIZE; i++){ printf("%f", h_out[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } cudaFree(d_in); cudaFree(d_out); return 0; }
$ less square.cu
$ nvcc -o square square.cu
cuda mem cpu host to device
cuda mem cpu device to host
configuring the kernel launch
Kernel <<< Grid of Blocks, Block of threads >>> (...
-> 1, 2, or 3D -> 1, 2, or 3D
dim3(x, y, z)
dim3(w, i, i)==dim3(w)==w
square<<<1, 64>>> == square <<< dim3(1,1,1), dim3(64,1,1)>>>
square<<