#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call and terminates the program with a readable
// message (file, line, CUDA error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Writes the square of each input element to the output array.
 *
 * d_out: device pointer receiving the results.
 * d_in:  device pointer to the input values.
 *
 * Each thread handles the single element at index threadIdx.x, so the kernel
 * is meant to be launched with ONE block whose thread count does not exceed
 * the number of elements in either array. There is no bounds check because
 * the array length is not passed in.
 */
__global__ void square(float * d_out, float * d_in){
    int idx = threadIdx.x;
    float f = d_in[idx];
    d_out[idx] = f * f;
}

/*
 * Fills a 64-element host array with 0..63, squares it on the GPU and prints
 * the results four per line (tab separated).
 */
int main(int argc, char ** argv){
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Generate the input array on the host.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++){
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Declare and allocate the GPU buffers.
    float * d_in;
    float * d_out;
    CUDA_CHECK(cudaMalloc((void **) &d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void **) &d_out, ARRAY_BYTES));

    // Transfer the input from host (CPU) memory to device (GPU) memory.
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // Launch one block of ARRAY_SIZE threads: one thread per element.
    square<<<1, ARRAY_SIZE>>>(d_out, d_in);
    // A launch does not return a status; an invalid configuration is only
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy the results back to the host. This blocking copy also waits for
    // the kernel to finish, so kernel execution errors surface here.
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // Print the results, four values per line.
    for (int i = 0; i < ARRAY_SIZE; i++){
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    return 0;
}
$ less square.cu
$ nvcc -o square square.cu
cudaMemcpyHostToDevice (copy from host memory to device memory)
cudaMemcpyDeviceToHost (copy from device memory to host memory)
configuring the kernel launch
Kernel <<< Grid of Blocks, Block of Threads >>> (...)
-> 1, 2, or 3D -> 1, 2, or 3D
dim3(x, y, z)
dim3(w, 1, 1) == dim3(w) == w
square<<<1, 64>>> == square <<< dim3(1,1,1), dim3(64,1,1)>>>
square<<