#include <stdio.h>
// Element-wise square, one element per thread.
//
// The thread whose threadIdx.x is t reads d_in[t] and stores its square in
// d_out[t]. Only threadIdx.x is used for addressing (blockIdx is ignored), so
// every block of a launch touches the same elements [0, blockDim.x); both
// arrays must therefore hold at least blockDim.x floats.
__global__ void square(float * d_out, float * d_in){
    const float * src = d_in + threadIdx.x;
    float * dst = d_out + threadIdx.x;

    // Load once into a local so the input is read a single time per thread.
    const float value = *src;
    *dst = value * value;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char * what){
    if (status == cudaSuccess){
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Squares the numbers 0..63 on the GPU and prints them four per line.
//
// Steps: fill a host array, allocate two device buffers, copy the input to the
// device, launch `square` with one block of ARRAY_SIZE threads, copy the result
// back, and print it. Every CUDA runtime call is checked; on the first failure
// the remaining steps are skipped, device buffers are released, and the
// process exits with status 1. Returns 0 on success.
int main(int argc, char ** argv){
    (void) argc;
    (void) argv;

    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Host input: h_in[i] == i.
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++){
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // Start as null so the cudaFree calls below are safe even if an
    // allocation fails (cudaFree on a null pointer performs no operation).
    float * d_in = NULL;
    float * d_out = NULL;

    // Short-circuit evaluation stops at the first failing call.
    bool ok = cudaSucceeded(cudaMalloc((void **) &d_in, ARRAY_BYTES), "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void **) &d_out, ARRAY_BYTES), "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok){
        // One block of ARRAY_SIZE threads: one thread per array element.
        square<<<1, ARRAY_SIZE>>>(d_out, d_in);

        // A kernel launch returns no status itself; launch errors are
        // retrieved with cudaGetLastError(). The copy back to the host then
        // reports any error raised while the kernel was running.
        ok = cudaSucceeded(cudaGetLastError(), "square kernel launch")
          && cudaSucceeded(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");
    }

    if (ok){
        // Four values per row: tab-separated, newline after every fourth.
        for (int i = 0; i < ARRAY_SIZE; i++){
            printf("%f%s", h_out[i], ((i % 4) != 3) ? "\t" : "\n");
        }
    }

    cudaFree(d_in);
    cudaFree(d_out);
    return ok ? 0 : 1;
}
$ less square.cu
$ nvcc -o square square.cu
cudaMemcpy, host to device (cudaMemcpyHostToDevice)
cudaMemcpy, device to host (cudaMemcpyDeviceToHost)
configuring the kernel launch
Kernel <<< Grid of Blocks, Block of threads >>> (...)
Grid of Blocks -> 1, 2, or 3D; Block of threads -> 1, 2, or 3D
dim3(x, y, z)
dim3(w, 1, 1) == dim3(w) == w
square<<<1, 64>>> == square <<< dim3(1,1,1), dim3(64,1,1)>>>
square<<< dim3(bx, by, bz), dim3(tx, ty, tz), shmem >>>(...)