Squaring Numbers

#include <stdio.h>

// Kernel: squares one float per thread.
//
// Each thread loads the element of d_in at its own threadIdx.x and stores
// that value multiplied by itself at the same position in d_out.
//
// Launch layout: only threadIdx.x is used for indexing (blockIdx is ignored),
// and no bounds check is performed, so both arrays must hold at least as many
// elements as there are threads along x in a block.
__global__ void square(float * d_out, float * d_in){
	const int tid = threadIdx.x;
	const float * src = d_in + tid;
	float * dst = d_out + tid;
	const float value = *src;
	*dst = value * value;
}

// Returns true when `err` is cudaSuccess; otherwise prints the CUDA error
// string, tagged with `what`, to stderr and returns false.
static bool cudaOk(cudaError_t err, const char * what){
	if (err == cudaSuccess){
		return true;
	}
	fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
	return false;
}

// Squares the numbers 0..63 on the GPU and prints the results, four per line.
//
// Steps: fill a host array, allocate device buffers, copy the input to the
// device, launch `square` with one block of ARRAY_SIZE threads, copy the
// result back, and print it.
//
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed
// (in which case nothing is printed to stdout).
int main(int argc, char ** argv){
	const int ARRAY_SIZE = 64;
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

	// Host input: h_in[i] = i.
	float h_in[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++){
		h_in[i] = float(i);
	}
	float h_out[ARRAY_SIZE];

	// Start as NULL so the cudaFree calls below are safe even if an
	// allocation never happened.
	float * d_in = NULL;
	float * d_out = NULL;

	// Short-circuit chain: the first failing step stops the later ones.
	bool ok = cudaOk(cudaMalloc((void **) &d_in, ARRAY_BYTES), "cudaMalloc(d_in)")
		&& cudaOk(cudaMalloc((void **) &d_out, ARRAY_BYTES), "cudaMalloc(d_out)")
		&& cudaOk(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
		          "cudaMemcpy host to device");

	if (ok){
		// One block of ARRAY_SIZE threads: one thread per element.
		square<<<1, ARRAY_SIZE>>>(d_out, d_in);

		// A kernel launch returns no status itself; launch-configuration
		// errors are reported by cudaGetLastError(). The blocking copy that
		// follows reports errors raised while the kernel was running.
		ok = cudaOk(cudaGetLastError(), "square kernel launch")
			&& cudaOk(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
			          "cudaMemcpy device to host");
	}

	if (ok){
		// Four values per row: tab-separated, newline after every fourth.
		for (int i = 0; i < ARRAY_SIZE; i++){
			printf("%f", h_out[i]);
			printf(((i % 4) != 3) ? "\t" : "\n");
		}
	}

	// cudaFree on a NULL pointer performs no operation.
	cudaFree(d_in);
	cudaFree(d_out);

	return ok ? 0 : 1;
}

$ less square.cu
$ nvcc -o square square.cu

cudaMemcpyHostToDevice: copy from host (CPU) memory to device (GPU) memory
cudaMemcpyDeviceToHost: copy from device (GPU) memory back to host (CPU) memory

configuring the kernel launch
Kernel <<< Grid of Blocks, Block of Threads >>> (...)
-> the grid of blocks can be 1, 2, or 3D
-> each block of threads can be 1, 2, or 3D
dim3(x, y, z)
dim3(w, 1, 1) == dim3(w) == w

square<<<1, 64>>> == square <<< dim3(1,1,1), dim3(64,1,1)>>>
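square<<< dim3(bx, by, bz), dim3(tx, ty, tz), shmem >>>(...)
-> dim3(bx, by, bz): grid of blocks; dim3(tx, ty, tz): block of threads; shmem: shared memory per block in bytes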