// Reducing: per-block parallel sum reduction kernel

/*
 * Per-block sum reduction performed directly on the input buffer.
 *
 * Each block sums the blockDim.x consecutive floats of d_in starting at
 * index blockIdx.x * blockDim.x and stores that partial sum in
 * d_out[blockIdx.x].
 *
 * Parameters:
 *   d_out  receives one partial sum per block (indexed by blockIdx.x).
 *   d_in   values to reduce; it is OVERWRITTEN with intermediate sums.
 *
 * Preconditions (not checked here; the kernel takes no length argument):
 *   - 1D launch; only the .x components of the indices are used.
 *   - blockDim.x should be a power of two: the stride is halved every pass,
 *     so with any other block size some elements are never added in.
 *   - d_in must hold at least gridDim.x * blockDim.x elements and d_out at
 *     least gridDim.x elements.
 *   - NOTE(review): d_out is presumably a separate buffer from d_in;
 *     confirm at the call site.
 */
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // Fold the upper half of the active range onto the lower half, halving
    // the active range on every pass until only element 0 of the block's
    // slice remains.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        // Kept outside the branch so every thread in the block reaches the
        // barrier; all additions of this pass finish before the next starts.
        __syncthreads();
    }

    // Thread 0's element now holds the block's total.
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}