__global__ void transpose_serial(float in[], float out[]) { for(int j=0; j < N; j++) for(int i=0; i < N; i++) out[j + i*N] = in[i + j*N]; } __global__ void transpose_parallel_per_row(float in[], float out[]) { int i = threadIdx.x; for(int j=0; j < N; j++) out[j + i*N] = in[i + j*N]; } __global__ void transpose_parallel_per_element(float in[], float[]) { int i = blockIdx.x * K + threadIdx.x; int j = blockIdx.y * K + threadIdx.y; out[j + i*N] = in[i + j*N]; }