Thread per row

__global__ void
spmv_csr_scalar_kernel(
	const int num_rows, const int * rowptr,
	const int * index, const float * value,
	const float * x, float * y){
	int row = blockDim.x * blockIdx.x +
	threadIdx.x;
	if (row < num_rows){
		int row_start = rowptr[row];
		int row_end = rowptr[row+1];
		for (int jj = row_start;
			jj < row_end ; jj++)
			dot += value[jj] * x[index[jj]];
		y[row] += dot;
	}
}
	)