Quadratic GPU vs Serial CPU

Quadratic GPU: O(N^2) work; visits every edge many times, but sets each node's depth only once
Serial CPU: O(N) work; maintains a frontier to minimize visits per node

/*
 * SAXPY (y = 2*x + y) on the GPU using the legacy cuBLAS helper API.
 * Assumes x and y are host float arrays of at least N elements and
 * d_x, d_y are float pointers declared elsewhere -- TODO confirm.
 */
int N = 1 << 20;                                  /* 1,048,576 elements */
cublasInit();

/* Allocate one device vector per operand. */
cublasAlloc(N, sizeof(float), (void **)&d_x);
cublasAlloc(N, sizeof(float), (void **)&d_y);

/* Upload both operands; host and device strides are 1 (contiguous). */
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);

/* Both operands must be device pointers; the result overwrites d_y. */
cublasSaxpy(N, 2.0f, d_x, 1, d_y, 1);

/* Download the result from d_y back into the host array y. */
cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);

cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();