The machine generates its key sequence from its configuration
The key sequence would not repeat for 10^19 letters
Probability Review
Event: subset of outcomes from a probability space
Head = { H }
Valid = { H, T }
P(A) = Σ_{w ∈ A} P(w)
P(Valid) = 1 - P(E)
Complementary Event Probability
∀A: P(A) + P(¬A) = 1
Conditional Probability
P(B|A) = P(A ∩ B) / P(A)
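As a small worked example of these formulas (a sketch in Python, not from the lecture, using the Head and Valid events above and the three-outcome coin space that appears later in these notes):

# Coin that can also land on its edge: Head = {H}, Valid = {H, T}
p = {'H': 0.49999, 'T': 0.49999, 'E': 0.00002}

def prob(event):
    # P(A) = sum of p(w) over the outcomes w in A
    return sum(p[w] for w in event)

head, valid = {'H'}, {'H', 'T'}
print(prob(valid))                       # P(Valid) = 1 - P({E}) = 0.99998
print(prob(head & valid) / prob(valid))  # P(Head | Valid) = P(Head ∩ Valid) / P(Valid) ≈ 0.5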
Perfect Cipher
The ciphertext provides an attacker with no additional information about the plaintext.
m -> enc -> c
m ∈ M (the message actually sent), m* ∈ M (any candidate message)
Ek*(m*) = c for some key k*
P(m = m*) · P(k = k*) = P(m = m*) / |K|   (keys are uniform, so P(k = k*) = 1/|K|)
P(Ek(m) = c) = 1/|K|
P(m = m* ∧ Ek(m) = c) = P(m = m*) / |K|
Malleable: Alice sends Ek(m) = c; an active attacker can modify c in transit and predictably change what it decrypts to
Impractical: |K| ≥ |M| (at least as many keys as messages)
Shannon's Theorem: if a cipher is perfect, it must be impractical
Perfect cipher: P(m = m* | Ek(m) = c) = P(m = m*)
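To make the definition concrete, here is a small Python sketch (not from the lecture) that exhaustively checks the perfect-cipher condition for a 3-bit one-time-pad-style cipher, using a non-uniform message distribution and a uniformly chosen key:

from itertools import product

bits = 3
messages = list(range(2 ** bits))
keys = list(range(2 ** bits))

# Arbitrary non-uniform prior over messages; keys are uniform.
prior = {m: m + 1 for m in messages}
total = sum(prior.values())
prior = {m: w / total for m, w in prior.items()}

# Joint distribution over (message, ciphertext), with c = m XOR k.
joint = {}
for m, k in product(messages, keys):
    c = m ^ k
    joint[(m, c)] = joint.get((m, c), 0.0) + prior[m] / len(keys)

# For every ciphertext c, the posterior over messages equals the prior.
for c in range(2 ** bits):
    p_c = sum(joint.get((m, c), 0.0) for m in messages)
    for m in messages:
        assert abs(joint.get((m, c), 0.0) / p_c - prior[m]) < 1e-12
print("P(m = m* | Ek(m) = c) = P(m = m*) for every m* and c")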
One Time Pad
Ek(m) = c0 c1 … cn-1, where ci = mi ⊕ ki
m = ‘CS’ = 1010011 1000011
k = 11001000100110
c = 01101111100101
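A minimal Python sketch of the one-time pad (not the course's code), using 7-bit ASCII like the m = 'CS' example above; decryption is the same XOR, since (m ⊕ k) ⊕ k = m:

import random

def to_bits(s):
    # 7 bits per character, most significant bit first
    return [int(b) for ch in s for b in format(ord(ch), '07b')]

def xor_bits(a, b):
    assert len(a) == len(b)                 # the key must be as long as the message
    return [x ^ y for x, y in zip(a, b)]

m = to_bits('CS')
k = [random.randint(0, 1) for _ in m]       # fresh uniform key, used only once
c = xor_bits(m, k)                          # encrypt
assert xor_bits(c, k) == m                  # decrypting with the same key recovers m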
Proving Security
There is a mathematical proof, accepted by experts, that shows the cipher is secure.
Probability (Review)
Ω: set of all possible outcomes
probability space
Ω = {h, t}
Uniform distribution: each outcome is equally likely
p: outcome -> [0, 1]
p(h) = 1/2, p(t) = 1/2
Ω = {H, T, E} (E: the coin lands on its edge)
p(h) = 0.49999
p(t) = 0.49999
p(e) = 0.00002
Σ_{w ∈ Ω} p(w) = 1
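A tiny sketch (not from the lecture) of building a uniform distribution over a probability space and checking that the probabilities sum to 1:

def uniform(omega):
    # Uniform distribution: each outcome gets probability 1/|Omega|
    return {w: 1 / len(omega) for w in omega}

p = uniform({'h', 't'})
assert p['h'] == p['t'] == 0.5
assert abs(sum(p.values()) - 1.0) < 1e-12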
Symmetric Cryptosystems
plaintext -> encryption -> ciphertext -> insecure channel -> decryption -> plaintext
∀m ∈ M: D(E(m)) = m
Kerckhoffs's Principle: the security of the cipher must depend only on the secrecy of the key, not on the secrecy of the algorithm
m ∈ M, c ∈ C, k ∈ K
E: M × K -> C, D: C × K -> M
Correctness property: ∀m, k: Dk(Ek(m)) = m (checked in the sketch after this block)
Security property:
Ideal: ciphertext reveals nothing about key or message
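A minimal sketch (not the course's code) that checks the correctness property for a toy shift cipher with M = C = K = {0, …, 25}:

def E(m, k):
    return (m + k) % 26     # E: M x K -> C

def D(c, k):
    return (c - k) % 26     # D: C x K -> M

# Correctness: decrypting with the same key recovers every message
assert all(D(E(m, k), k) == m for m in range(26) for k in range(26))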
def convert_to_bits(n, pad):
    # Convert a non-negative integer n into a list of bits, left-padded with zeros to length pad.
    result = []
    while n > 0:
        if n % 2 == 0:
            result = [0] + result
        else:
            result = [1] + result
        n = n // 2          # integer division (the original n / 2 assumed Python 2)
    while len(result) < pad:
        result = [0] + result
    return result
applied cryptography
cryptography
crypto -> secret
graphy -> writing
cryptology
symmetric cryptography and applications
asymmetric cryptography and applications
m -> enc (with key k) -> ciphertext
Timing side-channel: how long an operation takes can leak information about secret data (see the sketch below)
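A Python sketch (not from the lecture) of a classic timing side-channel: an early-exit comparison whose running time depends on how many leading characters of a guess are correct, plus a constant-time variant:

def insecure_compare(secret, guess):
    if len(secret) != len(guess):
        return False
    for a, b in zip(secret, guess):
        if a != b:              # early exit leaks the position of the first mismatch
            return False
    return True

def constant_time_compare(secret, guess):
    if len(secret) != len(guess):
        return False
    diff = 0
    for a, b in zip(secret, guess):
        diff |= ord(a) ^ ord(b) # accumulate differences instead of exiting early
    return diff == 0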
Programming Model
#include <cstdio>

__global__ void Hello() {       // runs on the GPU
    printf("Hello");
}

int main() {
    Hello<<<1, 1>>>();          // launch 1 block of 1 thread
    cudaDeviceSynchronize();    // wait for the kernel (and its printf) to finish
    printf("World\n");
    return 0;
}
CUB
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <cub/cub.cuh>
using namespace cub;
bool g_verbose = false;
int g_iterations = 100;
// ---------
// Kernels
// ---------
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
int *d_in,
int *d_out,
clock_t *d_elapsed)
{
typedef BlockScan<int, BLOCK_THREADS > BlockScanT;
__shared__ typename BlockScanT::SmemStorage smem_storage;
int data[ITEMS_PER_THREAD];
BlockLoadVectorized(d_in, data);
clock_t start = clock();
int aggregate;
BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
clock_t stop = clock();
BlockStoreVectorized(d_out, data);
if(threadIdx.x == 0)
{
*d_elapsed = (start > stop)? start - stop : stop - start;
d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
}
}
int Initialize(
int *h_in,
int *h_reference,
int num_elements)
{
int inclusive = 0;
for (int i = 0; i< num_elements; ++i)
{
h_in[i] = i % 17;
h_reference[i] = inclusive;
inclusive += h_in[i];
}
return inclusive;
}
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void Test()
{
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
int *h_in = new int[TILE_SIZE];
int *h_reference = new int[TILE_SIZE];
int *h_gpu = new int[TILE_SIZE + 1];
int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
int *d_in = NULL;
int *d_out = NULL;
clock_t *d_elapsed = NULL;
cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE);
cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
cudaMalloc((void**)&d_elapsed, sizeof(clock_t));
if (g_verbose)
{
printf("Input data: ");
for (int i = 0; i < TILE_SIZE; i++)
printf("%d, ", h_in[i]);
printf("\n\n");
}
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
printf("BlockScan %d items (%d threads, %d items per thread):",
TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);
clock_t elapsed_scan_clocks = 0;
for (int i = 0; i < g_iterations; ++i)
{
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
clock_t scan_clocks;
cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
elapsed_scan_clocks += scan_clocks;
}
bool correct = true;
for (int i = 0; i < TILE_SIZE; i++)
{
if (h_gpu[i] != h_reference[i])
{
printf("Incorrect result @ offset %d (%d != %d)\n",
i, h_gpu[i], h_reference[i]);
correct = false;
break;
}
}
if (h_gpu[TILE_SIZE] != h_aggregate)
{
printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
correct = false;
}
if (correct) printf("Correct!\n");
if (g_verbose)
{
printf("GPu output(reference output): ");
for (int i = 0; i < TILE_SIZE; i++)
printf("%d (%d), ", h_gpu[i], h_reference[i]);
printf("\n");
printf("GPU aggregate (reference aggregate)", h_gpu[TILE_SIZE], h_aggregate);
printf("\n\n");
}
printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (h_gpu) delete[] h_gpu;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
if (d_elapsed) cudaFree(d_elapsed);
}
int main(int argc, char** argv)
{
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
printf("Using device %s\n", props.name);
Test<1024, 1>();
Test<512, 2>();
Test<256, 4>();
Test<128, 8>();
Test<64, 16>();
Test<32, 32>();
Test<16, 64>();
return 0;
}
Timing sort in Thrust
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include "gputimer.h"
int main(void)
{
// generate N random numbers serially
int N = 1000000;
thrust::host_vector<float> h_vec(N);
std::generate(h_vec.begin(), h_vec.end(), rand);
// transfer data to the device
thrust::device_vector<float> d_vec = h_vec;
// sort data on the device (846M keys per second on GeForce GTX 480)
GpuTimer timer;
timer.Start();
thrust::sort(d_vec.begin(), d_vec.end());
timer.Stop();
// transfer data back to host
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
return 0;
}
Quadratic GPU vs. Serial CPU (BFS), see the frontier sketch below
GPU: N^2 work; visits every edge many times but sets each vertex's depth only once
CPU: N work; maintains a frontier to minimize visits per node
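A Python sketch (not the course's code) of the frontier-based serial approach described above; the names bfs_depths and adj are illustrative, and adj is assumed to be an adjacency list mapping each vertex to its neighbors:

from collections import deque

def bfs_depths(adj, source):
    depth = {v: -1 for v in adj}          # -1 means "not yet visited"
    depth[source] = 0
    frontier = deque([source])
    while frontier:
        v = frontier.popleft()
        for w in adj[v]:
            if depth[w] == -1:            # each vertex is visited exactly once
                depth[w] = depth[v] + 1
                frontier.append(w)
    return depth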
// SAXPY with the legacy cuBLAS API: d_y = 2.0 * d_x + d_y
// (x, y are host arrays; d_x, d_y are device arrays)
int N = 1 << 20;
cublasInit();
cublasAlloc(N, sizeof(float), (void**)&d_x);
cublasAlloc(N, sizeof(float), (void**)&d_y);
cublasSetVector(N, sizeof(x[0]), x, 1, d_x, 1);
cublasSetVector(N, sizeof(y[0]), y, 1, d_y, 1);
cublasSaxpy(N, 2.0, d_x, 1, d_y, 1);
cublasGetVector(N, sizeof(y[0]), d_y, 1, y, 1);
cublasFree(d_x);
cublasFree(d_y);
cublasShutdown();
Do we have to worry about race conditions?
// Version 1: the device-side done flag is never reset between iterations
while (!h_done) {
    bfs(edges, vertices);
    cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}

// Version 2: reset the flag to true before each pass; the bfs kernel
// sets it back to false whenever it updates a vertex
while (!h_done) {
    cudaMemcpy(&d_done, &h_true, sizeof(bool), cudaMemcpyHostToDevice);
    bfs(edges, vertices);
    cudaMemcpy(&h_done, &d_done, sizeof(bool), cudaMemcpyDeviceToHost);
}
// vfirst / vsecond hold the stored depths of an edge's two endpoints
// (first / second are assumed to be the endpoint indices; -1 means unvisited)
if ((vfirst != -1) && (vsecond == -1)) {
    vertices[second] = vfirst + 1;    // visit the second endpoint
    done = false;
}
if ((vfirst == -1) && (vsecond != -1)) {
    vertices[first] = vsecond + 1;    // visit the first endpoint
    done = false;
}