March 2017 – Page 8 – ソフトウェアエンジニアの技術ブログ：Software engineer tech blog

Physically Random Events

Quantum Mechanics
Thermal Noise
Key proceed mouse move

Peseudo-Random Number Generator
counter 0, 1, 2 -> Encrypt
Pool of Randomness seed-> key

Storing a file securely

Electronic Codebook Mode

ciphertexts

BITS = ('0', '1')
ASCII_BITS = 7

def display_bits(b):
	"""converts list of {0, 1}* to string"""
	return ''.join([BITS[e] for e in b])

def seq_to_bits(seq):
	return [0 if b == '0' else 1 for b in seq]

def pad_bits(bits, pad):
	"""pads seq with leading 0s up to length pad"""
	assert len(bits) <= pad
	return &#91;0&#93; * (pad - len(bits)) + bits

def convert_to_bits(n):
	"""converts an integer 'n' to bit array"""
	result = &#91;&#93;
	if n == 0:
		return &#91;0&#93;
	while n > 0:
		result = [(n % 2)] + result
		n = n / 2
	return result

def string_to_bits(s):
	def chr_to_bit(c):
		return pad_bits(convert_to_bits(ord(c)), ASCII_BITS)
	return [b for group in
			map(chr_to_bit, s)
			for b  in group]

def bits_to_char(b):
	assert len(b) == ASCII_BITS
	value = 0
	for e in b:
		value = (value * 2) + e
	return chr(value)

def list_to_string(p):
	return ''.join(p)

def bits_to_string(b):
	return ''.join([bits_to_char(b[i:i + ASCII_BITS])
		for i in range(0, len(b), ASCII_BITS)])

Lorenz Cipher

Machine generates key sequence based on its configuration
key would not repeat for 10^19 letters

Probability Review

Event: subset of outcomes from a probability space
Head = { H }
Valid = { H, T }
P(A) = ΣweA P(w)

1 – P(E) = P(Valid)

Complementary Event Prbability
VA: P(a) + p(A) = 1

Conditional probability
P(B|A) = P(Aub)/ p(A)

Perfect Cipher
the ciphertext provides an attacker with no additional information about the plaintext.
m -> enc -> c
m e M
m* e M

Eki(mi) = c
P(m=m*).P(k=k*) = 1/k

P(Ek(m)= c) = 1/|k|
P(m=m* Ek(m)=c) = P(m = m*)/|k|

Malleable
alice Ek(m)=c active attacker

Impratical |k| = |m|

Shamon’s Theorem: if a cipher is perfect, it must be impractical

Perfect cipher: P(m=m*|Ek(m)=c) = P(m=m*)

One Time Pad

Ek(m) = c0 c1 … Cn-1
m = ‘CS’ = 1010011 1000011
k = 11001000100110
c = 01101111100101

Proving Security
here is a mathematical proof, accepted by experts, that shows the cipher is secure.

Probability(Review)
Ω:set of all possible outcomes
probability space
Ω = {h, t}
uniform distribution each outcome is equally likely
p: outcome -> [0, 1]
p(h) = 1/2 p(t) = 1/2
Ω = {H, T, E}
p(h) = 0.49999
p(t) = 0.49999
ΣweΩ　p(w) = 1

Symmetric Crytosystems

plaintext -> encryption -> ciphertext -> insecure channel -> decryptron
VmeM:D(E(m))=m
Kerck hoff’s Principle CIS

meM ceC keK
E:MxK -> c D:c*k -> M
correctness property: Vm;k: Dk(Ek(m))

Security property:
Ideal: ciphertext reveals nothing about key or message

def convert_to_bits(n, pad):
	result = []
	while n > 0:
		if n % 2 == 0:
			result = [0] + result
		else:
			result = [1] + result
		n = n / 2
	while len(result) < pad:
		result = [0] + result
	return result

applied cryptography

cryptography
crypto -> secret
graphy -> writing
cryptology

symmetric cryptography and applications
asymmetric cryptography and applications

m -> enc(<-k) -> ciphertext
timing side-channel

Programming Model

__global__ void Hello(){
	printf("Hello");
}

void main(){
	Hello<<<1,1>>>();
	cudaDeviceSynchronize();
	printf("World");
}

CUB

$define CUB_STDERR

#include <stdio.h>
#include <iostream>

#include <cub/cub.cuh>

using namespace cub;

bool g_verbose = false;
int g_iterations = 100;

// ---------
// Kernels
// ---------

template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
__global__ void BlockPrefixSumKernel(
	int *d_in,
	int *d_out,
	clock_t *d_elapsed)
{
	typedef BlockScan<int, BLOCK_THREADS > BlockScanT;

	__shared__ typename BlockScanT::SmemStorage smem_storage;

	int data[ITEMS_PER_THREAD];
	BlockLoadVectorized(d_in, data);

	clock_t start = clock();

	int aggregate;
	BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);

	clock_t stop = clock();

	BlockStoreVectorized(d_out, data);

	if(threadIdx.x == 0)
	{
		*d_elapsed = (start > stop)? start - stop : stop - start;
		d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
	}
}

int Initialize(
	int *h_in,
	int *h_reference,
	int num_elements)
{
	int inclusive = 0;

	for (int i = 0; i< num_elements; ++i)
	{
		h_in&#91;i&#93; = i % 17;

		h_reference&#91;i&#93; = inclusive;
		inclusive += h_in&#91;i&#93;;
	}
	return inclusive;
}

template <
	int BLOCK_THREADS,
	int ITEMS_PER_THREAD>
void Test()
{
	const int TILE_SIZE = BLOCJ_THREAD * ITEMS_PER_THREAD;

	int *h_in = new int[TILE_SIZE];
	int *h_reference = new int[TILE_SIZE];
	int *h_gpu = new int[TILE_SIZE + 1];

	int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

	int *d_in = NULL;
	int *d_out = NULL;
	clock_t *d_elapsed = NULL;
	cudaMalloc((void**)&d_in, sizeof(int)* TILE_SIZE);
	cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
	cudaMalloc((void**)&d_elapsed, sizeof(clock_t));

	if (g_verbose)
	{
		printf("Input data: ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d, ", h_in&#91;i&#93;);
		printf("\n\n");
	}

	cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

	printf("BlockScan %d items (%d threads, %d items per thread):",
		TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);

	clock_t elapsed_scan_clocks = 0;
	for (int i = 0; i < g_iterations; ++i)
	{
		BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
			d_in,
			d_otu,
			d_elapsed);
		clock_t scan_clocks;
		cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
		cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
		elapsed_scan_clocks += scan_clocks;
	}

	bool correct = true;
	for (int i = 0; i < TILE_SIZE; i++)
	{
		if (h_gpu&#91;i&#93; != h_reference&#91;i&#93;)
		{
			printf("Incorrect result @ offset %d (%d != %d)\n",
				i, h_gpu&#91;i&#93;, h_reference&#91;i&#93;);
			correct = false;
			break;
		}
	}

	if (h_gpu&#91;TILE_SIZE&#93; != h_aggregate)
	{
		printf("Incorrect aggregate (%d != %d)\n", h_gpu&#91;TILE_SIZE&#93;, h_aggregate);
		correct = false;
	}
	if (correct) printf("Correct!\n");

	if (g_verbose)
	{
		printf("GPu output(reference output): ");
		for (int i = 0; i < TILE_SIZE; i++)
			printf("%d (%d), ", h_gpu&#91;i&#93;, h_reference&#91;i&#93;);
		printf("\n");
		printf("GPU aggregate (reference aggregate)", h_gpu&#91;TILE_SIZE&#93;, h_aggregate);
		printf("\n\n");
	}

	printf("Average clock per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);

	if (h_in) delete&#91;&#93; h_in;
	if (h_reference) delete&#91;&#93; h_reference;
	if (h_gpu) delete&#91;&#93; h_gpu;
	if (d_in) cudaFree(d_in);
	if (d_out) cudaFree(d_out);
	if (d_elapsed) cudaFree(d_elapsed);
}

int main(int argc, char** argv)
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);
	printf("Using device %s\n", props.name);

	Test<1024, 1>();
	Test<512, 2>();
	Test<256, 4>();
	Test<128, 8>();
	Test<64, 16>();
	Test<32, 32>();
	Test<16, 64>();

	return 0;
}