Hypothesis Testing

sample -> statistic -> information -> statistic -> decision

weight loss 90% guaranteed
Yes 11, No 4
H0: p = 0.9
H1: p < 0.9
critical regions
Null hypothesis

[python]
from math import sqrt

def mean(l):
    return float(sum(l))/len(l)

def var(l):
    m = mean(l)
    return sum([(x-m)**2 for x in l])/len(l)

def factor(l):
    return 1.96

def conf(l):
    return factor(l) * sqrt(var(l)/ len(l))

def test(l, h):
    m = mean(l)
    c = conf(l)
    return abs(h-m) <= c

l = [199, 200, 201, 202, 203, 204]
print mean(l)
print conf(l)
[/python]

95% confidence
candidate A 55
candidate B 45
1.96·√(p(1-p)/n) = 1.96·√(0.55·0.45/100) ≈ 0.0975, i.e. 9.75 percentage points
Candidate A: 55% +/- 9.75%

Estimation probability

Confidence Intervals

60% for party A, ±3% -> confidence interval (57%, 63%).
Most often the confidence interval is quoted at the 95% confidence level.

Suppose we increase the sample size N: the size of the CI shrinks.

P=0.5, μ=0.5, σ^2=0.25
mean(ΣXi), Var(ΣXi), Var(1/nΣXi), std dev, CI
n=1, 0.5, 0.25, 0.25, 0.5, 0.98
n=2, 1, 0.5, 0.125, 0.35, 0.69
n=10, 5, 2.5, 0.025, 0.16, 0.31
1.96 magic number

π 3.14
e 2.718

calculate mean

# remove outliers
# extract data between lower and upper quartile

# fit Gaussian using MLE

# compute x that corresponds to standard score z
return x

import random
from math import sqrt

def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    ``data`` is a non-empty sequence of numbers; an empty sequence raises
    ZeroDivisionError.
    """
    # float() forces true division under Python 2 even when every item is
    # an int (plain sum/len would floor the result there).
    return float(sum(data)) / len(data)

def variance(data):
    """Return the population variance of ``data``.

    The squared deviations from ``mean(data)`` are summed and divided by
    ``len(data)`` (not ``len(data) - 1``).
    """
    center = mean(data)
    squared_deviations = [(value - center) ** 2 for value in data]
    return sum(squared_deviations) / len(data)

def stddev(data):
    """Return the standard deviation of ``data``: the square root of ``variance(data)``."""
    sigma_squared = variance(data)
    return sqrt(sigma_squared)

# Sample weights. The list contains extreme entries (1.00E+22, -1.00E+22),
# so the plain mean printed below is dominated by them.
weight=[80.,85,200,85,69,65,68,66,85,72,85,82,65,105,75,80,
    70,74,72,70,80,60,80,75,80,78,63,88.65,90,89,91,1.00E+22,
    75,75,90,80,75,-1.00E+22,-1.00E+22,-1.00E+22,86.54,67,70,92,70,76,81,93,
    70,85,75,76,79,89,80,73.6,80,80,120,80,70,110,65,80,
    250,80,85,81,80,85,80,90,85,85,82,83,80,160,75,75,
    80,85,90,80,89,70,90,100,70,80,77,95,120,250,60]

# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(mean(weight))

def calculate_weight(data, z):
    """Return the value lying ``z`` standard deviations from the trimmed mean.

    The data is sorted, only the entries between the lower and upper
    quartile positions are kept, and the mean and standard deviation of
    that middle slice are used to compute ``mu + z * sigma``.

    data -- sequence of numbers (left unmodified)
    z    -- standard score to convert back into a data value
    """
    # sorted() works on a copy; the original data.sort() reordered the
    # caller's list as a side effect.
    ordered = sorted(data)

    # Floor division keeps the indices integral under true division too.
    lowerq = (len(ordered) - 3) // 4
    upperq = lowerq * 3 + 3
    # Indexing (rather than slicing) keeps the original behaviour for very
    # short inputs, where lowerq is negative.
    middle = [ordered[i] for i in range(lowerq, upperq)]

    mu = mean(middle)
    sigma = stddev(middle)

    return mu + z * sigma

# Value two standard deviations below the trimmed mean.
# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(calculate_weight(weight, -2.))

central limit theorem

coin: (0,1)  P(ΣXi = k) = n!/((n-k)!·k!) · (1/2)^n   (fair coin; n!/((n-k)!·k!) is the number of ways to get k heads in n flips)
Pascal Triangle

flip a coin 1000 times
mean
standard deviation

import random
from math import sqrt

def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    The sum is converted to float before dividing, so integer or boolean
    inputs still produce a fractional result.
    """
    total = float(sum(data))
    count = len(data)
    return total / count

def variance(data):
    """Return the population variance of ``data`` (divides by ``len(data)``).

    Each item is converted to float before its deviation from
    ``mean(data)`` is squared.
    """
    center = mean(data)
    squared_deviations = [(float(item) - center) ** 2 for item in data]
    return sum(squared_deviations) / len(data)

def stddev(data):
    """Return the standard deviation of ``data`` as ``sqrt(variance(data))``."""
    spread = variance(data)
    return sqrt(spread)

def flip(N):
    """Return a list of ``N`` booleans, one per simulated coin flip.

    Each entry is ``random.random() > 0.5``.
    """
    outcomes = []
    for _ in range(N):
        outcomes.append(random.random() > 0.5)
    return outcomes

# Flip the simulated coin N times and report the mean and spread of the outcomes.
N = 1000
f = flip(N)

# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(mean(f))
print(stddev(f))

Here comes standard deviation

from math import sqrt

# Ten sample measurements; their standard deviation is printed at the end of this snippet.
data3=[13.04, 1.32, 22.65, 17.44, 29.54, 23.22, 17.65, 10.12, 26.73, 16.43]


def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    ``data`` is a non-empty sequence of numbers; an empty sequence raises
    ZeroDivisionError.
    """
    # float() forces true division under Python 2 even when every item is
    # an int (plain sum/len would floor the result there).
    return float(sum(data)) / len(data)
def variance(data):
    """Return the population variance: the mean of the squared deviations from ``mean(data)``."""
    center = mean(data)
    squared_deviations = [(value - center) ** 2 for value in data]
    return mean(squared_deviations)
def stddev(data):
    """Return the standard deviation of ``data``, i.e. the square root of its variance."""
    return sqrt(variance(data))

# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(stddev(data3))

standard score (偏差値) = (data - mean) / standard deviation

x1, …, xi, …, xn
mean:5
variance:16
standard deviation:4
xi:9

multiply by 1.5

standard score:(9-5)/4 = 1
μ:7.5
σ:6
σ^2:36
yi:13.5
z:1

correction factor(補正率)

incremental mean

from __future__ import division

def mean(oldmean, n, x):
    """Return the updated mean after one more value is added.

    oldmean -- mean of the values seen so far
    n       -- how many values that mean covers
    x       -- the new value
    """
    running_total = oldmean * n + x
    new_count = n + 1
    return running_total / new_count

# Example: five values averaging 10, then a sixth value of 4 arrives.
currentmean = 10
currentcount = 5
new = 4

# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(mean(currentmean, currentcount, new))

def likelihood(dist, data):
    """Return the probability of ``data`` as independent draws from ``dist``.

    dist -- mapping from each possible outcome to its probability
    data -- iterable of observed outcomes (a string of one-character
            outcomes or a list of outcome names)

    Returns the product of ``dist[outcome]`` over every observation, which
    is 1 for empty data. Raises KeyError if an observation is not a key of
    ``dist``.
    """
    result = 1
    for outcome in data:
        # Accumulate in place; a bare ``result * dist[outcome]`` would
        # discard the product and leave the result stuck at 1.
        result *= dist[outcome]
    return result

# Each entry is ((dist, data), expected_likelihood).
# Plain ASCII quotes are required: typographic quotes are not valid string delimiters.
tests = [
    (({'A': 0.2, 'B': 0.2, 'C': 0.2, 'D': 0.2, 'E': 0.2}, 'ABCEDDECAB'), 1.024e-07),
    (({'Good': 0.6, 'Bad': 0.2, 'Indifferent': 0.2},
      ['Good', 'Bad', 'Indifferent', 'Good', 'Good', 'Bad']), 0.001728),
    (({'Z': 0.6, 'X': 0.333, 'Y': 0.067}, 'ZXYYZXYXYZY'), 1.07686302456e-08),
    (({'Z': 0.6, 'X': 0.233, 'Y': 0.067, 'W': 0.1}, 'WXYZYZZZZW'), 8.133206112e-07),
]

# Compare each computed likelihood with its expected value, allowing 1% relative error.
for t, l in tests:
    if abs(likelihood(*t) / l - 1) < 0.01:
        print('Correct')
    else:
        print('Incorrect')

variance algorithm

data2 = []
def variance

return

print variance(data2)

mean =
data = []
ndata = data - mu
ndata = []
ndata.append()

# Sample data: ten float measurements.
data3=[13.04, 1.32, 22.65, 17.44, 29.54, 23.22, 17.65, 10.12, 26.73, 16.43]
def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    ``data`` is a non-empty sequence of numbers; an empty sequence raises
    ZeroDivisionError.
    """
    # float() forces true division under Python 2 even when every item is
    # an int (plain sum/len would floor the result there).
    return float(sum(data)) / len(data)
def variance(data):
    """Return the population variance of ``data``, built up step by step.

    Collects the squared deviation of every item from ``mean(data)`` and
    returns the mean of those squared deviations.
    """
    center = mean(data)
    squared_deviations = []
    for value in data:
        deviation = value - center
        squared_deviations.append(deviation ** 2)
    return mean(squared_deviations)

another simple pattern comes here

# Sample data: ten float measurements.
data3=[13.04, 1.32, 22.65, 17.44, 29.54, 23.22, 17.65, 10.12, 26.73, 16.43]
def mean(data):
    """Return the arithmetic mean of ``data`` as a float.

    ``data`` is a non-empty sequence of numbers; an empty sequence raises
    ZeroDivisionError.
    """
    # float() forces true division under Python 2 even when every item is
    # an int (plain sum/len would floor the result there).
    return float(sum(data)) / len(data)
def variance(data):
    """Return the population variance: the mean squared deviation from ``mean(data)``."""
    average = mean(data)
    deviations_squared = [(item - average) ** 2 for item in data]
    return mean(deviations_squared)

# Use data3, the sample defined at the top of this snippet; data2 is the
# empty placeholder list from the earlier skeleton and would divide by zero.
# Parenthesised single-argument print parses under both Python 2 and Python 3.
print(variance(data3))