Statistics

Data -> Statistics -> Decision

Data driven
social sciences, engineering, medicine, psychology, public policy, robotics, climatology, archaeology, finance, business, marketing, biology, physics

Statistics is universal, useful, and fun!

scatter plot
an outlier leaves no way to fit a line through all the points.
bar chart shows global trends
histogram shows frequency counts

plotting

# histplot is not defined anywhere in these notes, so it presumably arrives
# via this star import of the `plotting` module -- TODO confirm.
from plotting import *
# Ten sample observations to plot.
data = [3,4,2,4,3,5,3,6,4,3]
# NOTE(review): presumably draws a histogram of `data`; verify against the
# plotting module.
histplot(data)

z-table

standard normal probability
http://www.stat.ufl.edu/~athienit/Tables/Ztable.pdf

有意水準 α level α=0.05
z = 1.65
Z値とは、標準偏差の単位で観測統計量とその仮説母集団パラメータの差を測定するZ検定の統計量です。たとえば、工場の選択した鋳型グループの平均深さが10cm、標準偏差が1cmであるとします。深さ12cmの鋳型は、深さが平均より2標準偏差分大きいので、Z-値が2になります。

α = 0.05 z = +-1.96
α = 0.01 z = +-2.576
α = 0.001 z = +-3.291

平均を μ, 分散を σ²

x 標準平均

DataFrame apply()

import pandas as pd

# Scores for ten students: one row per student name (the index), one column
# per exam ('exam1', 'exam2').
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

# Disabled demo: change the guard to True to run it.
if False:
    def convert_grades_curve(exam_grades):
        """Convert numeric scores to letter grades on a curve.

        pd.qcut cuts at the 10th, 20th, 50th and 80th percentiles of
        ``exam_grades`` itself, so the lowest 10% of scores get 'F', the
        next 10% 'D', the next 30% 'C', the next 30% 'B' and the top 20% 'A'.
        """
        return pd.qcut(exam_grades,
                       [0, 0.1, 0.2, 0.5, 0.8, 1],
                       labels=['F', 'D', 'C', 'B', 'A'])

    # qcut operates on a single column (a Series) ...
    print(convert_grades_curve(grades_df['exam1']))

    # ... and DataFrame.apply() calls the function once per column.
    print(grades_df.apply(convert_grades_curve))

def standardize(df):
    """Unimplemented exercise stub: ignores ``df`` and always returns None.

    NOTE(review): presumably meant to return a standardized version of
    ``df``; the computation has not been written yet.
    """
    return None
import numpy as np
import pandas as pd

# Small three-column frame ('a', 'b', 'c') with five rows and the default
# integer index.
df = pd.DataFrame({
	'a': [4,5,3,1,2],
	'b': [20,10,40,50,30],
	'c': [25,20, 5, 15, 10]
	})

# Disabled demo: DataFrame.apply() calls the given function once per column,
# so each line prints one value per column.
if False:
    print(df.apply(np.mean))
    print(df.apply(np.max))

def second_largest(df):
    """Unimplemented exercise stub: ignores ``df`` and always returns None.

    NOTE(review): presumably meant to return the second-largest value of
    each column; the computation has not been written yet.
    """
    return None

Data frame vectorized

import pandas as pd

# Disabled demos of element-wise DataFrame addition; set a guard to True to
# run one. pandas aligns both operands on row and column labels before adding.

# Same index and same columns: matching cells are simply summed.
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
    print(df1 + df2)

# Partially overlapping column names: columns are matched by name, and a
# column present in only one frame ('a', 'd') comes out as NaN.
# Column 'c' holds integers in both frames so the shared columns can be added.
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
    print(df1 + df2)

# Partially overlapping row labels: rows are matched by label, and a row
# present in only one frame ('row1', 'row4') comes out as NaN.
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                       index=['row1', 'row2', 'row3'])
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                       index=['row4', 'row3', 'row2'])
    print(df1 + df2)

# Ten readings of two counters; within each column the values never decrease
# from one row to the next.
# NOTE(review): presumably cumulative turnstile entry/exit counts -- confirm.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
	})

def get_hourly_entries_and_exits(entries_and_exits):
    """Unimplemented exercise stub: ignores its argument, returns None.

    NOTE(review): presumably meant to turn the running totals in
    ``entries_and_exits`` into per-period differences; not written yet.
    """
    return None
import pandas as pd

# Disabled demo: DataFrame.applymap() calls the function on every individual
# element, so each cell of the printed frame is one larger than in `df`.
if False:
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [10, 20, 30],
        'c': [5, 10, 15],
    })

    def add_one(x):
        """Return ``x + 1``."""
        return x + 1

    print(df.applymap(add_one))

# Scores for ten students: one row per student name (the index), one column
# per exam ('exam1', 'exam2').
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def convert_grades(grades):
    """Unimplemented exercise stub: ignores ``grades``, returns None.

    NOTE(review): presumably meant to map numeric grades to letter grades;
    the conversion rules are not defined here.
    """
    return None

Calculating correlation

import pandas as pd

# NOTE(review): 'xxxx.csv' looks like a placeholder path -- point it at the
# real data file before running.
filename = 'xxxx.csv'
# Load the CSV into a DataFrame; the code below reads its 'ENTRIESn_hourly',
# 'meanprecipi' and 'meantempi' columns.
subway_df = pd.read_csv(filename)

def correlation(x, y):
    """Return Pearson's r between ``x`` and ``y``.

    Each input is converted to standard units (centred on its own mean and
    divided by its own population standard deviation, ``ddof=0``); the
    result is the mean of the element-wise product of the two.
    """
    def _standard_units(values):
        # ddof=0 selects the population (uncorrected) standard deviation.
        centred = values - values.mean()
        return centred / values.std(ddof=0)

    products = _standard_units(x) * _standard_units(y)
    return products.mean()

entries = subway_df['ENTRIESn_hourly']
# Read the 'ENTRIESn' counter column here, not 'meanprecipi': with the
# precipitation column this variable was identical to `rain`, so the final
# correlation merely repeated entries-vs-rain.
# NOTE(review): assumes the CSV has an 'ENTRIESn' column -- confirm.
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# Pairwise correlations between hourly entries, rain and temperature.
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))

# Hourly entries against the running entries counter.
print(correlation(entries, cum_entries))

Accessing Element

import pandas as pd

# Ten rows labelled '05-01-11' .. '05-10-11' by five columns 'R003' .. 'R007'.
# NOTE(review): presumably riders per station (column) per day (row) -- confirm.
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Disabled demos of building and indexing DataFrames; set a guard to True to
# run one.

# Construction: a dict maps column name -> column values, while a list of
# lists supplies rows and takes the column names separately.
if False:
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)

    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)

# Element access: iloc is positional, loc is label-based, and plain [] with a
# single name selects a column.
if False:
    print(ridership_df.iloc[0])
    print(ridership_df.loc['05-05-11'])
    print(ridership_df['R003'])
    print(ridership_df.iloc[1, 3])

# Positional slice of rows 1, 2 and 3.
if False:
    print(ridership_df.iloc[1:4])

# A list of names selects several columns at once.
if False:
    print(ridership_df[['R003', 'R005']])

# sum() totals each column, sum(axis=1) totals each row, and summing .values
# totals every cell of the underlying array.
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df.sum())
    print(df.sum(axis=1))
    print(df.values.sum())

def mean_riders_for_max_station(ridership):
    """Unimplemented exercise stub: returns ``(None, None)`` for any input.

    The tuple is ``(overall_mean, mean_for_max)``. Neither value is computed
    yet, and ``ridership`` is not read.
    """
    overall_mean = mean_for_max = None
    return overall_mean, mean_for_max

two dimensional numbers

import numpy as np

# Two-dimensional integer array with ten rows and five columns.
# NOTE(review): presumably riders per station (column) per day (row) -- confirm.
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# Disabled demos of indexing and arithmetic on 2-D NumPy arrays; set a guard
# to True to run one.

# Indexing is [row, column]: a single element, a 2x2 sub-array, a whole row.
if False:
    print(ridership[1, 3])
    print(ridership[1:3, 3:5])
    print(ridership[1, :])

# Vectorized addition of two rows, then of two columns.
if False:
    print(ridership[0, :] + ridership[1, :])
    print(ridership[:, 0] + ridership[:, 1])

# Adding two arrays of the same shape works element by element.
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print(a + b)

def mean_riders_for_max_station(ridership):
    """Unimplemented exercise stub: returns ``(None, None)`` for any input.

    The tuple is ``(overall_mean, mean_for_max)``. Neither value is computed
    yet, and ``ridership`` is not read.
    """
    overall_mean = mean_for_max = None
    return overall_mean, mean_for_max