import pandas as pd

filename = 'xxxx.csv'
subway_df = pd.read_csv(filename)


def correlation(x, y):
    """Return the correlation (Pearson's r) between *x* and *y*.

    Each input is standardized (mean subtracted, divided by its population
    standard deviation, ``ddof=0``); the result is the mean of the
    element-wise product of the two standardized inputs.

    x, y: objects providing ``.mean()`` and ``.std(ddof=0)`` and supporting
    element-wise arithmetic (the script passes pandas Series).
    """
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean()


entries = subway_df['ENTRIESn_hourly']
# Fixed: this used to read 'meanprecipi', the same column as `rain`, so the
# last correlation below merely repeated the first one.
# NOTE(review): 'ENTRIESn' is assumed to be the cumulative-entries column of
# the CSV -- confirm against the file's header.
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# print(...) with a single argument behaves the same on Python 2 and 3.
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))
Accessing Element
import pandas as pd

# Ridership table: rows are labelled with date strings, columns with R00x codes.
ridership_df = pd.DataFrame(
    data=[
        [0, 0, 2, 5, 0],
        [1478, 3877, 3674, 2328, 2539],
        [1613, 4088, 3991, 6461, 2691],
        [1560, 3392, 3826, 4787, 2613],
        [1608, 4802, 3932, 4477, 2705],
        [1576, 3933, 3909, 4979, 2685],
        [95, 229, 255, 496, 201],
        [2, 0, 1, 27, 0],
        [1438, 3785, 3589, 4174, 2215],
        [1342, 4043, 4009, 4665, 3033],
    ],
    index=[
        '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
        '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11',
    ],
    columns=['R003', 'R004', 'R005', 'R006', 'R007'],
)

# The demos below are disabled; flip a guard to True to run one.

# Building a DataFrame from a dict of columns, or from a list of rows.
if False:
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)

    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)

# Selecting a row by position, a row by label, a column, and a single cell.
if False:
    print(ridership_df.iloc[0])
    print(ridership_df.loc['05-05-11'])
    print(ridership_df['R003'])
    print(ridership_df.iloc[1, 3])

# Selecting several rows by position.
if False:
    print(ridership_df.iloc[1:4])

# Selecting several columns by label.
if False:
    print(ridership_df[['R003', 'R005']])

# Sums per column, per row, and over every value.
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df.sum())
    print(df.sum(axis=1))
    print(df.values.sum())
def mean_riders_for_max_station(ridership):
    """Return ``(overall_mean, mean_for_max)`` for a ridership DataFrame.

    ridership: pandas DataFrame with one column per station and one row per
        day.

    Returns a tuple of:
      overall_mean -- mean of every value in the table.
      mean_for_max -- mean of the column whose value in the first row is the
                      largest (ties resolve to the first such column).

    Previously this was an unimplemented stub returning ``(None, None)``.
    """
    # NOTE(review): "max station" is taken to mean the station with the
    # highest ridership on the first day (row 0) -- confirm against the
    # exercise statement.
    max_station = ridership.iloc[0].idxmax()

    overall_mean = ridership.values.mean()
    mean_for_max = ridership[max_station].mean()
    return (overall_mean, mean_for_max)
two dimensional numbers
import numpy as np

# Same ridership figures as a plain 2-D array (10 rows x 5 columns).
ridership = np.array([
    [0, 0, 2, 5, 0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [95, 229, 255, 496, 201],
    [2, 0, 1, 27, 0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033],
])

# The demos below are disabled; flip a guard to True to run one.

# Indexing: a single element, a sub-block, and one whole row.
if False:
    print(ridership[1, 3])
    print(ridership[1:3, 3:5])
    print(ridership[1, :])

# Element-wise addition of two rows, then of two columns.
if False:
    print(ridership[0, :] + ridership[1, :])
    print(ridership[:, 0] + ridership[:, 1])

# Element-wise addition of two whole 2-D arrays.
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print(a + b)
def mean_riders_for_max_station(ridership):
    """Return ``(overall_mean, mean_for_max)`` for a 2-D ridership array.

    ridership: 2-D NumPy array with one column per station and one row per
        day.

    Returns a tuple of:
      overall_mean -- mean of every element of the array.
      mean_for_max -- mean of the column whose value in the first row is the
                      largest (ties resolve to the first such column).

    Previously this was an unimplemented stub returning ``(None, None)``.
    """
    # NOTE(review): "max station" is taken to mean the station with the
    # highest ridership on the first day (row 0) -- confirm against the
    # exercise statement.
    max_station = ridership[0, :].argmax()

    overall_mean = ridership.mean()
    mean_for_max = ridership[:, max_station].mean()
    return (overall_mean, mean_for_max)
Pandas Series apply()
import pandas as pd

# Disabled demo: Series.apply calls the function once per element.
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

# Sample "First Last" names used by the name-reversal exercise.
names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane',
])
def reverse_names(names):
    """Return a Series with each "First Last" name rewritten as "Last, First".

    names: pandas Series of strings, each holding a first and a last name
        separated by a single space.

    Returns a new Series with the same index; *names* is not modified.

    Raises IndexError (from the per-element helper) if an element contains
    no space.

    The original body referred to an undefined ``name`` and never iterated
    over the Series; the per-element logic now lives in a helper that is
    applied to every element.
    """
    def reverse_name(name):
        # Only the first two space-separated parts are used, as before.
        split_name = name.split(" ")
        first_name = split_name[0]
        last_name = split_name[1]
        return last_name + ', ' + first_name

    return names.apply(reverse_name)
Vectorized Operations
vector 123 * scalar 3 = 123123123, 369, or an error
these are reasonable answers
More vectorized operation
math operation
add, subtract, multiply, divide, exponentiate
Logical operations
&, |, ~
Comparison operations
>, >=, <, <=, ==, !=
import numpy as np

# The demos below are disabled; flip a guard to True to run one.
# Fixes relative to the pasted original: the subtraction operator was a
# typographic en dash instead of '-', and the second guard lacked its colon.

# Arithmetic between two arrays is applied element by element.
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic between an array and a scalar applies the scalar to each element.
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Comparisons between two arrays yield an array of booleans.
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)
code snippet
import numpy as np

a = np.array([1, 2, 3, 4])
# b is a second name for the same array object, not a copy.
b = a
# In-place add: modifies the array that both a and b refer to.
a += np.array([1, 1, 1, 1])
print(b)
+= operates in-place while + does not
# Fixed: the alias was written "pandas" while every use below is "pd".
import pandas as pd

# The demos below are disabled; flip a guard to True to run one.
# Fixed in each demo: "index[...]" was missing "=", which made it an
# indexing expression on an undefined name instead of a keyword argument.
# NOTE(review): all four demos are identical as pasted -- they were
# presumably meant to use different indexes; confirm with the lesson.

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)
NumPy Array
import numpy as np

# Two parallel arrays: employment[i] is the figure for countries[i].
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# The demos below are disabled; flip a guard to True to run one.

# Accessing single elements by position.
if False:
    print(countries[0])
    print(countries[3])

# Slicing.
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types (dtype) of various arrays.
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping over one array directly, then over both arrays by index.
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# Built-in aggregate methods.
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())
def max_employment(countries, employment):
    """Return ``(max_country, max_value)`` for the highest employment figure.

    countries: NumPy array of country names.
    employment: NumPy array of employment values, where ``employment[i]``
        belongs to ``countries[i]``.

    If several entries share the maximum, the first one is returned.
    Raises ValueError (from ``argmax``) when *employment* is empty.

    Previously this was an unimplemented stub returning ``(None, None)``.
    """
    max_index = employment.argmax()

    max_country = countries[max_index]
    max_value = employment[max_index]
    return (max_country, max_value)
pandas and NumPy
Gapminder Data
-employment levels
-life expectancy
-GDP
-School Completion Rates
import pandas as pd

# Load the engagement table from the CSV file in the working directory.
daily_engagement = pd.read_csv('daily_engagement_full.csv')

# Number of distinct values in the 'acct' column (a bare expression: its
# value is only displayed when run interactively).
len(daily_engagement['acct'].unique())
One-dimensional data structures
pandas, NumPy (Numerical Python)
Series -> built on Array
more features, simpler
Making histograms in python
data = [1,2,1,3,3,1,4,2] %matplotlib inline import matplotlib.pyplot as plt plt.hist(data)
Lots of different pieces of information to look at
These features can interact
# Label the axes and title the current plot. The string delimiters were
# typographic quotes, which Python rejects; they are now plain double quotes.
plt.xlabel("label for x axis")
plt.ylabel("label for y axis")
plt.title("title of plot")
lesson completed
from collections import defaultdict

import numpy as np

# Group the engagement records by their 'account_key'.
# Fixed: the dict was created under the misspelled name
# "engagemnt_by_account", so the loop below raised NameError.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

# Total 'total_minutes_visited' per account.
# Fixed: the loop variable was spelled "engegement_for_student" while the
# body read "engagement_for_student".
total_minutes_by_account = {}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes_by_account[account_key] = sum(
        record['total_minutes_visited'] for record in engagement_for_student
    )

# Materialize as a list so NumPy functions accept it on Python 3 as well,
# where dict.values() is a view rather than a list.
total_minutes = list(total_minutes_by_account.values())
Tracking Down
# Count enrollments whose account key is missing from
# unique_engagement_students and whose join and cancel dates differ.
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students
            and enrollment['join_date'] != enrollment['cancel_date']):
        # Fixed: the counter was incremented under the misspelled name
        # "num_problem_student", which raised NameError.
        num_problem_students += 1

# Bare expression: the count is only displayed when run interactively.
num_problem_students
# Fixed: the alias was written "numpy" while every call below uses "np".
import numpy as np


def within_one_week(join_date, engagement_date):
    """Return True if *engagement_date* falls 0-6 whole days after *join_date*.

    Both arguments must support subtraction yielding an object with a
    ``.days`` attribute (e.g. ``datetime`` values).
    """
    time_delta = engagement_date - join_date
    # Lower bound added: without it, any engagement dated before the join
    # date (negative delta) was also reported as "within one week".
    return 0 <= time_delta.days < 7


def remove_free_trial_cancels(data):
    """Return the records of *data* whose 'account_key' is in paid_students.

    ``paid_students`` is a module-level collection defined elsewhere.
    A new list is returned; *data* is not modified.
    """
    return [data_point for data_point in data
            if data_point['account_key'] in paid_students]


# Materialize as a list so NumPy functions accept it on Python 3 as well.
total_minutes = list(total_minutes_by_account.values())

# Fixed: the first print lacked a comma and called the non-existent np.pean.
# Each message is built with str.format so the output is identical on
# Python 2 and 3.
print('Mean: {}'.format(np.mean(total_minutes)))
print('Standard deviation: {}'.format(np.std(total_minutes)))
print('Minimum: {}'.format(np.min(total_minutes)))
print('Maximum: {}'.format(np.max(total_minutes)))

# Find the account with the largest total. The loop variable is named
# `minutes` so it no longer overwrites the `total_minutes` list above.
student_with_max_minutes = None
max_minutes = 0
for student, minutes in total_minutes_by_account.items():
    if minutes > max_minutes:
        max_minutes = minutes
        student_with_max_minutes = student