# DataFrame apply()

import pandas as pd

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

if False:
    def convert_grades_curve(exam_grades):
        """Convert numeric grades to letter grades on a curve.

        Grades are binned by quantile with pd.qcut: the bottom 10% get
        'F', the next 10% 'D', 20%-50% 'C', 50%-80% 'B' and the top 20% 'A'.
        """
        return pd.qcut(exam_grades,
                       [0, 0.1, 0.2, 0.5, 0.8, 1],
                       labels=['F', 'D', 'C', 'B', 'A'])

    # Curve a single column (a Series).
    print(convert_grades_curve(grades_df['exam1']))

    # DataFrame.apply() calls the function once per column.
    print(grades_df.apply(convert_grades_curve))

def standardize(df):
    """Placeholder that has not been implemented yet.

    The argument is ignored and None is always returned.
    TODO(review): presumably this should standardize the values of `df`;
    confirm the intended behaviour before filling it in.
    """
    return
import numpy as np
import pandas as pd

df = pd.DataFrame({
	'a': [4,5,3,1,2],
	'b': [20,10,40,50,30],
	'c': [25,20, 5, 15, 10]
	})

if False:
    # Passing an aggregating function to apply() yields one value per column.
    print(df.apply(np.mean))
    print(df.apply(np.max))

def second_largest(df):
    """Placeholder that has not been implemented yet.

    The argument is ignored and None is always returned.
    TODO(review): presumably this should find the second-largest value(s)
    in `df`; confirm the intended behaviour before filling it in.
    """
    return

# DataFrame vectorized operations

import pandas as pd

# Adding DataFrames whose column names match
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
    print(df1 + df2)

# Adding DataFrames whose column names only partly overlap
if False:
    # Column 'c' holds integers here: the previous string values could not
    # be added to df2's integer 'c' column.
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
    # Print the sum like the other examples do; it used to be discarded.
    print(df1 + df2)

# Adding DataFrames whose row indexes only partly overlap
if False:
    df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                       index=['row1', 'row2', 'row3'])
    df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                       index=['row4', 'row3', 'row2'])
    print(df1 + df2)

entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
	})

def get_hourly_entries_and_exits(entries_and_exits):
    """Placeholder that has not been implemented yet.

    The argument is ignored and None is always returned.
    TODO(review): presumably this should derive hourly counts from the
    values in `entries_and_exits`; confirm before filling it in.
    """
    return
import pandas as pd

if False:
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': [10, 20, 30],
        'c': [5, 10, 15],
    })

    def add_one(x):
        """Return x + 1."""
        return x + 1

    # applymap() calls the function on every individual element.
    print(df.applymap(add_one))

grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def convert_grades(grades):
    """Placeholder that has not been implemented yet.

    The argument is ignored and None is always returned.
    TODO(review): presumably this should convert the numeric `grades` to
    another scale; confirm the intended mapping before filling it in.
    """
    return

# Calculating correlation

import pandas as pd

filename = 'xxxx.csv'
subway_df = pd.read_csv(filename)

def correlation(x, y):
    """Return the correlation (Pearson's r) between x and y.

    Both inputs are converted to standard units using the population
    standard deviation (ddof=0); the result is the mean of the
    element-wise products of the standardized values.
    """
    def _in_standard_units(values):
        # Number of standard deviations each value lies from the mean.
        return (values - values.mean()) / values.std(ddof=0)

    products = _in_standard_units(x) * _in_standard_units(y)
    return products.mean()

entries = subway_df['ENTRIESn_hourly']
# Cumulative entries. This previously re-read 'meanprecipi', which made it
# an exact duplicate of `rain`.
# NOTE(review): assumes the CSV has an 'ENTRIESn' column -- confirm.
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))

print(correlation(entries, cum_entries))

# Accessing elements of a DataFrame

import pandas as pd

ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Creating DataFrames from a dict of columns and from a list of rows
if False:
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df_1)

    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print(df_2)

# Accessing elements
if False:
    print(ridership_df.iloc[0])           # row by position
    print(ridership_df.loc['05-05-11'])   # row by index label
    print(ridership_df['R003'])           # column by name
    print(ridership_df.iloc[1, 3])        # single element by position

# Accessing multiple rows
if False:
    print(ridership_df.iloc[1:4])

# Accessing multiple columns
if False:
    print(ridership_df[['R003', 'R005']])

# Sums along each axis and over all values
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print(df.sum())           # one sum per column
    print(df.sum(axis=1))     # one sum per row
    print(df.values.sum())    # sum of every value

def mean_riders_for_max_station(ridership):
    """Placeholder that has not been implemented yet.

    The argument is ignored; both elements of the returned
    (overall_mean, mean_for_max) tuple are always None.
    TODO(review): `ridership` is presumably a DataFrame like
    `ridership_df`; confirm what "max station" should mean before
    filling this in.
    """
    overall_mean = mean_for_max = None
    return overall_mean, mean_for_max

# Two-dimensional NumPy arrays

import numpy as np

ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

# Accessing elements of a 2D array
if False:
    print(ridership[1, 3])        # single element: row 1, column 3
    print(ridership[1:3, 3:5])    # sub-array: rows 1-2, columns 3-4
    print(ridership[1, :])        # all of row 1

# Vectorized operations on rows or columns
if False:
    print(ridership[0, :] + ridership[1, :])
    print(ridership[:, 0] + ridership[:, 1])

# Vectorized operations on entire arrays
if False:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print(a + b)

def mean_riders_for_max_station(ridership):
    """Placeholder that has not been implemented yet.

    The argument is ignored; both elements of the returned
    (overall_mean, mean_for_max) tuple are always None.
    TODO(review): `ridership` is presumably a 2D NumPy array like the
    module-level `ridership`; confirm what "max station" should mean
    before filling this in.
    """
    overall_mean = mean_for_max = None
    return overall_mean, mean_for_max

# Pandas Series apply()

import pandas as pd

if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        """Return x + 1."""
        return x + 1

    # Series.apply() calls the function on each element.
    print(s.apply(add_one))

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
	])

def reverse_names(names):
    """Return a new Series with each "First Last" name as "Last, First".

    The previous body referenced an undefined variable `name` and so
    raised NameError; the per-name logic is now applied to every element
    of the Series.

    Parameters:
        names: pandas Series of strings in "Firstname Lastname" form.

    Returns:
        A Series with the same index holding "Lastname, Firstname" strings.
        Only the first two space-separated words of each name are used; a
        value without a space is returned unchanged.
    """
    def reverse_name(name):
        split_name = name.split(" ")
        if len(split_name) < 2:
            # Nothing to swap for a single-word name.
            return name
        first_name = split_name[0]
        last_name = split_name[1]
        return last_name + ', ' + first_name

    return names.apply(reverse_name)

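# Vectorized Operations

# What should the vector [1, 2, 3] times the scalar 3 be?
# [1, 2, 3, 1, 2, 3, 1, 2, 3] (repetition), [3, 6, 9] (element-wise), or an error
# are all reasonable answers; NumPy gives the element-wise result [3, 6, 9].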

# More vectorized operations
# Math operations:
#   add, subtract, multiply, divide, exponentiate

# Logical operations:
#   &, |, ~

# Comparison operations:
#   >, >=, <, <=, ==, !=

import numpy as np

# Arithmetic between two NumPy arrays (element-wise)
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)    # was typed with an en dash, which is not an operator
    print(a * b)
    print(a / b)    # floor division on Python 2, true division on Python 3
    print(a ** b)

# Arithmetic between a NumPy array and a scalar
if False:    # the colon was missing here
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)    # was typed with an en dash, which is not an operator
    print(a * b)
    print(a / b)    # floor division on Python 2, true division on Python 3
    print(a ** b)

# Comparisons between two NumPy arrays (element-wise, giving booleans)
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Code snippet: in-place += on a NumPy array
import numpy as np

a = np.array([1, 2, 3, 4])
b = a    # b is a second name for the same array, not a copy
a += np.array([1, 1, 1, 1])    # in-place add, so the shared array changes
print(b)    # [2 3 4 5]

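# `a += x` operates in place, so `b` (another name for the same array) changes too;
# `a = a + x` would create a new array and leave `b` unchanged.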

# The code below uses the `pd` alias, which `import pandas as pandas` did
# not bind.
import pandas as pd

# NOTE(review): the four examples below are identical; presumably each was
# meant to give s2 a different index -- confirm against the original notes.
# In each one `index[...]` was missing its `=`.

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

if False:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

# NumPy arrays

import numpy as np

countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# Accessing elements
if False:
    print(countries[0])
    print(countries[3])

# Slicing
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    # Index-based loop, so the matching employment value can be looked up.
    for i in range(len(countries)):
        country = countries[i]
        country_employment = employment[i]
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# NumPy aggregate functions
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())

def max_employment(countries, employment):
    """Return the country with the highest employment and that value.

    The function was an unimplemented stub that always returned
    (None, None).

    Parameters:
        countries: sequence of country names.
        employment: sequence of employment values, aligned by position
            with `countries`.

    Returns:
        A (max_country, max_value) tuple. When several values tie for the
        maximum, the first one wins. (None, None) is returned for empty
        input, as the stub did.
    """
    if len(employment) == 0:
        return (None, None)

    # argmax gives the position of the maximum, which also indexes the
    # matching entry in `countries`.
    max_index = np.argmax(employment)
    max_country = countries[max_index]
    max_value = employment[max_index]

    return (max_country, max_value)