pandas and NumPy

Gapminder Data
-employment levels
-life expectancy
-GDP
-School Completion Rates

import pandas as pd
# Load the full daily-engagement CSV into a pandas DataFrame.
daily_engagement = pd.read_csv('daily_engagement_full.csv')
# Number of distinct values in the 'acct' column. As a bare expression the
# result is only displayed in an interactive session / notebook cell.
len(daily_engagement['acct'].unique())

One-dimensional data structures
pandas, NumPy (Numerical Python)
Series (pandas) -> built on the NumPy array
Series has more features; the array is simpler

Making histograms in Python

# Sample values to plot.
data = [1,2,1,3,3,1,4,2]

# IPython/Jupyter magic (not Python syntax): only valid inside a notebook or
# IPython session, where it makes plots render inline.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a histogram of the values in `data`.
plt.hist(data)

Lots of different pieces of information to look at
These features can interact

# Label the axes and set the title of the current plot.
# String literals must use straight quotes; typographic (curly) quotes are a
# SyntaxError in Python.
plt.xlabel("label for x axis")
plt.ylabel("label for y axis")
plt.title("title of plot")

lesson completed

from collections import defaultdict

# Group the records in `paid_engagement_in_first_week` by their 'account_key'.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)

# Map each account key to the sum of 'total_minutes_visited' over its records.
total_minutes_by_account = {}

for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes

# list(...) keeps this a real list on Python 3 as well, where .values()
# returns a view that NumPy cannot consume directly.
total_minutes = list(total_minutes_by_account.values())

import numpy as np

Tracking down problem students

# Count enrollments whose account key is missing from
# `unique_engagement_students` and whose join_date differs from cancel_date.
num_problem_students = 0

for enrollment in enrollments:
    student = enrollment['account_key']
    # Parentheses allow the condition to span two lines.
    if (student not in unique_engagement_students
            and enrollment['join_date'] != enrollment['cancel_date']):
        num_problem_students += 1

# Bare expression: displays the count in an interactive session / notebook.
num_problem_students
def within_one_week(join_date, engagement_date):
    """Return True if engagement_date is within one week after join_date.

    Both arguments must support subtraction yielding an object with a
    ``days`` attribute (e.g. ``datetime`` values producing a ``timedelta``).

    An engagement that happened *before* the join date yields a negative
    delta; it is not "within one week" of joining, so it returns False.
    """
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7

def remove_free_trial_cancels(data):
    """Return the records of ``data`` whose 'account_key' is in paid_students.

    ``data`` is an iterable of dict-like records with an 'account_key' entry.
    ``paid_students`` is a module-level collection defined elsewhere in the
    file. Every record is examined, input order is preserved, and an empty
    input yields an empty list.
    """
    return [
        data_point
        for data_point in data
        if data_point['account_key'] in paid_students
    ]

# Per-account totals as a real list, so NumPy accepts it on Python 2 and 3.
total_minutes = list(total_minutes_by_account.values())

# The statistics below call NumPy through the `np` alias.
import numpy as np

# Single-argument print(...) produces the same "Label: value" output under
# both the Python 2 print statement and the Python 3 print function.
print('Mean: %s' % np.mean(total_minutes))
print('Standard deviation: %s' % np.std(total_minutes))
print('Minimum: %s' % np.min(total_minutes))
print('Maximum: %s' % np.max(total_minutes))

# Find the account with the largest total. If no total exceeds 0 (or the dict
# is empty) the results stay at None / 0.
student_with_max_minutes = None
max_minutes = 0

# The loop variable is named `minutes` so that iterating does not overwrite
# the module-level `total_minutes` collection used for the statistics.
for student, minutes in total_minutes_by_account.items():
    if minutes > max_minutes:
        max_minutes = minutes
        student_with_max_minutes = student

CSVs

import unicodecsv
# NOTE(review): enrollments_filename is assigned but not used below; the
# path is hard-coded in open(). Confirm whether it is needed elsewhere.
enrollments_filename = ''

# Each file is opened in binary mode and parsed with unicodecsv.DictReader;
# list(...) reads all rows before the file is closed.
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    # Bound to `enrollments`, the name the rest of the file reads.
    enrollments = list(reader)

with open('daily_engagement.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    daily_engagement = list(reader)

with open('project_submissions.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    project_submissions = list(reader)

# Rebind both names to None, discarding any previously loaded rows.
# NOTE(review): presumably exercise placeholders to be filled in — confirm.
daily_engagement = None
project_submissions = None
import unicodecsv

def read_csv(filename):
    """Read the CSV file at ``filename`` and return its rows as a list.

    The file is opened in binary mode and parsed with
    ``unicodecsv.DictReader``; each list item is one row as produced by that
    reader.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

# Load the three tables with read_csv. An empty filename cannot be opened,
# so use the same CSV paths that the file opens directly elsewhere.
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

# Row and unique-student tallies for each table, all starting at zero.
enrollment_num_rows, enrollment_num_unique_students = 0, 0
engagement_num_rows, engagement_num_unique_students = 0, 0
submission_num_rows, submission_num_unique_students = 0, 0