Matrix Multiplication (example: dot product of two vectors with numpy.dot)

>>> a = [1,2,3,4,5]
>>> b = [2,3,4,5,6]
>>> numpy.dot(a,b)
70

Data Wrangling (Data Manipulation)
Data sources: files, databases, web APIs

Dealing with Messy Data
Acquiring Data
– Acquiring data often isn’t fancy
– Find stuff on the internet!
– A lot of data is stored in text files and on government websites

Common Data Formats
– csv, xml, json

import pandas

def add_full_name(path_to_csv, path_to_new_csv):
	"""Write a copy of a CSV file with an extra 'nameFull' column.

	The CSV at path_to_csv is loaded with pandas. 'nameFull' is the
	'nameFirst' column, a single space, and the 'nameLast' column joined
	together. The result is written to path_to_new_csv using the default
	DataFrame.to_csv settings, so the row index is written as well.
	"""
	players = pandas.read_csv(path_to_csv)
	full_names = players['nameFirst'] + ' ' + players['nameLast']
	players.assign(nameFull=full_names).to_csv(path_to_new_csv)

if __name__ == "__main__":
	# Script entry point: both paths are empty placeholders here.
	path_to_csv, path_to_new_csv = "", ""
	add_full_name(path_to_csv, path_to_new_csv)

Relational Database
Why useful? ->
It is straightforward to extract aggregated data with complex filters.
A database scales well.
It ensures all data is consistently formatted.

Schemas = Blueprints
SELECT * FROM aadhaar_data;

import pandas
import pandasql

def select_first_50(filename):
	"""Return the first 50 rows of two columns from an Aadhaar CSV file.

	Args:
		filename: Path of the CSV file to load with pandas.read_csv.

	Returns:
		The result of running the SQL query below through pandasql.sqldf:
		at most 50 rows with the 'registrar' and 'enrolment_agency' columns.
	"""
	# Load the file the caller asked for rather than a hard-coded path.
	aadhaar_data = pandas.read_csv(filename)
	# Make headers usable as SQL identifiers: spaces become underscores and
	# everything is lower-cased. Replacing '' instead of ' ' would insert an
	# underscore between every character.
	aadhaar_data.rename(columns=lambda x: x.replace(' ', '_').lower(), inplace=True)

	# NOTE(review): column names assume the CSV headers are 'Registrar' and
	# 'Enrolment Agency' -- confirm against the data file.
	q = """
	SELECT
	registrar, enrolment_agency
	FROM
	aadhaar_data
	LIMIT 50;
	"""
	# locals() is passed so the query's table name 'aadhaar_data' can be
	# matched to the local DataFrame; do not rename that variable.
	aadhaar_solution = pandasql.sqldf(q.lower(), locals())
	return aadhaar_solution