Complex Query

import pandas
import pandasql

def aggregate_query(filename):
	aadhaar_data = pandas.read_csv(filename)
	aadhaar_data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)

	q = """
	SELECT
	gender, district, sum(aadhaar_generated)
	FROM
	aadhaar_data
	WHERE
	age > 50
	GROUP BY
	gender, district;
	"""

	aadhaar_solution = pandasql.sqldf(q.lower(), locals())
	return aadhaar_solution

Files, Databases, APIs
Application Programming Interface
https://www.last.fm/api

import json
import requests

def imputation(filename):

	baseball = pandas.read_csv('../data/Master.csv')

	baseball['weight'] = baseball['weight'].fillna(numpy.mean(baseball['weight']))

	print numpy.sum(baseball['weight']), numpy.mean(baseball['weight'])

Matrix Multiplication

>>> a = [1,2,3,4,5]
>>> b = [2,3,4,5,6]
>>> numpy.dot(a,b)
70

Data Wrangling Manipulation
files, databases, web APIs

Dealing with Messy Data
Acquiring Data
– Acquiring data often isn’t funcy
– Find stuff on the internet!
– A lot of data stored in text files and on gov’t website

Common Data Formats
– csv, xml, json

import pandas

def add_full_name(path_to_csv, path_to_new_csv):
	dataframe = pandas.read_csv(path_to_csv)
	dataframe['nameFull'] = dataframe['nameFirst'] + ' ' + dataframe['nameLast']
	dataframe.to_csv(path_to_new_csv)

if __name__ == "__main__":
	path_to_csv = ""
	path_to_new_csv = ""
	add_full_name(path_to_csv, path_to_new_csv)

Relational Database
Why useful? ->
it is straight forward to extract aggregated with complex filters
a database scale well
it ensures all data is consistently formatted

Schemas = Blueprints
SELECT * FROM aadhar_data;

import pandas
import pandasql

def select_first_50(filename):
	aadhaar_data = pandas.read_csv('../data/aadhaar_data.csv')
	aadhaar_data.rename(columns = lambda x: x.replace('','_').lower(), inplace=True)

	q = """
	SELECT
	register, enrolment_agency
	FROM
	aadhaar_data
	LIMIT 50;
	"""
	aadhaar_solution = pandasql.sqldf(q.lower(), locals())
	return aadhaar_solution

Pandas Vectorized Methods

>>> from pandas import Series, DataFrame
>>> d = {'one': Series([1,2,3], index=['a','b','c']),
... 'two': Series([1,2,3,4], index=['a','b','c','d'])}
>>> df = DataFrame(d)
>>> df
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
>>> import numpy
>>> df.apply(numpy.mean)
one    2.0
two    2.5
dtype: float64
>>> df['one'].map(lambda x: x>= 1)
a     True
b     True
c     True
d    False
Name: one, dtype: bool
>>> df.applymap(lambda x: x>= 1)
     one   two
a   True  True
b   True  True
c   True  True
d  False  True
from pandas import DataFrame, Series
import numpy
def avg_bronze_medal_count():

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':Series(countries),
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    olympic_medal_counts_df = DataFrame(olympic_medal_counts)
    bronze_at_least_one_gold = olympic_medal_counts_df['bronze'][olympic_medal_counts_df['gold'] >= 1]
    avg_bronze_at_least_one_gold = numpy.mean(bronze_at_least_one_gold)

    print(avg_bronze_at_least_one_gold)

Create DataFrame

from pandas import DataFrame, series

def create_dataframe():

	countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name': Series(countries), 'gold':Series(gold), \
    'silver': Series(silver), 'bronze': Series(bronze)}

    olympic_medal_counts_df = DataFrame(olympic_medal_counts)

    print(olympic_medal_counts_df)

concept of Series in Pandas

*This playground is inspired by Greg Reda’s post on Intro to Pandas Data Structures:
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/

import pandas as pd

if False:
	series = pd.Series(['Dave', 'Cheng-Han', 'Uxxxx', 42, -1789710578])
	print series

if False:
	series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
			index=['Instructor', 'Curriculum Manager', 'Course Number', 'Power Level'])
	print series['Instructor']
	print ""
	print series[['Instructor', 'Curriculum Manager', 'Course Number']]

if False:
	cuteness = pd.Series([1, 2, 3, 4, 5], index=['Cockroach', 'Fish', 'Mini Pig', 'Puppy', 'Kitten'])

	print cuteness > 3
	print ""
	print cuteness[cuteness > 3]

*This playground is inspired by Greg Reda’s post on Intro to Pandas Data Structures:
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/

import numpy as np
import pandas as pd

if False:
	data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
		'team':['Bears','Bears','Bears','Packers','Lions','Lions','Lions'],
		'wins': [11, 8, 10, 15, 11, 6, 10, 4],
		'losses':[5, 8, 6, 1, 5, 10, 6, 12]}
	football = pd.DataFrame(data)
	print football

if False:
	data = {'year':[2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
			'team':['Bears','Bears','Bears','Packers','Packers','Lions',
			'Lions','Lions'],
			'wins':[11, 8, 10, 15, 11, 6, 10, 4],
			'losses':[5, 8, 6, 1, 5, 10, 6, 12]}
	football = pd.DataFrame(data)
	print football.dtypes
	print ""
	print football.describe()
	print ""
	print football.head()
	print ""
	print football.tail()

Pandas

Dataframes: string, int, float, boolean

install pandas

[vagrant@localhost ~]$ pip install pyparsing
Collecting pyparsing
  Downloading pyparsing-2.2.0-py2.py3-none-any.whl (56kB)
    100% |████████████████████████████████| 61kB 328kB/s
Installing collected packages: pyparsing
Successfully installed pyparsing-2.2.0
[vagrant@localhost ~]$ pip install pandas
Collecting pandas
  Downloading pandas-0.20.3-cp35-cp35m-manylinux1_x86_64.whl (24.0MB)
    100% |████████████████████████████████| 24.0MB 49kB/s
Collecting python-dateutil>=2 (from pandas)
  Downloading python_dateutil-2.6.1-py2.py3-none-any.whl (194kB)
    100% |████████████████████████████████| 194kB 785kB/s
Requirement already satisfied: numpy>=1.7.0 in ./.pyenv/versions/3.5.2/lib/python3.5/site-packages (from pandas)
Collecting pytz>=2011k (from pandas)
  Downloading pytz-2017.2-py2.py3-none-any.whl (484kB)
    100% |████████████████████████████████| 491kB 373kB/s
Collecting six>=1.5 (from python-dateutil>=2->pandas)
  Downloading six-1.10.0-py2.py3-none-any.whl
Installing collected packages: six, python-dateutil, pytz, pandas
Successfully installed pandas-0.20.3 python-dateutil-2.6.1 pytz-2017.2 six-1.10.0
>>> from pandas import Series, DataFrame
>>> d = {'name': Series(['Braund', 'Cummings', 'Heikkinen', 'Allen'], index=['a', 'b', 'c', 'd']),
... 'age': Series([22,38,26,35], index=['a','b','c','d']),
... 'fare': Series([7.25, 71.83, 8.05], index=['a','b','d']),
... 'survived?': Series([False, True, True, False], index=['a','b','c','d'])}
>>> df = DataFrame(d)
>>> print(df)
   age   fare       name  survived?
a   22   7.25     Braund      False
b   38  71.83   Cummings       True
c   26    NaN  Heikkinen       True
d   35   8.05      Allen      False

Using numpy

>>> numbers = [1,2,3,4,5]
>>> numpy.mean(numbers)
Traceback (most recent call last):
  File "", line 1, in 
NameError: name 'numpy' is not defined
>>> import numpy
>>> numbers = [1,2,3,4,5]
>>> numpy.mean(numbers)
3.0
>>> numpy.median(numbers)
3.0
>>> numpy.std(numbers)
1.4142135623730951

install numpy

[vagrant@localhost ~]$ python -V
Python 3.5.2
[vagrant@localhost ~]$ pip list
pip (8.1.1)
setuptools (20.10.1)
You are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
[vagrant@localhost ~]$ pip install --upgrade pip
Collecting pip
  Downloading pip-9.0.1-py2.py3-none-any.whl (1.3MB)
    100% |████████████████████████████████| 1.3MB 478kB/s
Installing collected packages: pip
  Found existing installation: pip 8.1.1
    Uninstalling pip-8.1.1:
      Successfully uninstalled pip-8.1.1
Successfully installed pip-9.0.1
[vagrant@localhost ~]$ pip install numpy
Collecting numpy
  Downloading numpy-1.13.1-cp35-cp35m-manylinux1_x86_64.whl (16.9MB)
    100% |████████████████████████████████| 16.9MB 54kB/s
Installing collected packages: numpy
Successfully installed numpy-1.13.1

vagrant upしようとした際に、error

コマンドラインで、vagrant upしようとしたら、error while executing `VBoxManage`と表示

>vagrant up
Bringing machine 'default' up with 'virtualbox' provider...
==> default: Importing base box 'bento/centos-6.8'...
==> default: Matching MAC address for NAT networking...
==> default: Checking if box 'bento/centos-6.8' is up to date...
==> default: A newer version of the box 'bento/centos-6.8' is available! You currently
==> default: have version '2.3.0'. The latest is version '2.3.4'. Run
==> default: `vagrant box update` to update.
==> default: Setting the name of the VM: FirstCentOs_default_1504414856800_6279
==> default: Clearing any previously set network interfaces...
==> default: Preparing network interfaces based on configuration...
    default: Adapter 1: nat
    default: Adapter 2: hostonly
==> default: Forwarding ports...
    default: 22 (guest) => 2222 (host) (adapter 1)
==> default: Booting VM...
There was an error while executing `VBoxManage`, a CLI used by Vagrant
for controlling VirtualBox. The command and stderr is shown below.

Command: ["startvm", "6663a9a1-034a-4794-b88f-xxxxxxxxxxxx", "--type", "headless"]

困りますよね。別の仮想マシンを立ち上げましたが、同じように、error while executing `VBoxManage`と表示されたので、windowsのアプリから、Oracle VM VirtualBox 5.1.22をアンインストールして、Oracle VM VirtualBox 5.1.26を再度インストール。

今度は上手くいきました。

>vagrant up
Bringing machine 'default' up with 'virtualbox' provider...
==> default: Checking if box 'bento/centos-6.8' is up to date...
==> default: A newer version of the box 'bento/centos-6.8' is available! You currently
==> default: have version '2.3.0'. The latest is version '2.3.4'. Run
==> default: `vagrant box update` to update.
==> default: Clearing any previously set forwarded ports...
==> default: Clearing any previously set network interfaces...
==> default: Preparing network interfaces based on configuration...
    default: Adapter 1: nat
    default: Adapter 2: hostonly
==> default: Forwarding ports...
    default: 22 (guest) => 2222 (host) (adapter 1)
==> default: Booting VM...
==> default: Waiting for machine to boot. This may take a few minutes...
    default: SSH address: 127.0.0.1:2222
    default: SSH username: vagrant
    default: SSH auth method: private key
    default: Warning: Connection reset. Retrying...
    default: Warning: Connection aborted. Retrying...
    default:
    default: Vagrant insecure key detected. Vagrant will automatically replace
    default: this with a newly generated keypair for better security.
    default:
    default: Inserting generated public key within guest...
    default: Removing insecure key from the guest if it's present...
    default: Key inserted! Disconnecting and reconnecting using new SSH key...
==> default: Machine booted and ready!