Processing All

from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"

def open_zip(datadir):
	with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
		myzip.extractall()

def process_all(datadir):
	files = os.listdir(datadir)
	return files

def process_file(f):
	data = []
	info = {}
	info["courier"], info["airport"] = f[:6].split("-")
	with open("{}/{}".format(datadir, f), "r") as html:
		soup = BeautifulSoup(html, "lxml")
		# parsing of the flight-data table would go here (exercise stub)
	return data

def test():
	print "Running a simple test..."
	open_zip(datadir)
	files = process_all(datadir)
	data = []
	for f in files:
		data += process_file(f)

	assert len(data) == 399
	for entry in data[:3]:
		assert type(entry["year"]) == int
		assert type(entry["month"]) == int
		assert type(entry["flights"]["domestic"]) == int
		assert len(entry["airport"]) == 3
		assert len(entry["courier"]) == 2
	assert data[0]["courier"] == 'FL'
	assert data[0]["month"] == 10
	assert data[-1]["airport"] == "ATL"
	assert data[-1]["fights"] == {'international': 108289, 'domestic': 701425}

	print "... success!"

if __name__ == "__main__":
	test()

List and dictionary

List: […] denotes a list.

Dictionary: {…} holds key/value pairs and is called a dict. If the original list is made up of two-element tuples, you can pass it directly to the dict function.
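A minimal illustration of both (the carrier names here are just sample values):

codes = ["ATL", "BOS", "ORD"]                          # a list: ordered values
pairs = [("FL", "AirTran"), ("VX", "Virgin America")]  # list of 2-tuples
carriers = dict(pairs)                                 # passed directly to dict
print carriers["FL"]                                   # prints: AirTran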

Airport List

from bs4 import BeautifulSoup
html_page = "options.html"

def extract_airports(page):
	data = []
	with open(page, "r") as html:
		soup = BeautifulSoup(html, "lxml")
		airport_list = soup.find(id="AirportList")
		for option in airport_list.find_all("option"):
			# skip aggregate entries such as "All" / "AllMajors"
			if not option["value"].startswith("All"):
				data.append(option["value"])
	return data

def test():
	data = extract_airports(html_page)
	assert len(data) == 15
	assert "ATL" in data
	assert "ABR" in data

if __name__ == "__main__"

Carrier List

from bs4 import BeautifulSoup
import requests
html_page = "options.html"

def extract_carriers(page):
	data = []
	with open(page, "r") as html:
		soup = BeautifulSoup(html, "lxml")
		carrier_list = soup.find(id="CarrierList")
		for option in carrier_list.find_all("option"):
			# skip aggregate entries such as "All" / "AllUS" / "AllForeign"
			if not option["value"].startswith("All"):
				data.append(option["value"])
	return data

def make_request(data):
	eventvalidation = data["eventvalidation"]
	viewstate = data["viewstate"]
	airport = data["airport"]
	carrier = data["carrier"]

	r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': airport,
                          'CarrierList': carrier,
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

	return r.text

def test():
	data = extract_carriers(html_page)
	assert len(data) == 16
	assert "FL" in data
	assert "NK" in data

if __name__ == "__main__"

Scraping solution

from bs4 import BeautifulSoup
import requests

s = requests.Session()

r = s.get("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
soup = BeautifulSoup(r.text, "lxml")
viewstate_element = soup.find(id="__VIEWSTATE")
viewstate = viewstate_element["value"]
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidation_element["value"]

r = s.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
	data={'AirportList' : "BOS",
		'CarrierList' : "VX",
		'Submit' : "Submit",
		'__EVENTTARGET' : "",
		'__EVENTARGUMENT' : "",
		'__EVENTVALIDATION' : eventvalidation,
		'__VIEWSTATE' : viewstate})

with open("virgin_and_logan_airport.html", "w") as f:
	f.write(r.text)

Using Beautiful Soup

import requests
from bs4 import BeautifulSoup
import json

html_page = "page_source.html"

def extract_data(page):
	data = {"eventvalidation":"",
			"viewstate": ""}
	with open(page, "r") as html;
		soup = BeautifulSoup(html, "lxml")
		ev = soup.find(id="__EVENTVALIDATION")
		data["eventvalidation"] = ev["value"]

		vs = soup.find(id="__VIEWSTATE")
		data["viewstate"] = vs["value"]

		return data

def make_request(data):
	eventvalidation = data["eventvalidation"]
	viewstate = data["viewstate"]

	r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
			data={'AirportList' : "BOS",
					'CarrierList' : "VX",
					'Submit' : "Submit",
					'__EVENTTARGET' : "",
					'__EVENTARGUMENT' : "",
					'__EVENTVALIDATION' : eventvalidation,
					'__VIEWSTATE' : viewstate})

	return r.text

def test():
	data = extract_data(html_page)
	assert data["eventvalidation"] != ""
	assert data["viewstate"] != ""

if __name__ == "__main__":
	test()

BeautifulSoup

Learn about BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup

def options(soup, id):
	option_values = []
	carrier_list = soup.find(id=id)
	for option in carrier_list.find_all('option'):
		option_values.append(option['value'])
	return option_values

def print_list(label, codes):
	print "\n%s:" label
	for c in codes:
		print c

def main():
	soup = BeautifulSoup(open("virgin_and_logan_airport.html"), "lxml")

	codes = options(soup, 'CarrierList')
	print_list("Carriers", codes)

	codes = options(soup, 'AirportList')
	print_list("Airports", codes)

if __name__ == "__main__":
	main()

Extracting XML Data

import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
	tree = ET.parse(fname)
	return tree.getroot()

def get_authors(root):
	authors = []
	for author in root.findall('./fm/bibl/aug/au'):
		data = {
			"fnm": None,
			"snm": None,
			"email": None
		}
		data["fnm"] = author.find('./fnm').text
		data["snm"] = author.find('./snm').text
		data["email"] = author.find('./email').text

		authors.append(data)

	return authors

def test():
	solution = [{'fnm': 'Omer',    'snm': 'Mei-Dan',  'email': 'omer@extremegate.com'},
				{'fnm': 'Mike',    'snm': 'Carmont',  'email': 'mcarmont@hotmail.com'},
				{'fnm': 'Lior',    'snm': 'Laver',    'email': 'laver17@gmail.com'},
				{'fnm': 'Meir',    'snm': 'Nyska',    'email': 'nyska@internet-zahav.net'},
				{'fnm': 'Hagay',   'snm': 'Kammar',   'email': 'kammarh@gmail.com'},
				{'fnm': 'Gideon',  'snm': 'Mann',     'email': 'gideon.mann.md@gmail.com'},
				{'fnm': 'Barnaby', 'snm': 'Clarck',   'email': 'barns.nz@gmail.com'},
				{'fnm': 'Eugene',  'snm': 'Kots',     'email': 'eukots@gmail.com'}]

	root = get_root(article_file)
	data = get_authors(root)

	assert data[0] == solution[0]
	assert data[1]["fnm"] == solution[1]["fnm"]

Parsing XML

import xml.etree.ElementTree as ET
import pprint

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

print "\nChildren of root:"
for child in root:
	print child.tag

title = root.find('./fm/bibl/title')
title_text = ""
for p in title:
	title_text += p.text
print "\nTitle:\n", title_text

print "\nAuthor email addresses:"
for a in root.findall('./fm/bibl/aug/au'):
	email = a.find('email')
	if email is not None:
		print email.text

Wrangling JSON

Some important concepts (a minimal sketch follows this list; the full program below shows them in context):
– using the codecs module to write Unicode files
– using authentication with web APIs
– using an offset to page through web API results
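A minimal sketch of the three ideas above, assuming a hypothetical endpoint (api.example.com and YOUR_KEY are placeholders, not the real NYTimes values):

import codecs
import requests

# authentication: many web APIs take the key as a query parameter;
# offset pages through results (20 at a time on the NYTimes API)
params = {"api-key": "YOUR_KEY", "offset": 0}
r = requests.get("http://api.example.com/items.json", params=params)

# codecs.open writes Unicode text safely (important on Python 2)
with codecs.open("items.json", encoding="utf-8", mode="w") as f:
	f.write(r.text)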

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = { "popular": "",
			"article": ""}

def get_from_file(kind, period):
	filename = "popular-{0}-{1}.json".format(kind, period)
	with open(filename, "r") as f:
		return json.loads(f.read())

def article_overview(kind, period):
	data = get_from_file(kind, period)
	titles = []
	urls = []

	for article in data:
		section = article["section"]
		title = article["title"]
		titles.append({section: title})
		if "media" in article:
			for m in article["media"]:
				for mm in m["media-metadata"]:
					if mm["format"] == "Standard Thumbnail":
						urls.append(mm["url"])
	return (titles, urls)

def query_site(url, target, offset):
	if API_KEY["popular"] == "" or API_KEY["article"] == "":
		print "You need to register for NYTimes Developer account to run this program."
		print "See Instructor notes for information"
		return False
	params = {"api-key": API_KEY[target], "offset": offset}
	r = requests.get(url, params = params)

	if r.status_code == requests.codes.ok:
		return r.json()
	else:
		r.raise_for_status()

def get_popular(url, kind, days, section="all-sections", offset=0):
	if days not in [1,7,30]:
		print "time period can be 1, 7, 30 days only"
		return False
	if kind not in ["viewd", "shared", "emailed"]:
		print "kind can be only one of viewd/shared/emailed"
		return False

	url += "most{0}/{1}/{2}.json".format(kind, section, days)
	data = query_site(url, "popular", offset)

	return data

def save_file(kind, period):
	data = get_popular(URL_POPULAR, kind, period)
	num_results = data["num_results"]
	full_data = []
	with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
		for offset in range(0, num_results, 20):
			data = get_popular(URL_POPULAR, kind, period, offset=offset)
			full_data += data["results"]

		v.write(json.dumps(full_data, indent=2))

def test():
	titles, urls = article_overview("viewed", 1)
	assert len(titles) == 20
	assert len(urls) == 30
	assert titles[2] == {'Opinion': 'Professors, We Need You!'}
	assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'

if __name__ == "__main__":
	test()

Python XML
https://wiki.python.org/moin/PythonXml