Using blue print

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# OSM XML input; opened when the module is loaded and read by audit().
osm_file = open("chicago_abbrev.osm", "r")

# Matches the last run of non-whitespace characters at the end of a street
# name. The pattern contains no letters, so re.IGNORECASE has no effect on it.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
# Maps each matched street-name ending to the number of times it was seen.
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
	"""Count the ending of ``street_name`` in ``street_types``.

	The ending is whatever ``street_type_re`` matches in the name. When the
	pattern does not match, ``street_types`` is left unchanged.
	"""
	match = street_type_re.search(street_name)
	if match is None:
		return
	street_types[match.group()] += 1

def print_sorted_dict(d):
	"""Print every ``key: count`` pair of ``d``, one per line.

	Keys are ordered case-insensitively. Values are formatted with ``%d``,
	so they are expected to be integers (the counts collected by the audit).
	"""
	for key in sorted(d, key=lambda s: s.lower()):
		# Parenthesised single-argument print works on Python 2 and 3 alike.
		print("%s: %d" % (key, d[key]))

def is_street_name(elem):
	"""Return True when ``elem`` is a ``tag`` element whose ``k`` is ``addr:street``.

	The ``k`` attribute is only read for ``tag`` elements; a ``tag`` element
	without a ``k`` attribute raises ``KeyError``.
	"""
	if elem.tag != "tag":
		return False
	return elem.attrib['k'] == "addr:street"

def audit():
	"""Tally street-name endings found in ``osm_file`` and print the totals.

	Every parsed element that ``is_street_name`` accepts has its ``v``
	attribute passed to ``audit_street_type``; the accumulated
	``street_types`` mapping is then printed with ``print_sorted_dict``.
	"""
	parsed = ET.iterparse(osm_file)
	for _, node in parsed:
		if not is_street_name(node):
			continue
		audit_street_type(street_types, node.attrib['v'])
	print_sorted_dict(street_types)


# Run the audit only when this file is executed as a script.
if __name__ == '__main__':
	audit()

Processing Patents

import xml.etree.ElementTree as ET
# Name of the input file handed to split_file(); the split outputs are
# looked up as "<PATENTS>-<n>" by test().
PATENTS = 'patent.data'

def get_root(fname):
	"""Parse the XML document ``fname`` and return its root element.

	``fname`` is passed straight to ``ET.parse``, so it may be a file name
	or an open file object.
	"""
	tree = ET.parse(fname)
	return tree.getroot()

def split_file(filename):
	"""Placeholder for splitting ``filename`` into several files.

	Not implemented: the function does nothing and returns None.
	"""
	# TODO: implement; test() looks for outputs named "<filename>-<n>".
	pass

def test():
	"""Run split_file() on PATENTS and inspect the first four output files.

	For n in 0..3 the file "<PATENTS>-<n>" is opened and its first line is
	checked. Problems are reported with a message instead of an exception.
	"""
	split_file(PATENTS)
	for n in range(4):
		fname = "{}-{}".format(PATENTS, n)
		try:
			with open(fname, "r") as f:
				first_line = f.readline()
		except IOError:
			print("Could not open file {}.".format(fname))
			continue
		# NOTE(review): the original statement was cut off right after
		# `startswith(`. The "<?xml" prefix is a reconstruction that assumes
		# each output file should begin with an XML declaration - confirm.
		if not first_line.startswith("<?xml"):
			print("File {} does not start with an XML declaration.".format(fname))

processing all

from bs4 import BeautifulSoup
# The archive class is spelled ZipFile; "zipfile" is only the module name.
from zipfile import ZipFile
import os

# Base name shared by the "<datadir>.zip" archive and the directory whose
# files are listed and opened by the functions below.
datadir = "data"

def open_zip(datadir):
	"""Extract every member of ``<datadir>.zip`` into the current working directory."""
	archive_name = '{0}.zip'.format(datadir)
	with ZipFile(archive_name, 'r') as archive:
		archive.extractall()

def process_all(datadir):
	"""Return the names of the entries found in the directory ``datadir``."""
	return os.listdir(datadir)

def process_file(f):
	"""Parse one file from ``datadir`` and return its list of records.

	The first six characters of the file name are split on "-" into a
	courier code and an airport code. The file is parsed with BeautifulSoup,
	but the parsed document is not used yet, so the returned list is
	always empty.
	"""
	courier, airport = f[:6].split("-")
	info = {"courier": courier, "airport": airport}
	data = []
	with open("{}/{}".format(datadir, f), "r") as html:
		soup = BeautifulSoup(html)
	return data

def test():
	"""Unzip the data set, process every file and check the combined records.

	Raises AssertionError when the number of records or the content of the
	sampled records differs from the expected values.
	"""
	print("Running a simple test...")
	open_zip(datadir)
	files = process_all(datadir)
	data = []
	for f in files:
		data += process_file(f)

	# Was `len(data) = 399`, which is an assignment and a syntax error.
	assert len(data) == 399
	for entry in data[:3]:
		assert isinstance(entry["year"], int)
		assert isinstance(entry["month"], int)
		assert isinstance(entry["flights"]["domestic"], int)
		assert len(entry["airport"]) == 3
		assert len(entry["courier"]) == 2
	assert data[0]["courier"] == 'FL'
	assert data[0]["month"] == 10
	assert data[-1]["airport"] == "ATL"
	# Key was misspelled "fights"; the per-entry checks above read "flights".
	assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}

	print("... success!")


if __name__ == "__main__":
	test()

List and dictionary

List:[…] でリスト(list)を表します。

Dictionary:{…} は、辞書(dict)と呼ばれるキーと値のペアのリストを保持します。元のリストが要素をふたつずつ持ったタプルからできている場合は、そのまま dict 関数を使います。

Airport List

from bs4 import BeautifulSoup
# HTML file that test() passes to extract_airports().
html_page = "options.html"

def extract_airports(page):
	"""Open ``page``, parse it with the lxml parser and return a list.

	The parsed document is not inspected yet, so the returned list is
	always empty.
	"""
	airport_codes = []
	with open(page, "r") as page_file:
		parsed_page = BeautifulSoup(page_file, "lxml")
	return airport_codes

def test():
	"""Check the airport codes that extract_airports() reads from html_page.

	Raises AssertionError when the list does not hold exactly 15 codes or
	lacks "ATL" or "ABR".
	"""
	data = extract_airports(html_page)
	assert len(data) == 15
	assert "ATL" in data
	assert "ABR" in data


# The guard was missing its colon and body; run the check when executed directly.
if __name__ == "__main__":
	test()

Carrier List

from bs4 import BeautifulSoup
# HTML file that test() passes to extract_carriers().
html_page = "options.html"

def extract_carriers(page):
	"""Open ``page``, parse it with the lxml parser and return a list.

	The parsed document is not inspected yet, so the returned list is
	always empty.
	"""
	carrier_codes = []
	with open(page, "r") as page_file:
		parsed_page = BeautifulSoup(page_file, "lxml")
	return carrier_codes

def make_request(data):
	"""POST the form described by ``data`` and return the response body as text.

	``data`` must provide the keys "eventvalidation", "viewstate", "airport"
	and "carrier"; a missing key raises ``KeyError``.
	"""
	# Read the fields up front, in this order, so a missing key fails early.
	validation_token = data["eventvalidation"]
	state_token = data["viewstate"]
	airport_code = data["airport"]
	carrier_code = data["carrier"]

	form_fields = {
		'AirportList': airport_code,
		'CarrierList': carrier_code,
		'Submit': 'Submit',
		"__EVENTTARGET": "",
		"__EVENTARGUMENT": "",
		"__EVENTVALIDATION": validation_token,
		"__VIEWSTATE": state_token,
	}
	response = requests.post(
		"http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
		data=form_fields,
	)
	return response.text

def test():
	"""Check the carrier codes that extract_carriers() reads from html_page.

	Raises AssertionError when the list does not hold exactly 16 codes or
	lacks "FL" or "NK".
	"""
	data = extract_carriers(html_page)
	assert len(data) == 16
	assert "FL" in data
	assert "NK" in data


# The guard was missing its colon and body; run the check when executed directly.
if __name__ == "__main__":
	test()

Scraping solution

# `requests` is used below but was never imported in this snippet.
import requests
from bs4 import BeautifulSoup

# The same Session object issues both the GET and the POST below.
s = requests.Session()

# Fetch the form page and read the two hidden fields it carries.
r = s.get("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
# The parser is named explicitly, matching the other snippets in these notes.
soup = BeautifulSoup(r.text, "lxml")
viewstate_element = soup.find(id="__VIEWSTATE")
viewstate = viewstate_element["value"]
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidation_element["value"]

# Submit the form for airport "BOS" and carrier "VX", echoing the hidden
# fields back. "__EVENTARGUMENT" is included to match the make_request()
# snippets in these notes, which send it as an empty string.
r = s.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
	data={'AirportList' : "BOS",
		'CarrierList' : "VX",
		'Submit' : "Submit",
		'__EVENTTARGET' : "",
		'__EVENTARGUMENT' : "",
		'__EVENTVALIDATION' : eventvalidation,
		'__VIEWSTATE' : viewstate})

# Save the response; the with-block closes the file, so the data is flushed.
with open("virgin_and_logan_airport.html", "w") as f:
	f.write(r.text)

Using Beautiful Soup

import requests
from bs4 import BeautifulSoup
import json

# Name of a saved HTML page; presumably the file handed to extract_data() - confirm.
html_page = "page_source.html"

def extract_data(page):
	"""Read the hidden form fields from the HTML file ``page``.

	Returns a dict with the keys "eventvalidation" and "viewstate", holding
	the ``value`` attribute of the elements whose ids are
	``__EVENTVALIDATION`` and ``__VIEWSTATE``.

	Raises TypeError when either element is absent, because ``soup.find``
	then returns None and the subscript fails.
	"""
	data = {"eventvalidation": "",
			"viewstate": ""}
	# The with-line originally ended in ";" instead of ":" (a syntax error).
	with open(page, "r") as html:
		soup = BeautifulSoup(html, "lxml")
		ev = soup.find(id="__EVENTVALIDATION")
		data["eventvalidation"] = ev["value"]

		vs = soup.find(id="__VIEWSTATE")
		data["viewstate"] = vs["value"]

	return data

def make_request(data):
	"""POST the form for airport "BOS" and carrier "VX"; return the response text.

	``data`` must provide the keys "eventvalidation" and "viewstate"; a
	missing key raises ``KeyError``.
	"""
	eventvalidation = data["eventvalidation"]
	viewstate = data["viewstate"]

	# This statement was dedented to column 0, which ended the function early
	# and left the `return` below orphaned.
	r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
			data={'AirportList' : "BOS",
					'CarrierList' : "VX",
					'Submit' : "Submit",
					'__EVENTTARGET' : "",
					'__EVENTARGUMENT' : "",
					'__EVENTVALIDATION' : eventvalidation,
					'__VIEWSTATE' : viewstate})

	return r.text

def test():
	"""Check that extract_data() fills in both hidden form fields of html_page."""
	# NOTE(review): the original `def` had no body. These checks are a minimal
	# reconstruction based on the empty-string defaults that extract_data()
	# starts from - replace with the intended assertions if they are known.
	data = extract_data(html_page)
	assert data["eventvalidation"] != ""
	assert data["viewstate"] != ""

BeautifulSoup

Learn about BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup

def options(soup, id):
	"""Return the ``value`` of every ``option`` inside the element with id ``id``.

	``soup`` must offer ``find(id=...)`` and the element it returns must
	offer ``find_all('option')``. Values are returned in the order found.
	"""
	select_element = soup.find(id=id)
	return [entry['value'] for entry in select_element.find_all('option')]

def print_list(label, codes):
	"""Print a blank line, then ``label:``, then each item of ``codes`` on its own line."""
	# The "%" operator was missing between the format string and `label`.
	print("\n%s:" % label)
	for c in codes:
		print(c)

def main():
	"""Print the carrier and airport option values found in the saved page.

	Reads "virgin_and_logan_airport.html" and lists the option values of the
	elements with ids "CarrierList" and "AirportList".
	"""
	# The with-block closes the file; it was previously opened and never closed.
	# The parser is named explicitly, matching the other snippets in these notes.
	with open("virgin_and_logan_airport.html") as html:
		soup = BeautifulSoup(html, "lxml")

	codes = options(soup, 'CarrierList')
	print_list("Carriers", codes)

	codes = options(soup, 'AirportList')
	print_list("Airports", codes)

Extracting XML Data

import xml.etree.ElementTree as ET

# XML document that test() loads through get_root().
article_file = "exampleResearchArticle.xml"

def get_root(fname):
	"""Parse the XML document ``fname`` and return its root element.

	``fname`` is passed straight to ``ET.parse``, so it may be a file name
	or an open file object.
	"""
	tree = ET.parse(fname)
	return tree.getroot()

def get_authors(root):
	"""Collect the author records found under ``./fm/bibl/aug/au`` in ``root``.

	Returns a list with one dict per ``au`` element, in the order found.
	Each dict has the keys "fnm", "snm" and "email", holding the text of
	the child element of the same name, or None when that child is absent.
	"""
	authors = []
	for author in root.findall('./fm/bibl/aug/au'):
		data = {
			"fnm": None,
			"snm": None,
			"email": None
		}
		for field in ("fnm", "snm", "email"):
			node = author.find('./' + field)
			# A missing child keeps the None default instead of raising
			# AttributeError on `None.text`.
			if node is not None:
				data[field] = node.text

		authors.append(data)

	return authors

def test():
	"""Check get_authors() against the expected author records for article_file."""
	# Expected author records; only the first two entries are compared below.
	solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

	root = get_root(article_file)
	data = get_authors(root)

	# The first record must match on every field; the second only on "fnm".
	assert data[0] == solution[0]
	assert data[1]["fnm"] == solution[1]["fnm"]