from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"


def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files


def process_file(f):
    # The filename encodes the carrier and airport, e.g. "FL-ATL.html".
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # Parsing the flight table out of the page is the exercise.
    return data


def test():
    print "Running a simple test..."
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    for f in files:
        data += process_file(f)
    assert len(data) == 399  # total number of rows across all files
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    print "... success!"


if __name__ == "__main__":
    test()
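process_file above returns an empty list; filling in the table parsing is the exercise. A minimal sketch of one possible completion, assuming the downloaded BTS pages keep the monthly figures in a table with class dataTDRight, columns ordered year, month, domestic, international, with header and TOTAL rows to skip (all assumptions about the saved HTML):

def process_file(f):
    # Hypothetical completion: the table class "dataTDRight" and the
    # column order (year, month, domestic, international) are assumptions
    # about the downloaded BTS pages.
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table", {"class": "dataTDRight"})
        for tr in table.find_all("tr"):
            cells = [td.text.strip() for td in tr.find_all("td")]
            # Skip the header row and the yearly TOTAL rows.
            if len(cells) < 4 or cells[1] in ("Month", "TOTAL"):
                continue
            entry = dict(info)
            entry["year"] = int(cells[0])
            entry["month"] = int(cells[1])
            entry["flights"] = {"domestic": int(cells[2].replace(",", "")),
                                "international": int(cells[3].replace(",", ""))}
            data.append(entry)
    return data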
List and dictionary
List: […] denotes a list.
Dictionary: {…} holds key-value pairs and is called a dict. If the original list is made up of two-element tuples, it can be passed straight to the dict function, as in the example below.
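A quick illustration (the carrier codes here are just sample data):

pairs = [("FL", "AirTran"), ("VX", "Virgin America")]
carriers = dict(pairs)  # {'FL': 'AirTran', 'VX': 'Virgin America'}
print carriers["VX"]    # Virgin America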
Airport List
from bs4 import BeautifulSoup

html_page = "options.html"


def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # Extracting the airport codes is the exercise.
    return data


def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data


if __name__ == "__main__":
    test()
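The extraction itself is the quiz; a sketch of how it might look, assuming the codes sit in the select element with id AirportList (the same id used in the Beautiful Soup example further down) and that aggregate "All…" options should be dropped:

def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        airport_list = soup.find(id="AirportList")
        for option in airport_list.find_all("option"):
            # Skipping aggregates such as "All" is an assumption
            # about the contents of options.html.
            if "All" not in option["value"]:
                data.append(option["value"])
    return data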
Carrier List
from bs4 import BeautifulSoup
import requests

html_page = "options.html"


def extract_carriers(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # Extracting the carrier codes is the exercise.
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': airport,
                            'CarrierList': carrier,
                            'Submit': 'Submit',
                            "__EVENTTARGET": "",
                            "__EVENTARGUMENT": "",
                            "__EVENTVALIDATION": eventvalidation,
                            "__VIEWSTATE": viewstate
                            })
    return r.text


def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data


if __name__ == "__main__":
    test()
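As with the airports, extract_carriers above is a stub. A sketch of one way to complete it, reusing the CarrierList id that appears later in these notes and again assuming the aggregate "All…" options are excluded:

def extract_carriers(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        carrier_list = soup.find(id="CarrierList")
        for option in carrier_list.find_all("option"):
            # Skipping aggregates such as "All" and "AllUS" is an
            # assumption about the contents of options.html.
            if "All" not in option["value"]:
                data.append(option["value"])
    return data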
Scraping solution
from bs4 import BeautifulSoup
import requests

s = requests.Session()

r = s.get("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2")
soup = BeautifulSoup(r.text, "lxml")
viewstate_element = soup.find(id="__VIEWSTATE")
viewstate = viewstate_element["value"]
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidation_element["value"]

r = s.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
           data={'AirportList': "BOS",
                 'CarrierList': "VX",
                 'Submit': "Submit",
                 '__EVENTTARGET': "",
                 '__EVENTARGUMENT': "",
                 '__EVENTVALIDATION': eventvalidation,
                 '__VIEWSTATE': viewstate})

with open("virgin_and_logan_airport.html", "w") as f:
    f.write(r.text)
Using Beautiful Soup
import requests
from bs4 import BeautifulSoup
import json

html_page = "page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]
        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': "BOS",
                            'CarrierList': "VX",
                            'Submit': "Submit",
                            '__EVENTTARGET': "",
                            '__EVENTARGUMENT': "",
                            '__EVENTVALIDATION': eventvalidation,
                            '__VIEWSTATE': viewstate})
    return r.text
def test():
    # Minimal check that both hidden-form values were extracted.
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["viewstate"] != ""
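A small usage sketch chaining the two functions together (assuming page_source.html is a saved copy of the form page):

if __name__ == "__main__":
    # Pull the hidden ASP.NET fields from the saved page, then replay
    # the form submission with them.
    data = extract_data(html_page)
    print make_request(data)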
BeautifulSoup
Learn about BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
from bs4 import BeautifulSoup


def options(soup, id):
    option_values = []
    carrier_list = soup.find(id=id)
    for option in carrier_list.find_all('option'):
        option_values.append(option['value'])
    return option_values


def print_list(label, codes):
    print "\n%s:" % label
    for c in codes:
        print c


def main():
    soup = BeautifulSoup(open("virgin_and_logan_airport.html"), "lxml")
    codes = options(soup, 'CarrierList')
    print_list("Carriers", codes)
    codes = options(soup, 'AirportList')
    print_list("Airports", codes)


if __name__ == "__main__":
    main()
Extracting XML Data
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": None,
            "snm": None,
            "email": None
        }
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        authors.append(data)
    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)
    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]


if __name__ == "__main__":
    test()
Parsing XML
import xml.etree.ElementTree as ET

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

print "\nChildren of root:"
for child in root:
    print child.tag


import xml.etree.ElementTree as ET

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

title = root.find('./fm/bibl/title')
title_text = ""
for p in title:
    title_text += p.text
print "\nTitle:\n", title_text

print "\nAuthor email addresses:"
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print email.text
Wrangling JSON
Some important concepts:
– using the codecs module to write unicode files
– using authentication with web APIs
– using an offset to page through web API results
All three appear in the program below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = {"popular": "",
           "article": ""}


def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    data = get_from_file(kind, period)
    titles = []
    urls = []
    for article in data:
        section = article["section"]
        title = article["title"]
        titles.append({section: title})
        if "media" in article:
            for m in article["media"]:
                for mm in m["media-metadata"]:
                    if mm["format"] == "Standard Thumbnail":
                        urls.append(mm["url"])
    return (titles, urls)


def query_site(url, target, offset):
    # The API returns at most 20 results per request; the offset selects
    # which "page" of results to fetch.
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print "You need to register for a NYTimes Developer account to run this program."
        print "See Instructor notes for information"
        return False
    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)
    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    if days not in [1, 7, 30]:
        print "time period can be 1, 7, 30 days only"
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print "kind can be only one of viewed/shared/emailed"
        return False
    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)
    return data


def save_file(kind, period):
    # The first request tells us how many results there are in total.
    data = get_popular(URL_POPULAR, kind, period)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period),
                     encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        v.write(json.dumps(full_data, indent=2))


def test():
    titles, urls = article_overview("viewed", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'


if __name__ == "__main__":
    test()
Python XML
https://wiki.python.org/moin/PythonXml