import xml.etree.cElementTree as ET
from collections import defaultdict
import re

osm_file = open("chicago_abbrev.osm", "r")

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    # count the trailing token of each street name (e.g. "Ave.", "Street")
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type] += 1

def print_sorted_dict(d):
    keys = sorted(d.keys(), key=lambda s: s.lower())
    for k in keys:
        print "%s: %d" % (k, d[k])

def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print_sorted_dict(street_types)

if __name__ == '__main__':
    audit()
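The street-type regex just keeps the last whitespace-free token of the name, optional trailing period included. A quick check with made-up street names:

import re

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)

print street_type_re.search("North Lincoln Ave.").group()   # Ave.
print street_type_re.search("West Addison Street").group()  # Street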
Processing Patents
import xml.etree.ElementTree as ET

PATENTS = 'patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

def split_file(filename):
    # patent.data concatenates several XML documents; write each one
    # out as a separate file named "{filename}-{n}"
    pass

def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split {} on the correct boundary!".format(fname)
            f.close()
        except IOError:
            print "Could not find file {}".format(fname)

if __name__ == "__main__":
    test()
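split_file is the exercise itself; a sketch of one way to do it, assuming each document inside patent.data begins with its own <?xml declaration:

def split_file(filename):
    # sketch only: cut the concatenated data at each "<?xml" declaration
    # and write every document to its own "{filename}-{n}" file
    with open(filename, "r") as f:
        content = f.read()
    documents = ["<?xml" + chunk for chunk in content.split("<?xml") if chunk]
    for n, document in enumerate(documents):
        with open("{}-{}".format(filename, n), "w") as out:
            out.write(document)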
Processing All
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"

def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()

def process_all(datadir):
    files = os.listdir(datadir)
    return files

def process_file(f):
    # each filename encodes the carrier and airport, e.g. "FL-ATL.html"
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # parse the monthly flight counts out of the page here
    return data

def test():
    print "Running a simple test..."
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    for f in files:
        data += process_file(f)

    assert len(data) == 399
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}

    print "... success!"

if __name__ == "__main__":
    test()
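process_file is left as the exercise. A possible completion, assuming the saved BTS pages hold the counts in a table of class "dataTDRight" whose data rows run year, month, domestic, international (the class name and column order are assumptions, not confirmed by the post):

def process_file(f):
    # sketch only: table class and row layout are assumptions
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table", {"class": "dataTDRight"})
        for tr in table.find_all("tr"):
            row = [td.get_text().strip() for td in tr.find_all("td")]
            # skip the header row and the yearly TOTAL rows
            if len(row) < 4 or row[1] in ("Month", "TOTAL"):
                continue
            data.append({"courier": info["courier"],
                         "airport": info["airport"],
                         "year": int(row[0]),
                         "month": int(row[1]),
                         "flights": {"domestic": int(row[2].replace(",", "")),
                                     "international": int(row[3].replace(",", ""))}})
    return data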
Lists and Dictionaries
List: […] denotes a list.
Dictionary: {…} holds key-value pairs and is called a dict. When the original list consists of two-element tuples, you can pass it directly to the dict function.
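A minimal illustration, with made-up values:

pairs = [("ATL", "Atlanta"), ("BOS", "Boston")]  # a list of two-element tuples
airports = dict(pairs)                           # {'ATL': 'Atlanta', 'BOS': 'Boston'}
print airports["BOS"]                            # Boston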
Airport List
from bs4 import BeautifulSoup

html_page = "options.html"

def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # collect the airport codes from the AirportList options here
    return data

def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data

if __name__ == "__main__":
    test()
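One way to fill in extract_airports, assuming the page's select element with id="AirportList" mixes real three-letter codes with aggregate entries whose values start with "All" (that filter is an assumption). The extract_carriers stub in the next section follows the same pattern against id="CarrierList":

def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        airport_list = soup.find(id="AirportList")
        for option in airport_list.find_all("option"):
            value = option["value"]
            if not value.startswith("All"):  # assumed filter for aggregate entries
                data.append(value)
    return data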
Carrier List
from bs4 import BeautifulSoup
import requests

html_page = "options.html"

def extract_carriers(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        # collect the carrier codes from the CarrierList options here
    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': airport,
                            'CarrierList': carrier,
                            'Submit': 'Submit',
                            "__EVENTTARGET": "",
                            "__EVENTARGUMENT": "",
                            "__EVENTVALIDATION": eventvalidation,
                            "__VIEWSTATE": viewstate})
    return r.text

def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data

if __name__ == "__main__":
    test()
Scraping solution
import requests
from bs4 import BeautifulSoup

# the Data_Elements page is an ASP.NET form: the server only accepts the POST
# if the session cookie and the hidden __VIEWSTATE and __EVENTVALIDATION
# tokens from the initial GET are echoed back with it
s = requests.Session()
r = s.get("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2")

soup = BeautifulSoup(r.text, "lxml")
viewstate_element = soup.find(id="__VIEWSTATE")
viewstate = viewstate_element["value"]
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidation_element["value"]

r = s.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
           data={'AirportList': "BOS",
                 'CarrierList': "VX",
                 'Submit': "Submit",
                 '__EVENTTARGET': "",
                 '__EVENTVALIDATION': eventvalidation,
                 '__VIEWSTATE': viewstate})

with open("virgin_and_logan_airport.html", "w") as f:
    f.write(r.text)
Using Beautiful Soup
import requests
from bs4 import BeautifulSoup

html_page = "page_source.html"

def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        ev = soup.find(id="__EVENTVALIDATION")
        data["eventvalidation"] = ev["value"]
        vs = soup.find(id="__VIEWSTATE")
        data["viewstate"] = vs["value"]
    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': "BOS",
                            'CarrierList': "VX",
                            'Submit': "Submit",
                            '__EVENTTARGET': "",
                            '__EVENTARGUMENT': "",
                            '__EVENTVALIDATION': eventvalidation,
                            '__VIEWSTATE': viewstate})
    return r.text

def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["viewstate"] != ""

if __name__ == "__main__":
    test()
BeautifulSoup
Learn about BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
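A minimal parse-and-search example in the same vein as the scrapers above (the markup here is made up):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<select id='CarrierList'><option value='FL'>AirTran</option></select>", "lxml")
for option in soup.find(id="CarrierList").find_all("option"):
    print option["value"]   # FL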
from bs4 import BeautifulSoup

def options(soup, id):
    # return the value attribute of every <option> under the element with this id
    option_values = []
    option_list = soup.find(id=id)
    for option in option_list.find_all('option'):
        option_values.append(option['value'])
    return option_values

def print_list(label, codes):
    print "\n%s:" % label
    for c in codes:
        print c

def main():
    soup = BeautifulSoup(open("virgin_and_logan_airport.html"), "lxml")
    codes = options(soup, 'CarrierList')
    print_list("Carriers", codes)
    codes = options(soup, 'AirportList')
    print_list("Airports", codes)

if __name__ == "__main__":
    main()
Extracting XML Data
import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": None,
            "snm": None,
            "email": None
        }
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        authors.append(data)
    return authors

def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]

if __name__ == "__main__":
    test()