import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": None,
            "snm": None,
            "email": None
        }
        data["fnm"] = author.find('./fnm').text
        data["snm"] = author.find('./snm').text
        data["email"] = author.find('./email').text
        authors.append(data)
    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]
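A quick usage example, assuming the same exampleResearchArticle.xml is on disk:

import pprint

root = get_root(article_file)
pprint.pprint(get_authors(root))
# [{'email': 'omer@extremegate.com', 'fnm': 'Omer', 'snm': 'Mei-Dan'}, ...]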
Parsing XML
import xml.etree.ElementTree as ET
import pprint

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

print "\nChildren of root:"
for child in root:
    print child.tag
import xml.etree.ElementTree as ET
import pprint

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

title = root.find('./fm/bibl/title')
title_text = ""
for p in title:
    title_text += p.text
print "\nTitle:\n", title_text

print "\nAuthor email addresses:"
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print email.text
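The same traversal pattern extends to XML attributes via Element.attrib. A small sketch, assuming (as in the course's example file) that each au element carries insr children whose iid attributes link authors to institutions:

for a in root.findall('./fm/bibl/aug/au'):
    # .attrib is a plain dict of the element's XML attributes
    insr = [i.attrib["iid"] for i in a.findall('./insr')]
    print a.find('snm').text, insr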
Wrangling JSON
Some important concepts (each sketched briefly after the list):
– using codecs module to write unicode files
– using authentication with web APIs
– using offset when accessing web APIs
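A minimal sketch tying the three together; the URL and api-key parameter mirror the NYTimes code below, and the key value is a placeholder:

# -*- coding: utf-8 -*-
import codecs
import json
import requests

url = "http://api.nytimes.com/svc/mostpopular/v2/mostviewed/all-sections/1.json"
params = {"api-key": "YOUR_KEY_HERE",   # authentication: the API wants a key per request
          "offset": 20}                 # offset: skip the first 20 results (page 2)
r = requests.get(url, params=params)

# codecs.open writes the decoded JSON back out as proper UTF-8
with codecs.open("page2.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(r.json(), indent=2))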
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import codecs
import requests

URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = URL_MAIN + "mostpopular/v2/"
API_KEY = {"popular": "",
           "article": ""}


def get_from_file(kind, period):
    filename = "popular-{0}-{1}.json".format(kind, period)
    with open(filename, "r") as f:
        return json.loads(f.read())


def article_overview(kind, period):
    data = get_from_file(kind, period)
    titles = []
    urls = []
    for article in data:
        section = article["section"]
        title = article["title"]
        titles.append({section: title})
        if "media" in article:
            for m in article["media"]:
                for mm in m["media-metadata"]:
                    if mm["format"] == "Standard Thumbnail":
                        urls.append(mm["url"])
    return (titles, urls)


def query_site(url, target, offset):
    # This will work only if you have registered for your own NYTimes API keys.
    if API_KEY["popular"] == "" or API_KEY["article"] == "":
        print "You need to register for a NYTimes Developer account to run this program."
        print "See Instructor notes for information"
        return False

    params = {"api-key": API_KEY[target], "offset": offset}
    r = requests.get(url, params=params)

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def get_popular(url, kind, days, section="all-sections", offset=0):
    if days not in [1, 7, 30]:
        print "time period can be 1, 7, 30 days only"
        return False
    if kind not in ["viewed", "shared", "emailed"]:
        print "kind can be only one of viewed/shared/emailed"
        return False

    url += "most{0}/{1}/{2}.json".format(kind, section, days)
    data = query_site(url, "popular", offset)
    return data


def save_file(kind, period):
    # The API returns 20 results per request, so page through with offset.
    data = get_popular(URL_POPULAR, kind, period)
    num_results = data["num_results"]
    full_data = []
    with codecs.open("popular-{0}-{1}.json".format(kind, period),
                     encoding='utf-8', mode='w') as v:
        for offset in range(0, num_results, 20):
            data = get_popular(URL_POPULAR, kind, period, offset=offset)
            full_data += data["results"]
        v.write(json.dumps(full_data, indent=2))


def test():
    titles, urls = article_overview("viewed", 1)
    assert len(titles) == 20
    assert len(urls) == 30
    assert titles[2] == {'Opinion': 'Professors, We Need You!'}
    assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'


if __name__ == "__main__":
    test()
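Typical flow, assuming valid keys have been filled into API_KEY: fetch and cache one day of most-viewed articles, then summarize from the cached file:

save_file("viewed", 1)                        # pages through the API, caches to disk
titles, urls = article_overview("viewed", 1)  # reads back popular-viewed-1.json
print titles[0]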
Python XML
https://wiki.python.org/moin/PythonXml
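The wiki lists several alternatives to ElementTree. For comparison, a sketch of the child-listing from the Parsing XML snippet above redone with the standard library's xml.dom.minidom, against the same example file:

from xml.dom import minidom

dom = minidom.parse('exampleResearchArticle.xml')
# childNodes includes whitespace text nodes, so keep only element nodes
for node in dom.documentElement.childNodes:
    if node.nodeType == node.ELEMENT_NODE:
        print node.tagName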
Excel to CSV
# -*- coding: utf-8 -*-
import xlrd
import os
import csv
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
outfile = "2013_Max_Loads.csv"


def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)
    data = {}
    # Columns 1-8 hold the regional stations; column 0 holds the timestamps.
    for n in range(1, 9):
        station = sheet.cell_value(0, n)
        cv = sheet.col_values(n, start_rowx=1, end_rowx=None)
        maxval = max(cv)
        maxpos = cv.index(maxval) + 1   # +1 compensates for the skipped header row
        maxtime = sheet.cell_value(maxpos, 0)
        realtime = xlrd.xldate_as_tuple(maxtime, 0)
        data[station] = {"maxval": maxval,
                         "maxtime": realtime}
    print data
    return data


def save_file(data, filename):
    # 'wb' keeps the Python 2 csv module from writing blank lines on Windows
    with open(filename, "wb") as f:
        w = csv.writer(f, delimiter='|')
        w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
        for s in data:
            year, month, day, hour, _, _ = data[s]["maxtime"]
            w.writerow([s, year, month, day, hour, data[s]["maxval"]])


def test():
    open_zip(datafile)
    data = parse_file(datafile)
    save_file(data, outfile)

    number_of_rows = 0
    stations = []

    ans = {'FAR_WEST': {'Max Load': '2281.2722140000024',
                        'Year': '2013',
                        'Month': '6',
                        'Day': '26',
                        'Hour': '17'}}
    correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
                        'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
    fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

    with open(outfile) as of:
        csvfile = csv.DictReader(of, delimiter='|')
        for line in csvfile:
            station = line['Station']
            if station == 'FAR_WEST':
                for field in fields:
                    # Compare the load rounded to one decimal to dodge float noise
                    if field == 'Max Load':
                        max_answer = round(float(ans[station][field]), 1)
                        max_line = round(float(line[field]), 1)
                        assert max_answer == max_line
                    else:
                        assert ans[station][field] == line[field]
            number_of_rows += 1
            stations.append(station)

        assert number_of_rows == 8
        assert set(stations) == set(correct_stations)


if __name__ == "__main__":
    test()
Using csv module
import csv
import os

DATADIR = ""
DATAFILE = "745090.csv"


def parse_file(datafile):
    # 745090.csv appears to start with a station-metadata row (name in the
    # second field), then a column-header row, then the hourly data rows.
    name = ""
    data = []
    with open(datafile, 'rb') as f:
        r = csv.reader(f)
        name = r.next()[1]   # station name from the metadata row
        r.next()             # skip the column-header row
        for row in r:
            data.append(row)
    return (name, data)


def test():
    datafile = os.path.join(DATADIR, DATAFILE)
    name, data = parse_file(datafile)
    assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
    assert data[0][1] == "01:00"
    assert data[2][0] == "01/01/2005"
    assert data[2][5] == "2"


if __name__ == "__main__":
    test()
JSON Playground
def main():
    # query_by_name, query_site, pretty_print, ARTIST_URL and query_type are
    # helper definitions from the course's musicbrainz module (not shown here).
    results = query_by_name(ARTIST_URL, query_type["simple"], "Lucero")
    artist_id = results["artist"][1]["id"]

    print "\nARTIST:"
    pretty_print(results["artist"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]

    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)

    release_titles = [r["title"] for r in releases]
    print "\nALL TITLES:"
    for t in release_titles:
        print t


if __name__ == '__main__':
    main()
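Those helpers are not defined above; a minimal sketch of what they could look like against the MusicBrainz web service. The query_type parameter dicts and the "artist:" query prefix are assumptions, not verified against the current API:

import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# extra include-parameters sent with each query style
query_type = {"simple": {},
              "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # uid lets us hit a specific entity, e.g. artist/<id>?inc=releases
    params = dict(params)       # don't mutate the shared query_type dicts
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    r.raise_for_status()
    return r.json()


def query_by_name(url, params, name):
    params = dict(params)
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    print json.dumps(data, indent=indent, sort_keys=True)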
XLRD
#!/usr/bin/env python
import xlrd
from zipfile import ZipFile

datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col)
             for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    # Column 1 is the COAST region; skip the header row.
    cv = sheet.col_values(1, start_rowx=1, end_rowx=None)

    maxval = max(cv)
    minval = min(cv)

    # +1 compensates for the header row skipped above
    maxpos = cv.index(maxval) + 1
    minpos = cv.index(minval) + 1

    maxtime = sheet.cell_value(maxpos, 0)
    realtime = xlrd.xldate_as_tuple(maxtime, 0)
    mintime = sheet.cell_value(minpos, 0)
    realmintime = xlrd.xldate_as_tuple(mintime, 0)

    data = {
        'maxtime': realtime,
        'maxvalue': maxval,
        'mintime': realmintime,
        'minvalue': minval,
        'avgcoast': sum(cv) / float(len(cv))
    }
    return data


def test():
    open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)
Reading Excel file
import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col)
             for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    print "\nList Comprehension"
    print "data[3][2]:",
    print data[3][2]

    print "\nCells in a nested loop:"
    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            if row == 50:
                print sheet.cell_value(row, col)

    return data
import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    data = [[sheet.cell_value(r, col)
             for col in range(sheet.ncols)]
            for r in range(sheet.nrows)]

    # Min, max and average of the COAST column (column 1), skipping the
    # header row, as in the XLRD section above.
    cv = sheet.col_values(1, start_rowx=1, end_rowx=None)
    maxval = max(cv)
    minval = min(cv)
    maxpos = cv.index(maxval) + 1
    minpos = cv.index(minval) + 1

    data = {
        'maxtime': xlrd.xldate_as_tuple(sheet.cell_value(maxpos, 0), 0),
        'maxvalue': maxval,
        'mintime': xlrd.xldate_as_tuple(sheet.cell_value(minpos, 0), 0),
        'minvalue': minval,
        'avgcoast': sum(cv) / float(len(cv))
    }
    return data


data = parse_file(datafile)

assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
assert round(data['maxvalue'], 10) == round(18779.02551, 10)
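xlrd hands back Excel dates as floating-point serial numbers, which is why the snippets above convert them with xldate_as_tuple. A quick sketch of the conversion; the cell coordinates are illustrative:

import xlrd

datafile = "2013_ERCOT_Hourly_Load_Data.xls"
workbook = xlrd.open_workbook(datafile)
sheet = workbook.sheet_by_index(0)

raw = sheet.cell_value(1, 0)   # a float such as 41275.04167
# datemode tells xlrd whether the workbook uses the 1900 or 1904 epoch
print xlrd.xldate_as_tuple(raw, workbook.datemode)
# -> a (year, month, day, hour, minute, second) tuple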
Using CSV Module
import os
import pprint
import csv

DATADIR = ""
DATAFILE = "beatles-diskography.csv"


def parse_csv(datafile):
    data = []
    with open(datafile, 'rb') as sd:
        # DictReader maps each row to a dict keyed by the header line
        r = csv.DictReader(sd)
        for line in r:
            data.append(line)
    return data


if __name__ == '__main__':
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_csv(datafile)
    pprint.pprint(d)
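Each row comes back as a dict, so fields are accessed by column name rather than position (values taken from the CSV data at the end of this section):

d = parse_csv(os.path.join(DATADIR, DATAFILE))
print d[0]["Title"]              # Please Please Me
print d[0]["BPI Certification"]  # Gold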
Parsing CSV Files
import os

DATADIR = ""
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        header = f.readline().split(",")
        counter = 0
        for line in f:
            if counter == 10:
                break

            fields = line.split(",")
            entry = {}

            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip()

            data.append(entry)
            counter += 1

    return data


def test():
    # a simple test of your implementation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1',
                 'Label': 'Parlophone(UK)', 'Released': '22 March 1963',
                 'US Chart Position': '-', 'RIAA Certification': 'Platinum',
                 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1',
                 'Label': 'Parlophone(UK)', 'Released': '10 July 1964',
                 'US Chart Position': '-', 'RIAA Certification': '',
                 'BPI Certification': 'Gold'}

    assert d[0] == firstline
    assert d[9] == tenthline
Title,Released,Label,UK Chart Position,US Chart Position,BPI Certification,RIAA Certification
Please Please Me,22 March 1963,Parlophone(UK),1,-,Gold,Platinum
With the Beatles,22 November 1963,Parlophone(UK),1,-,Platinum,Gold
Beatlemania! With the Beatles,25 November 1963,Capitol(CAN),-,-,,
Introducing... The Beatles,10 January 1964,Vee-Jay(US),-,2,,
Meet the Beatles!,20 January 1964,Capitol(US),-,1,,5xPlatinum
Twist and Shout,3 February 1964,Capitol(CAN),-,-,,
The Beatles' Second Album,10 April 1964,Capitol(US),-,1,,2xPlatinum
The Beatles' Long Tall Sally,11 May 1964,Capitol(CAN),-,-,,
A Hard Day's Night,26 June 1964,United Artists(US)c,-,1,,4xPlatinum
,10 July 1964,Parlophone(UK),1,-,Gold,
Something New,20 July 1964,Capitol(US),-,2,,Platinum
Beatles for Sale,4 December 1964,Parlophone(UK),1,-,Gold,Platinum
Beatles '65,15 December 1964,Capitol(US),-,1,,3xPlatinum
Beatles VI,14 June 1965,"Parlophone(NZ), Capitol(US)",-,1,,Platinum
Help!,6 August 1965,Parlophone(UK),1,-,Platinum,
,13 August 1965,Capitol(US) c,-,1,,3xPlatinum
Rubber Soul,3 December 1965,Parlophone(UK),1,-,Platinum,
,6 December 1965,Capitol(US) c ,-,1,,6xPlatinum
Yesterday and Today,15 June 1966,Capitol(US),-,1,,2xPlatinum
Revolver,5 August 1966,Parlophone(UK),1,-,Platinum,
,8 August 1966,Capitol(US) c,-,1,,5xPlatinum
Sgt. Pepper's Lonely Hearts Club Band,1 June 1967,"Parlophone(UK), Capitol(US)",1,1,3xPlatinum,11xPlatinum
Magical Mystery Tour,27 November 1967,"Parlophone(UK), Capitol(US)",31[D],1,Platinum,6xPlatinum
The Beatles,22 November 1968,"Apple(UK), Capitol(US)",1,1,Platinum,19xPlatinum
Yellow Submarine,13 January 1969,"Apple(UK), Capitol(US)",3,2,Silver,Platinum
Abbey Road,26 September 1969,"Apple(UK), Capitol(US)",1,1,2xPlatinum,12xPlatinum
Let It Be,8 May 1970,"Apple(UK),United Artists(US)",1,1,Gold,4xPlatinum
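One caveat with the hand-rolled split(",") parser above: rows like Beatles VI quote a label that itself contains a comma ("Parlophone(NZ), Capitol(US)"), which a plain split breaks apart. The csv module honors the quoting:

import csv

line = 'Beatles VI,14 June 1965,"Parlophone(NZ), Capitol(US)",-,1,,Platinum'

print line.split(",")           # 8 fields: the quoted label is split in two
print next(csv.reader([line]))  # 7 fields: quoting handled correctly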