BeautifulSoup

Learn about BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup

def options(soup, id):
	"""Return the ``value`` attribute of every <option> found under one element.

	soup -- parsed document exposing ``find(id=...)``
	id   -- id attribute of the element whose <option> children are read
	"""
	container = soup.find(id=id)
	return [choice['value'] for choice in container.find_all('option')]

def print_list(label, codes):
	"""Print a blank line, then ``label`` followed by a colon, then each code on its own line.

	label -- heading text
	codes -- iterable of values to print, one per line
	"""
	# The format operator was missing between the template and its argument.
	# Single-argument print(...) parses the same on Python 2 and Python 3.
	print("\n%s:" % (label,))
	for code in codes:
		print(code)

def main():
	"""Parse virgin_and_logan_airport.html and print its carrier and airport option values."""
	# Use a context manager so the file handle is closed as soon as parsing is done.
	with open("virgin_and_logan_airport.html") as html:
		soup = BeautifulSoup(html)

	codes = options(soup, 'CarrierList')
	print_list("Carriers", codes)

	codes = options(soup, 'AirportList')
	print_list("Airports", codes)

Wrangling JSON

some important concepts
– using codecs module to write unicode files
– using authentication with web APIs
– using offset when accessing web APIs

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import codecs
import requests

# Base service URL and the most-popular endpoint built on top of it.
URL_MAIN = "http://api.nytimes.com/svc/"
URL_POPULAR = "{0}mostpopular/v2/".format(URL_MAIN)
# One API key per service; both are empty until filled in by the user.
API_KEY = dict(popular="", article="")

def get_from_file(kind, period):
	"""Load and return the JSON stored in ``popular-<kind>-<period>.json``."""
	path = "popular-{0}-{1}.json".format(kind, period)
	with open(path, "r") as handle:
		return json.load(handle)

def article_overview(kind, period):
	"""Summarise the saved articles for ``kind``/``period``.

	Returns ``(titles, urls)``: ``titles`` holds one ``{section: title}``
	dict per article, and ``urls`` holds the url of every media-metadata
	entry whose format is "Standard Thumbnail".
	"""
	titles = []
	urls = []

	for article in get_from_file(kind, period):
		titles.append({article["section"]: article["title"]})
		if "media" not in article:
			continue
		for media in article["media"]:
			urls.extend(
				meta["url"]
				for meta in media["media-metadata"]
				if meta["format"] == "Standard Thumbnail"
			)
	return (titles, urls)

def query_site(url, target, offset):
	"""GET ``url`` and return the decoded JSON body.

	url    -- full endpoint URL
	target -- key into API_KEY selecting which API key is sent
	offset -- value sent as the ``offset`` query parameter

	Returns False (after printing a hint) while either API key is still
	empty. A non-OK response goes through ``raise_for_status``.
	"""
	if API_KEY["popular"] == "" or API_KEY["article"] == "":
		# Single-argument print(...) parses the same on Python 2 and Python 3.
		print("You need to register for NYTimes Developer account to run this program.")
		print("See Instructor notes for information")
		return False
	params = {"api-key": API_KEY[target], "offset": offset}
	r = requests.get(url, params=params)

	if r.status_code == requests.codes.ok:
		return r.json()
	else:
		r.raise_for_status()

def get_popular(url, kind, days, section="all-sections", offset=0):
	"""Query the most-popular endpoint and return whatever query_site returns.

	url     -- base URL that the ``most<kind>/<section>/<days>.json`` path is appended to
	kind    -- one of "viewd", "shared", "emailed"
	days    -- 1, 7 or 30
	section -- section path component
	offset  -- passed through to query_site

	Returns False (after printing the reason) when ``days`` or ``kind`` is
	not one of the accepted values.
	"""
	if days not in [1, 7, 30]:
		# Single-argument print(...) parses the same on Python 2 and Python 3.
		print("time period can be 1, 7, 30 days only")
		return False
	if kind not in ["viewd", "shared", "emailed"]:
		print("kind can be only one of viewd/shared/emailed")
		return False

	# NOTE(review): "viewd" looks like a misspelling of "viewed", which would
	# make the path "mostviewd" -- confirm against the API before relying on it.
	url += "most{0}/{1}/{2}.json".format(kind, section, days)
	return query_site(url, "popular", offset)

def save_file(kind, period):
	"""Fetch every result page for ``kind``/``period`` and write them to one JSON file.

	The output file is ``popular-<kind>-<period>.json``, UTF-8 encoded.
	Nothing is written when the first query returns a falsy value.
	"""
	# Ask for the result count of the query actually being saved, not a fixed one.
	first_page = get_popular(URL_POPULAR, kind, period)
	if not first_page:
		# get_popular / query_site have already printed the reason.
		return
	num_results = first_page["num_results"]

	# Collect everything before opening the file so a failed request does not
	# leave a truncated output file behind.
	full_data = []
	for offset in range(0, num_results, 20):
		data = get_popular(URL_POPULAR, kind, period, offset=offset)
		full_data += data["results"]

	with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v:
		v.write(json.dumps(full_data, indent=2))

def test():
	"""Check article_overview against the saved one-day data."""
	titles, urls = article_overview("viewd", 1)
	assert len(titles) == 20
	assert len(urls) == 30
	# NOTE(review): headline corrected from 'Professors, Wee need you!', which
	# looks like a transcription typo -- confirm against the saved JSON.
	assert titles[2] == {'Opinion': 'Professors, We Need You!'}
	assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg'

if __name__ == "__main__":
	# A bare name is only an expression; the parentheses are what run the checks.
	test()

Python XML
https://wiki.python.org/moin/PythonXml

Excel to CSV

# -*- coding: utf-8 -*-

import csv
import os
from zipfile import ZipFile

import xlrd

# Spreadsheet read by parse_file; open_zip extracts from "<datafile>.zip".
datafile = "2013_ERCOT_Hourly_Load_Data.xls"
# Pipe-delimited CSV written by save_file and read back by test.
outfile = "2013_Max_Loads.csv"

def open_zip(datafile):
	"""Extract every member of ``<datafile>.zip`` into the current working directory."""
	archive_name = '{0}.zip'.format(datafile)
	with ZipFile(archive_name, 'r') as archive:
		archive.extractall()

def parse_file(datafile):
	"""Find the maximum value, and when it occurred, for columns 1-8 of the first sheet.

	Returns a dict keyed by each column's header cell (row 0) whose values are
	``{"maxval": <column maximum>, "maxtime": <tuple from xlrd.xldate_as_tuple>}``.
	The timestamp is read from column 0 of the row holding the maximum.
	"""
	workbook = xlrd.open_workbook(datafile)
	sheet = workbook.sheet_by_index(0)
	data = {}

	for n in range(1, 9):
		station = sheet.cell_value(0, n)
		cv = sheet.col_values(n, start_rowx=1, end_rowx=None)

		maxval = max(cv)
		# cv starts at sheet row 1, so shift the list index back to a sheet row.
		maxpos = cv.index(maxval) + 1
		maxtime = sheet.cell_value(maxpos, 0)
		realtime = xlrd.xldate_as_tuple(maxtime, 0)
		data[station] = {"maxval": maxval,
						"maxtime": realtime}

	# Report and return once, after all eight columns have been processed.
	print(data)
	return data

def save_file(data, filename):
	"""Write ``data`` to ``filename`` as pipe-delimited CSV with a header row.

	data     -- dict mapping a station name to ``{"maxtime": 6-tuple, "maxval": value}``;
	            only the first four fields of ``maxtime`` are written
	filename -- path of the file to create or overwrite
	"""
	with open(filename, "w") as f:
		w = csv.writer(f, delimiter='|')
		w.writerow(["Station", "Year", "Month", "Day", "Hour", "Max Load"])
		# Iterate the dict that was passed in; its keys are the station names.
		for s in data:
			year, month, day, hour, _, _ = data[s]["maxtime"]
			w.writerow([s, year, month, day, hour, data[s]["maxval"]])

def test():
	"""Run the unzip/parse/save pipeline and check the CSV it produces."""
	open_zip(datafile)
	data = parse_file(datafile)
	save_file(data, outfile)

	number_of_rows = 0
	stations = []

	ans = {'FAR_WEST' : {'Max Load': '2281.2722140000024',
						'Year': '2013',
						'Month': '6',
						'Day': '26',
						'Hour': '17'}}
	correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH',
						'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST']
	fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load']

	with open(outfile) as of:
		csvfile = csv.DictReader(of, delimiter='|')
		for line in csvfile:
			station = line['Station']
			if station == 'FAR_WEST':
				for field in fields:
					if field == 'Max Load':
						# Compare loads rounded to one decimal place.
						max_answer = round(float(ans[station][field]), 1)
						max_line = round(float(line[field]), 1)
						assert max_answer == max_line

					else:
						assert ans[station][field] == line[field]

			# Count and record every row, not only the FAR_WEST one.
			number_of_rows += 1
			stations.append(station)

		# These totals only make sense once every row has been read.
		assert number_of_rows == 8

		assert set(stations) == set(correct_stations)

# Run the checks only when this snippet is executed as a script.
if __name__ == "__main__":
	test()

using csv module

import csv
import os

# Directory and file name that test() joins into the path given to parse_file.
DATADIR = ""
DATAFILE = "745090.csv"

def parse_file(datafile):
	"""Placeholder: opens ``datafile`` but reads nothing and returns ``("", [])``.

	NOTE(review): the test() in this snippet expects a non-empty name and
	rows of data, so the parsing inside the ``with`` block is presumably
	still to be written -- confirm the file layout before filling it in.
	"""
	name = ""
	data = []
	with open(datafile, 'rb') as f:
		pass
	return (name, data)

def test():
	"""Check parse_file's name and three individual cells of its data."""
	name, data = parse_file(os.path.join(DATADIR, DATAFILE))

	assert name == "MOUNTAIN VIEW MOFFETT FLD NAS"
	# ((row, column), expected value), checked in the listed order.
	expected_cells = (
		((0, 1), "01:00"),
		((2, 0), "01/01/2005"),
		((2, 5), "2"),
	)
	for (row, col), value in expected_cells:
		assert data[row][col] == value

# Run the checks only when this snippet is executed as a script.
if __name__ == "__main__":
	test()

JSON Playground

def main():
	"""Look up the artist "Lucero", then print that artist, one release and all release titles."""
	results = query_by_name(ARTIST_URL, query_type["simple"], "Lucero")

	# The second entry of the result list is the one inspected here.
	artist_id = results["artist"][1]["id"]
	# Single-argument print(...) parses the same on Python 2 and Python 3.
	print("\nARTIST:")
	pretty_print(results["artist"][1])

	artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
	releases = artist_data["releases"]
	print("\nONE RELEASE:")
	# Index the list that was just fetched.
	pretty_print(releases[0], indent=2)
	release_titles = [r["title"] for r in releases]

	print("\nALL TITLES:")
	# Print the loop variable itself.
	for t in release_titles:
		print(t)

# Run main() only when this snippet is executed as a script.
if __name__ == '__main__':
	main()

Hold and roll

def hold(state):
	"""Return the state after the mover banks the pending points and passes the turn.

	state is ``(p, me, you, pending)``; the result is seen from the other
	player's side, so the two scores swap places and pending resets to 0.
	"""
	player, my_score, your_score, pending_points = state
	next_player = other[player]
	banked = my_score + pending_points
	return (next_player, your_score, banked, 0)

def roll(state, d):
	"""Return the state after the mover rolls ``d``.

	Any roll other than 1 is added to pending and the same player keeps the
	turn. A 1 gives the mover a single point, clears pending and passes the
	turn, with the two scores swapped.
	"""
	player, my_score, your_score, pending_points = state
	if d != 1:
		return (player, my_score, your_score, pending_points + d)
	return (other[player], your_score, my_score + 1, 0)

# Maps each player index to the opposing player's index; hold, roll and
# play_pig all look this table up under the name ``other``.
other = {1:0, 0:1}

def test_actions():
	"""Check hold and roll against hand-computed successor states."""
	s = (0, 10, 20, 30)
	assert hold(s) == (1, 20, 40, 0)
	# A roll other than 1 only grows pending; the mover's score stays at 10.
	assert roll(s, 6) == (0, 10, 20, 36)
	assert roll(s, 1) == (1, 20, 11, 0)
	return 'test_actions passes'
import random

# The two action names a strategy may return; clueless picks among these.
possible_moves = ['roll', 'hold']

def clueless(state):
	"""Strategy that ignores ``state`` and picks one of possible_moves at random."""
	move = random.choice(possible_moves)
	return move
def play_pig(A, B):
	"""Play one game between strategies A and B and return the winning strategy.

	A strategy is called with the current state and returns an action name.
	'hold' applies hold(); anything else rolls a die from 1 to 6. The game
	ends as soon as either score reaches ``goal``.
	"""
	players = [A, B]
	state = (0, 0, 0, 0)
	while True:
		mover, my_score, your_score, _pending = state
		if my_score >= goal:
			return players[mover]
		if your_score >= goal:
			return players[other[mover]]
		if players[mover](state) == 'hold':
			state = hold(state)
		else:
			state = roll(state, random.randint(1,6))

update wrapper

from functools import update_wrapper

# NOTE(review): the body refers to ``f``, which is not a parameter of genseq
# and is not defined in this snippet, while ``y`` and ``Ns`` are never used.
# The help() transcript that follows names n_ary_f, so this was presumably
# meant to be a decorator taking ``f`` -- confirm the intended header.
def genseq(x, y, Ns):
	def n_ary_f(x, *args):
		# Fold f over the arguments from the right: f(x, f(y, ...)); a lone x is returned unchanged.
		return x if not args else f(x, n_ary_f(*args))
	# Copy f's metadata onto the wrapper.
	update_wrapper(n_ary_f, f)
	return n_ary_f

def seq(x, y):
	"""Return the tagged tuple ``('seq', x, y)``."""
	return ('seq', x, y)

>>> help(seq)
Help on function n_ary_f in module __main__:

n_ary_f(x, *args)
@decorator
def trace(f):
	"""Wrap ``f`` so every call prints an indented entry line and exit line.

	The indentation grows with ``trace.level``, which is incremented on entry
	and decremented on exit, so nested calls appear nested in the output.
	"""
	indent = ' '
	def _f(*args):
		signature = '%s(%s)' % (f.__name__, ', '.join(map(repr, args)))
		# Single-argument print(...) parses the same on Python 2 and Python 3.
		print('%s--> %s' % (trace.level*indent, signature))
		trace.level += 1
		try:
			# The traced call itself; its value feeds the exit line below.
			result = f(*args)
			print('%s<-- %s === %s' % ((trace.level-1)*indent,
									signature, result))
		finally:
			# Restore the depth even when f raises.
			trace.level -= 1
		return result
	# Depth counter stored on the decorator; it must be set outside _f so it
	# exists before the first traced call runs.
	trace.level = 0
	return _f
[/python]

[python]
# Grammar for arithmetic expressions with function calls.
# NOTE(review): assumes grammar() separates the tokens of an alternative on
# whitespace, as every rule here is written -- confirm against grammar().
# Three repairs: the closing "[)]" is split from Exp, Funcall uses the
# otherwise unreferenced Exps rule, and Num is a balanced regex for an
# optionally signed number with an optional fractional part.
G = grammar(r"""
Exp => Term [+-] Exp | Term
Term => Factor [*/] Term | Factor
Factor => Funcall | Var | Num | [(] Exp [)]
Funcall => Var [(] Exps [)]
Exps => Exp [,] Exps | Exp
Var => [a-zA-Z_]\w*
Num => [-+]?[0-9]+([.][0-9]*)?
""")
# Grammar for JSON text.
# NOTE(review): assumes grammar() separates the tokens of an alternative on
# whitespace -- confirm against grammar(). Punctuation is therefore written
# as its own token (", " -> " , ", ": " -> " : ", "[[] []]"), and the int
# rule uses "-?" for an optional leading minus sign.
JSON = grammar("""
object => { } | { members }
members => pair , members | pair
pair => string : value
array => [[] []] | [[] elements []]
elements => value , elements | value
value => string | number | object | array | true | false | null
string => "[^"]*"
number => int frac exp | int frac | int exp | int
int => -?[1-9][0-9]*
frac => [.][0-9]+
exp => [eE][-+]?[0-9]+
""", whitespace=r'\s*')

def inverse(f, delta=1/128.):
def f_1(y):
x = 0
while f(x) < y: x += delta return x if (f(x)-y < y-f(x-delta)) else x-delta return f_1 def square(x): return x * x print sqrt(100) print sqrt(99) print sqrt(100000000) [/python]

Regular expression

def match1(p, text):
	"""Return True if the first character of ``text`` matches ``p``.

	``p`` matches when it is '.' or equals that first character. An empty
	``text`` never matches.
	"""
	if text:
		return p == '.' or p == text[0]
	return False

def match_star(p, pattern, text):
	"""Return True if zero or more of character ``p``, followed by ``pattern``, matches ``text``."""
	# Either the rest of the pattern matches right here (zero repetitions), or
	# one character is consumed by p and the same question is asked of the
	# remainder. The single-character test is match1: going through match()
	# would treat p as a whole pattern, so p == '$' on empty text would
	# recurse without ever consuming input.
	return (match(pattern, text) or
			(match1(p, text) and
				match_star(p, pattern, text[1:])))

# Single-argument print(...) parses the same on Python 2 and Python 3.
print(test())

api

# Pattern constructors: each returns a tuple whose first item names the operator.

def lit(string):
	"""Literal string."""
	return ('lit', string)


def seq(x, y):
	"""Pattern x followed by pattern y."""
	return ('seq', x, y)


def alt(x, y):
	"""Either pattern x or pattern y."""
	return ('alt', x, y)


def star(x):
	"""Pattern x wrapped in the 'star' operator."""
	return ('star', x)


def plus(x):
	"""x followed by star(x)."""
	return seq(x, star(x))


def opt(x):
	"""x is optional: either the empty literal or x."""
	return alt(lit(''), x)


def oneof(chars):
	"""The 'oneof' operator over the given characters, stored as a tuple."""
	return ('oneof', tuple(chars))


dot = ('dot',)
eol = ('eol',)
def search(pattern, text):
	"""Return the first non-None result of match() over successive suffixes of text, else None."""
	# "or 1" gives empty text a single attempt at position 0.
	for i in range(len(text) or 1):
		m = match(pattern, text[i:])
		# Compare with None explicitly: an empty-string match is still a match.
		if m is not None:
			return m
	return None

def match(pattern, text):
	"""Return the longest prefix of ``text`` that ``pattern`` matches, or None if there is none."""
	remainders = matchset(pattern, text)
	if remainders:
		# The shortest remainder left over corresponds to the longest match.
		shortest = min(remainders, key=len)
		return text[:len(text)-len(shortest)]
	return None

def components(pattern):
	"""Split a pattern tuple into ``(op, x, y)``, using None for operands it does not have."""
	first_arg = None if len(pattern) <= 1 else pattern[1]
	second_arg = None if len(pattern) <= 2 else pattern[2]
	op = pattern[0]
	return op, first_arg, second_arg
def matchset(pattern, text):
	"""Match ``pattern`` at the start of ``text``; return the set of possible remainders.

	An empty set means no match. Handles every operator the pattern
	constructors produce: lit, seq, alt, star, oneof, dot and eol.
	Raises ValueError for any other operator.
	"""
	op, x, y = components(pattern)
	if 'lit' == op:
		return set([text[len(x):]]) if text.startswith(x) else set()
	elif 'seq' == op:
		return set(t2 for t1 in matchset(x, text) for t2 in matchset(y, t1))
	elif 'alt' == op:
		return matchset(x, text) | matchset(y, text)
	elif 'dot' == op:
		return set([text[1:]]) if text else set()
	elif 'oneof' == op:
		# x is a tuple of characters; str.startswith accepts a tuple of options.
		return set([text[1:]]) if text.startswith(x) else set()
	elif 'eol' == op:
		return set(['']) if text == '' else set()
	elif 'star' == op:
		# Zero repetitions leaves text untouched; otherwise consume one x and
		# retry the whole star pattern. Skipping t1 == text avoids looping
		# forever when x can match the empty string.
		return (set([text]) |
				set(t2 for t1 in matchset(x, text)
					for t2 in matchset(pattern, t1) if t1 != text))
	else:
		raise ValueError('unknown pattern: %s' % (pattern,))

def seq(x, y):
	"""Return a matcher that applies ``y`` to every result of ``x(text)`` and unions the resulting sets."""
	return lambda text: set().union(*map(y, x(text)))

def alt(x, y):
	"""Return a matcher whose result is the union of ``x(text)`` and ``y(text)``."""
	# Both branches receive the same text.
	return lambda text: x(text) | y(text)
def genseq(x, y, Ns):
	"""Return every concatenation of an x-result and a y-result whose length is in ``Ns``.

	Both ``x`` and ``y`` are called with ``range(max(Ns)+1)``.
	"""
	Nss = range(max(Ns)+1)
	matches = set()
	for head in x(Nss):
		for tail in y(Nss):
			joined = head + tail
			if len(joined) in Ns:
				matches.add(joined)
	return matches

Compile word

def compile_word(word):
	"""Turn an uppercase word into a place-value expression; return anything else unchanged.

	For example 'YOU' becomes '(1*U+10*O+100*Y)'.
	"""
	if not word.isupper():
		return word
	terms = []
	# Walk the letters from the last one, which carries the lowest power of ten.
	for power, letter in enumerate(reversed(word)):
		terms.append('%s*%s' % (10**power, letter))
	return '(%s)' % '+'.join(terms)

Regular Expression
-> find a substring within a string

e.g. s = 'some long thing with words'
s.find('word')

'baa*!'

*  a*  zero or more a's
?  a?  zero or one a
.  a.  an a followed by any single character

def search(pattern, text):
	"""Return True if ``pattern`` matches ``text``.

	A leading '^' ties the match to the start of ``text``. Otherwise '.*' is
	put in front of the pattern before it is handed to match().
	"""
	if pattern.startswith('^'):
		# Drop the '^' itself (slice from index one).
		return match(pattern[1:], text)
	else:
		return match('.*' + pattern, text)

def match(pattern, text):
	"""Return True if ``pattern`` matches at the start of ``text``.

	Supports '$' as a whole pattern meaning end of text, and the one-character
	operators '*' and '?'. Single characters are compared through match1.
	"""
	if pattern == '':
		return True
	elif pattern == '$':
		return (text == '')
	elif len(pattern) > 1 and pattern[1] in '*?':
		# Split into the character, its operator and the rest of the pattern.
		p, op, pat = pattern[0], pattern[1], pattern[2:]
		if op == '*':
			return match_star(p, pat, text)
		# op == '?': try consuming one p first, then fall back to skipping it.
		if match1(p, text) and match(pat, text[1:]):
			return True
		return match(pat, text)
	else:
		return (match1(pattern[0], text) and
				match(pattern[1:], text[1:]))