pythonに慣れよう2

# coding: utf-8
x = 598
# Floor division keeps the integer result (199) on Python 3 as well;
# a plain "/" only truncates to an integer under Python 2.
print(x // 3)

[vagrant@localhost python]$ python app.py
199

# "and" is the logical AND operator: the result is True only when both
# operands are true, so this prints False.
print(True and False)

プログラミングの基礎やるとき、いつも思うんだが、これ何でFalseになるかね? (`and` は論理積なので、両辺がどちらも True のときだけ True になり、片方でも False なら結果は False になる。)
[vagrant@localhost python]$ python app.py
False

TrueとFalseなら、TrueFalseでいいと思うんだが。
美女 and 野獣 => 美女と野獣 でしょ。
美女 and 野獣 => 野獣 でもあながち間違いではないが。 あ、例がまずかった。

# print() already ends each message with a newline, so the explicit "\n"
# inside these strings produces an extra blank line after each one.
for subject, grade in (("Data Scraping", " +S\n"), ("Data Science", " +A\n")):
	print(subject + grade)

print() は末尾に自動で改行を付けるので、改行コードをいれなくても勝手に改行されますね(上の例では "\n" も入れているため、空行が1行余分に出力されています)。この辺は、phpと違うようです。
[vagrant@localhost python]$ python app.py
Data Scraping +S

Data Science +A

pythonに慣れよう

# coding: utf-8
# Smallest possible script: write a greeting to standard output.
print("hello world")

[vagrant@localhost python]$ python app.py
hello world

変数

# coding: utf-8

# A variable is created by assignment; no declaration is needed.
msg = "機械学習頑張るぞ!"
print(msg)

[vagrant@localhost python]$ python app.py
機械学習頑張るぞ!

Python では 定数 はサポートされていない。慣習的に大文字とアンダーバー(_)のみの変数が固定値を表現する。

# Upper-case names mark these values as constants by convention only;
# the language does not prevent them from being reassigned.
PI = 3.14
MAX_BUFFER_SIZE = 1024
# Inside a string literal "\n" is a line break and "\t" is a tab.
msg = "機械学習\n頑張る\tぞ!"
print(msg)

[vagrant@localhost python]$ python app.py
機械学習
頑張る ぞ!

# A triple-quoted string may span several lines; the line break inside the
# literal is kept as part of the value.
html = """<html><body>machine learning</body>
</html>"""
print(html)

[vagrant@localhost python]$ python app.py
<html><body>machine learning</body>
</html>

python api sample2

import os
import urllib
import webapp2
import jinja2

from apiclient.discovery import build
from optparse import OptionParser

# Jinja2 environment that loads templates from the directory containing
# this file, with the autoescape extension registered.
JINJA_ENVIRONMENT = jinja2.Environment(
	extensions=['jinja2.ext.autoescape'],
	loader=jinja2.FileSystemLoader(os.path.dirname(__file__)))

# API key and service coordinates passed to apiclient's build().
# The handler refuses to call the API while the key is still "REPLACE_ME".
DEVELOPER_KEY = "REPLACE_ME"
# Misspelled legacy name, kept so that any existing reference keeps working.
DEVELPOER_KEY = DEVELOPER_KEY
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

class MainHandler(webapp2.RequestHandler):
	"""Request handler that runs a fixed YouTube search and renders the hits."""

	def get(self):
		"""Search YouTube for "Hello" and render the results with index.html.

		While DEVELOPER_KEY is still the "REPLACE_ME" placeholder, only a
		setup hint is written to the response and no API call is made.
		"""
		if DEVELOPER_KEY == "REPLACE_ME":
			self.response.write("""You must set up a project and get an API key
									to run this project. Please visit 
									<landing page> to do so.""")
		else:
			youtube = build(
				YOUTUBE_API_SERVICE_NAME,
				YOUTUBE_API_VERSION,
				developerKey=DEVELOPER_KEY)
			search_response = youtube.search().list(
				q="Hello",
				part="id,snippet",
				maxResults=5
				).execute()

			videos = []
			channels = []
			playlists = []

			# Sort each hit into the list matching its "kind", formatted
			# as "<title> (<id>)".
			for search_result in search_response.get("items", []):
				if search_result["id"]["kind"] == "youtube#video":
					videos.append("%s (%s)" % (search_result["snippet"]["title"],
						search_result["id"]["videoId"]))
				elif search_result["id"]["kind"] == "youtube#channel":
					channels.append("%s (%s)" % (search_result["snippet"]["title"],
						search_result["id"]["channelId"]))
				elif search_result["id"]["kind"] == "youtube#playlist":
					playlists.append("%s (%s)" % (search_result["snippet"]["title"],
						search_result["id"]["playlistId"]))

			template_values = {
				'videos': videos,
				'channels': channels,
				'playlists': playlists
			}

			self.response.headers['Content-type'] = 'text/plain'
			template = JINJA_ENVIRONMENT.get_template('index.html')
			self.response.write(template.render(template_values))


# WSGI application object: every path is routed to MainHandler.
# It must live at module level, not inside the handler class.
app = webapp2.WSGIApplication([
	('/.*', MainHandler)
	], debug=True)

Optimizer

an optimizer
– find minimum values of functions
– build parameterized models based on data
– refine allocations to stocks in portfolios
f(x) = x^2 + x^3 + s
f(x) = (x-1.5)^2 + 0.5

"""Minimize an objective function, using SciPy."""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as spo

def f(X):
	"""Given a scalar X, return some value (a real number).

	Evaluates (X - 1.5)**2 + 0.5 and prints every evaluation, so the calls
	made by the optimizer can be followed on the console.
	"""
	Y = (X - 1.5)**2 + 0.5
	# A single-argument print() call behaves the same on Python 2 and 3.
	print("X = {}, Y = {}".format(X, Y))
	return Y

def test_run():
	"""Minimize f with SciPy's SLSQP method, starting from X = 2.0, and print the result."""
	Xguess = 2.0
	min_result = spo.minimize(f, Xguess, method='SLSQP', options={'disp': True})
	# print() calls instead of Python 2 print statements, so the script
	# also runs on Python 3.
	print("Minima found at:")
	print("X = {}, Y = {}".format(min_result.x, min_result.fun))

if __name__ == "__main__":
	test_run()

Pandas Fillna()

Pandas Fillna() documentation
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)

fillna(method='ffill')
"""Plot a histogram"""

import pandas as pd
import matplotlib.pyplot as plt

from util import get_data, plot_data

def compute_daily_returns(df):
	"""Compute and return the daily return values of *df*.

	Every row after the first becomes (row / previous row) - 1. The first
	row has no predecessor and is set to 0. The input frame is copied, not
	modified.
	"""
	daily_returns = df.copy()
	# .values strips the index from the shifted slice so the division pairs
	# rows by position instead of aligning them on index labels.
	daily_returns[1:] = (df[1:] / df[:-1].values) - 1
	# DataFrame.ix was removed from pandas; .iloc is the positional accessor.
	daily_returns.iloc[0, :] = 0
	return daily_returns

def test_run():
	"""Plot SPY prices for 2009-2012, then plot the daily returns computed from them."""
	dates = pd.date_range('2009-01-01','2012-12-31')
	symbols = ['SPY']
	df = get_data(symbols, dates)
	plot_data(df)

	# Assignment, not subtraction: the previous "-" left daily_returns
	# undefined and raised NameError.
	daily_returns = compute_daily_returns(df)
	plot_data(daily_returns, title="Daily returns", ylabel="Daily returns")

if __name__ == "__main__":
	test_run()

scatterplots in python

"""Scatterplot."""

import pandas as pd
import matplotlib.pyplot as plt

from util import get_data, plot_data

def compute_daily_returns(df):
	"""Compute and return the daily return values of *df*.

	Every row after the first becomes (row / previous row) - 1. The first
	row has no predecessor and is set to 0. The input frame is copied, not
	modified.
	"""
	daily_returns = df.copy()
	# .values strips the index from the shifted slice so the division pairs
	# rows by position instead of aligning them on index labels.
	daily_returns[1:] = (df[1:] / df[:-1].values) - 1
	# DataFrame.ix was removed from pandas; .iloc is the positional accessor.
	daily_returns.iloc[0, :] = 0
	return daily_returns

def test_run():
	"""Scatter-plot XOM daily returns against SPY daily returns for 2009-2012."""
	date_range = pd.date_range('2009-01-01', '2012-12-31')
	tickers = ['SPY', 'XOM', 'GLD']
	prices = get_data(tickers, date_range)

	# x axis: SPY returns, y axis: XOM returns.
	returns = compute_daily_returns(prices)
	returns.plot(kind='scatter', x='SPY', y='XOM')
	plt.show()

if __name__ == "__main__":
	test_run()

Arithmetic operations

import numpy as np

def test_run():
	"""Print a 2x5 integer array and the result of multiplying it by 2."""
	a = np.array([(1, 2, 3, 4, 5),(10, 20, 30, 40, 50)])
	# Single-argument print() calls give the same output on Python 2 and 3
	# (the old print statements are a SyntaxError on Python 3).
	print("Original array a:\n{}".format(a))

	# Multiplying by a scalar is applied to every element.
	print("\nMultiply a by 2:\n{}".format(2 * a))

if __name__ == "__main__":
	test_run()

Rolling statistics as a signal for buying opportunities
rolling standard deviation

def test_run():
	"""Plot 2012 SPY prices together with their rolling mean over a 20-row window."""
	dates = pd.date_range('2012-01-01','2012-12-31')
	symbols = ['SPY']
	df = get_data(symbols, dates)

	# Keep the axes object so the rolling mean can be drawn on the same plot.
	ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')

	# pd.rolling_mean() was removed from pandas; Series.rolling().mean()
	# is the replacement and computes the same values.
	rm_SPY = df['SPY'].rolling(window=20).mean()

	rm_SPY.plot(label='Rolling mean', ax=ax)

Array attributes

import numpy as np

def test_run():
	"""Create a 5x4 array of random floats and print it followed by its shape."""
	a = np.random.random((5, 4))
	# print() calls instead of Python 2 print statements.
	print(a)
	print(a.shape)

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Create a 5x4 random array and print its total number of elements."""
	a = np.random.random((5, 4))
	# print() call instead of a Python 2 print statement.
	print(a.size)

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Print a 5x4 array of random integers in [0, 10) drawn with a fixed seed."""
	# Seeding makes the "random" array identical on every run.
	np.random.seed(693)
	a = np.random.randint(0, 10, size=(5, 4))
	# Single-argument print() gives the same output on Python 2 and 3.
	print("Array:\n{}".format(a))

if __name__ == "__main__":
	test_run()

Locate maximum value

import numpy as np

def get_max_index(a):
	"""Return the position of the largest element of NumPy array *a*, as given by a.argmax()."""
	index_of_max = a.argmax()
	return index_of_max

def test_run():
	"""Print a sample array, its maximum value and the index of that maximum."""
	a = np.array([9, 6, 2, 3, 12, 14, 7, 10], dtype=np.int32)
	# Single-argument print() calls reproduce the Python 2 output on both
	# Python 2 and Python 3.
	print("Array: {}".format(a))

	print("Maximum value {}".format(a.max()))
	print("Index of max.: {}".format(get_max_index(a)))


if __name__ == "__main__":
	test_run()

Timing python operations

import time

def time_run():
	"""Measure how long a single print call takes and report the elapsed seconds."""
	t1 = time.time()
	print("ML4T")
	t2 = time.time()
	# The spacing matches what the Python 2 print statement produced for
	# its comma-separated items.
	print("The time taken by print statement is  {}  seconds".format(t2 - t1))

if __name__ == "__main__":
	# The guard used to call test_run(), which this script never defines.
	time_run()

Accessing array element

import numpy as np

def test_run():
	"""Print a random 5x4 array and then the element at row 3, column 2."""
	a = np.random.rand(5, 4)
	# Single-argument print() gives the same output on Python 2 and 3.
	print("Array:\n{}".format(a))

	# Indices are zero-based: this is the fourth row, third column.
	element = a[3, 2]
	print(element)

if __name__ == "__main__":
	test_run()

Indexing an array with another array

import numpy as np

def test_run():
	"""Print the elements of a random 1-D array selected by an array of indices."""
	a = np.random.rand(5)
	# An index may repeat; element 1 is selected twice here.
	indices = np.array([1,1,2,3])
	# print() call instead of a Python 2 print statement.
	print(a[indices])

if __name__ == "__main__":
	test_run()

Replace a slice

# Overwrite the top-left 2x2 corner of nd1 with the last two rows,
# columns 2-3, of nd2 (assumes both are 2-D arrays large enough for
# these slices; neither is defined in this fragment).
nd1[0:2,0:2]=nd2[-2:,2:4]

"""Creating NumPy arrays."""
import numpy as np

def test_run():
	"""Print a 2-D array built from a list of tuples (one tuple per row)."""
	# print() call instead of a Python 2 print statement.
	print(np.array([(2, 3, 4),(5, 6, 7)]))

if __name__ == "__main__":
	test_run()
"""Creating NumPy arrays."""
import numpy as np

def test_run():
	"""Print an uninitialized 1-D array of 5 elements and a 5x4x3 array.

	np.empty() does not initialize its memory, so the printed values are
	arbitrary.
	"""
	# print() calls instead of Python 2 print statements.
	print(np.empty(5))
	print(np.empty((5,4,3)))

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Print a 5x4 array of ones stored as integers."""
	# print() call instead of a Python 2 print statement.
	print(np.ones((5, 4), dtype=np.int_))

if __name__ == "__main__":
	test_run()

plot on “equal footing”

The best way to normalize price data so that all prices start at 1.0 is to divide every row by the first row:
df1 = df1 / df1.iloc[0]

import os
import pandas as pd
import matplotlib.pyplot as plt

def plot_selected(df, columns, start_index, end_index):
	"""Plot the chosen columns of *df* for index labels start_index..end_index.

	The selection is handed to plot_data() with the title "Selected data".
	"""
	# Label-based .loc slicing includes both start_index and end_index.
	plot_data(df.loc[start_index:end_index, columns], title="Selected data")

def symbol_to_path(symbol, base_dir="data"):
	"""Build the path of the CSV file named after *symbol* inside *base_dir*."""
	filename = "{}.csv".format(str(symbol))
	return os.path.join(base_dir, filename)

def get_data(symbols, dates):
	"""Read the 'Adj Close' column of each symbol's CSV file into one DataFrame.

	The result is indexed by *dates* and has one column per symbol. SPY is
	always loaded (and placed first when it was not requested). Rows for
	which SPY has no value are dropped.
	"""
	df = pd.DataFrame(index=dates)
	if 'SPY' not in symbols:
		# Build a new list instead of insert()-ing into the argument, so
		# the caller's list is left untouched.
		symbols = ['SPY'] + list(symbols)

	for symbol in symbols:
		df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
			parse_dates=True, usecols=['Date', 'Adj Close'], na_values=['nan'])
		# Name the price column after the symbol so joined columns stay distinct.
		df_temp = df_temp.rename(columns={'Adj Close': symbol})
		df = df.join(df_temp)
		if symbol == 'SPY':
			df = df.dropna(subset=["SPY"])

	# Return only after every symbol has been joined.
	return df

def plot_data(df, title="Stock prices", xlabel="Date", ylabel="Price"):
	"""Plot *df* with the given title and axis labels, then show the figure.

	xlabel and ylabel default to the previously hard-coded "Date" and
	"Price", so existing calls behave as before.
	"""
	ax = df.plot(title=title, fontsize=12)
	ax.set_xlabel(xlabel)
	ax.set_ylabel(ylabel)
	plt.show()

def test_run():
	"""Load 2010 data for GOOG, IBM and GLD, then plot SPY and IBM from 2010-03-01 to 2010-04-01."""
	tickers = ['GOOG', 'IBM', 'GLD']
	year_2010 = pd.date_range('2010-01-01', '2010-12-31')
	prices = get_data(tickers, year_2010)

	# Slice by date labels and by column names before plotting.
	plot_selected(prices, ['SPY', 'IBM'], '2010-03-01', '2010-04-01')

if __name__ == "__main__":
	test_run()

Pandas dataframe

Problems to solve
-date ranges
-multiple stocks
-align dates
-proper date order

Building a dataframe

'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	"""Build the daily date range 2010-01-22..2010-01-26 and print it."""
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)
	# print() call instead of a Python 2 print statement.
	print(dates)

if __name__ == "__main__":
	test_run()
'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	"""Read data/SPY.csv indexed by its Date column and print it.

	An empty DataFrame indexed by 2010-01-22..2010-01-26 is prepared as
	well; joining the SPY data onto it is still commented out below.
	"""
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)

	#Create an empty dataframe
	df1=pd.DataFrame(index=dates)

	#Read SPY data into temporary dataframe
	dfSPY = pd.read_csv("data/SPY.csv",index_col="Date",parse_dates=True)
	# print() call instead of a Python 2 print statement.
	print(dfSPY)

	#Join the two dataframes using DataFrame.join()
	#df1=df1.join(dfSPY)
	#print df1

if __name__ == "__main__":
	test_run()
"""Utility functions"""

import os
import pandas as pd

def symbol_to_path(symbol, base_dir="data"):
	"""Build the path of the CSV file named after *symbol* inside *base_dir*."""
	csv_name = "{}.csv".format(str(symbol))
	return os.path.join(base_dir, csv_name)

def get_data(symbols, dates):
	"""Read stock data (adjusted close) for given symbols from csv files.

	The result is indexed by *dates* and has one column per symbol. SPY is
	always loaded (and placed first when it was not requested). Rows for
	which SPY has no value are dropped.
	"""
	df = pd.DataFrame(index=dates)
	if 'SPY' not in symbols:
		# Build a new list instead of insert()-ing into the argument, so
		# the caller's list is left untouched.
		symbols = ['SPY'] + list(symbols)

	# The loop previously had no body, which is a syntax error.
	for symbol in symbols:
		df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
			parse_dates=True, usecols=['Date', 'Adj Close'], na_values=['nan'])
		# Name the price column after the symbol so joined columns stay distinct.
		df_temp = df_temp.rename(columns={'Adj Close': symbol})
		df = df.join(df_temp)
		if symbol == 'SPY':
			df = df.dropna(subset=["SPY"])

	return df

def test_run():
	"""Load GOOG, IBM and GLD data for 2010-01-22..2010-01-26 and print the frame."""
	# Define a date range
	dates = pd.date_range('2010-01-22','2010-01-26')

	# Choose stock symbols to read
	symbols = ['GOOG', 'IBM', 'GLD']

	# Get stock data
	df = get_data(symbols, dates)
	# print() call instead of a Python 2 print statement.
	print(df)

if __name__ == "__main__":
	test_run()