Pandas Fillna()

Pandas Fillna() documentation
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)

fillna(method='ffill')
"""Plot a histogram"""

import pandas as pd
import matplotlib.pyplot as plt

from util import get_data, plot_data

def compute_daily_returns(df):
	"""Compute daily returns for every column of a price DataFrame.

	Each row holds the fractional change relative to the previous row
	(price[t] / price[t-1] - 1). The first row has no predecessor, so it
	is set to 0.

	Parameters:
		df: DataFrame of prices, one column per symbol; rows are assumed
			to be in chronological order.

	Returns:
		A new DataFrame with the same index and columns as df. The input
		frame is not modified.
	"""
	# shift(1) pairs every row with the one before it, which replaces the
	# positional slicing and .values alignment trick.
	daily_returns = (df / df.shift(1)) - 1
	# .ix has been removed from pandas; .iloc is the positional accessor.
	# An empty frame has no first row to reset.
	if len(daily_returns) > 0:
		daily_returns.iloc[0, :] = 0
	return daily_returns

def test_run():
	"""Plot SPY prices for 2009-2012, then plot their daily returns."""
	dates = pd.date_range('2009-01-01','2012-12-31')
	symbols = ['SPY']
	df = get_data(symbols, dates)
	plot_data(df)

	# Assignment, not subtraction: daily_returns must be bound before it
	# is passed to plot_data below.
	daily_returns = compute_daily_returns(df)
	plot_data(daily_returns, title="Daily returns", ylabel="Daily returns")

if __name__ == "__main__":
	test_run()

scatterplots in python

"""Scatterplot."""

import pandas as pd
import matplotlib.pyplot as plt

from util import get_data, plot_data

def compute_daily_returns(df):
	"""Return the row-over-row fractional change of each column in df.

	The value at row t is df[t] / df[t-1] - 1. Row 0 has no previous row
	and is therefore reported as 0.

	Parameters:
		df: DataFrame of prices, one column per symbol; rows are assumed
			to be in chronological order.

	Returns:
		A new DataFrame shaped like df. The input frame is left untouched.
	"""
	# Dividing by the frame shifted down one row compares each price with
	# its predecessor without any positional slicing.
	daily_returns = (df / df.shift(1)) - 1
	# .ix no longer exists in pandas; use the positional .iloc accessor.
	# Skip the reset when there is no first row at all.
	if len(daily_returns) > 0:
		daily_returns.iloc[0, :] = 0
	return daily_returns

def test_run():
	"""Scatter-plot XOM daily returns against SPY daily returns, 2009-2012."""
	date_index = pd.date_range('2009-01-01', '2012-12-31')
	prices = get_data(['SPY', 'XOM', 'GLD'], date_index)

	# Plot straight from the computed returns frame.
	compute_daily_returns(prices).plot(kind='scatter',x='SPY',y='XOM')
	plt.show()

if __name__ == "__main__":
	test_run()

Arithmetic operations

import numpy as np

def test_run():
	"""Print a small 2-D array and the result of multiplying it by 2."""
	a = np.array([(1, 2, 3, 4, 5),(10, 20, 30, 40, 50)])
	# A single-argument print(...) built with str.format produces the same
	# text on Python 2 and Python 3; the print statement is Python 2 only.
	print("Original array a:\n{}".format(a))

	print("\nMultiply a by 2:\n{}".format(2 * a))

if __name__ == "__main__":
	test_run()

Rolling statistics can signal buying opportunities
Rolling standard deviation

def test_run():
	"""Plot 2012 SPY prices together with their rolling mean (window of 20 rows)."""
	dates = pd.date_range('2012-01-01','2012-12-31')
	symbols = ['SPY']
	df = get_data(symbols, dates)

	ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')

	# pd.rolling_mean() has been removed from pandas;
	# Series.rolling(...).mean() is its replacement.
	rm_SPY = df['SPY'].rolling(window=20).mean()

	rm_SPY.plot(label='Rolling mean', ax=ax)

	# The labels above are only visible once a legend is drawn, and the
	# figure is only displayed once show() is called.
	ax.legend(loc='upper left')
	plt.show()

Array attributes

import numpy as np

def test_run():
	"""Print a random 5x4 array followed by its shape."""
	a = np.random.random((5, 4))
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(a)
	print(a.shape)

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Print the number of elements in a random 5x4 array."""
	a = np.random.random((5, 4))
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(a.size)

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Print a 5x4 array of random integers in [0, 10) drawn with a fixed seed."""
	# Seeding makes the printed array identical on every run.
	np.random.seed(693)
	a = np.random.randint(0, 10, size=(5, 4))
	# Single-argument print(...) works on both Python 2 and Python 3.
	print("Array:\n{}".format(a))

if __name__ == "__main__":
	test_run()

Locate maximum value

import numpy as np

def get_max_index(a):
	"""Return the index of the largest element of array a.

	Uses a.argmax(), so for ties the first occurrence is reported and for
	multi-dimensional arrays the index refers to the flattened array.
	"""
	index_of_max = a.argmax()
	return index_of_max

def test_run():
	"""Print an example array, its maximum value and the index of that maximum."""
	a = np.array([9, 6, 2, 3, 12, 14, 7, 10], dtype=np.int32)
	# str.format inside a single-argument print(...) yields the same text
	# on Python 2 and Python 3; the print statement is Python 2 only.
	print("Array: {}".format(a))

	print("Maximum value {}".format(a.max()))
	print("Index of max.: {}".format(get_max_index(a)))


if __name__ == "__main__":
	test_run()

Timing python operations

import time

def time_run():
	"""Measure and print how long a single print call takes."""
	t1 = time.time()
	print("ML4T")
	t2 = time.time()
	# The doubled spaces reproduce what the original comma-separated print
	# statement emitted; this form runs on both Python 2 and Python 3.
	print("The time taken by print statement is  {}  seconds".format(t2 - t1))

if __name__ == "__main__":
	# This snippet defines time_run(), not test_run().
	time_run()

Accessing array element

import numpy as np

def test_run():
	"""Print a random 5x4 array and the element at row 3, column 2."""
	a = np.random.rand(5, 4)
	# Single-argument print(...) works on both Python 2 and Python 3.
	print("Array:\n{}".format(a))

	element = a[3, 2]
	print(element)

if __name__ == "__main__":
	test_run()

Indexing an array with another array

import numpy as np

def test_run():
	"""Print the elements of a random 1-D array selected by an index array."""
	a = np.random.rand(5)
	# Index 1 appears twice, so the same element is selected twice.
	indices = np.array([1,1,2,3])
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(a[indices])

if __name__ == "__main__":
	test_run()

Replace a slice

nd1[0:2,0:2]=nd2[-2:,2:4]

"""Creating NumPy arrays."""
import numpy as np

def test_run():
	"""Print a 2x3 array built from a list of tuples."""
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(np.array([(2, 3, 4),(5, 6, 7)]))

if __name__ == "__main__":
	test_run()
"""Creating NumPy arrays."""
import numpy as np

def test_run():
	"""Print an uninitialised 1-D array of length 5 and a 5x4x3 array."""
	# np.empty() does not initialise its contents, so the printed values
	# are arbitrary. print(...) with one argument runs on Python 2 and 3.
	print(np.empty(5))
	print(np.empty((5,4,3)))

if __name__ == "__main__":
	test_run()
import numpy as np

def test_run():
	"""Print a 5x4 array of integer ones."""
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(np.ones((5, 4), dtype=np.int_))

if __name__ == "__main__":
	test_run()

plot on “equal footing”

The best way to normalize price data so that all prices start at 1.0
df1 = df1/df1[0]

import os
import pandas as pd
import matplotlib.pyplot as plt

def plot_selected(df, columns, start_index, end_index):
	"""Plot the chosen columns of df over the given index range.

	Parameters:
		df: DataFrame to plot from.
		columns: list of column labels to include.
		start_index: first index label of the range.
		end_index: last index label of the range.
	"""
	# .loc slices rows by label and includes both end points.
	plot_data(df.loc[start_index:end_index, columns], title="Selected data")

def symbol_to_path(symbol, base_dir="data"):
	"""Build the path of the CSV file for a ticker: <base_dir>/<symbol>.csv."""
	file_name = str(symbol) + ".csv"
	return os.path.join(base_dir, file_name)

def get_data(symbols, dates):
	"""Read adjusted-close prices for the given symbols from CSV files.

	SPY is added to the front of the symbol list when it is missing, and
	rows for which SPY has no value are dropped.

	Parameters:
		symbols: list of ticker symbols; the caller's list is not modified.
		dates: index (e.g. a pandas date range) for the resulting frame.

	Returns:
		DataFrame indexed by dates with one column per symbol.
	"""
	df = pd.DataFrame(index=dates)
	# Work on a copy so that adding SPY does not alter the caller's list.
	symbols = list(symbols)
	if 'SPY' not in symbols:
		symbols.insert(0, 'SPY')

	for symbol in symbols:
		df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
			parse_dates=True, usecols=['Date', 'Adj Close'], na_values=['nan'])
		# Name the price column after the symbol so joined columns stay distinct.
		df_temp = df_temp.rename(columns={'Adj Close': symbol})
		df = df.join(df_temp)
		if symbol == 'SPY':
			df = df.dropna(subset=["SPY"])

	# Return only after every symbol has been joined.
	return df

def plot_data(df, title="Stock prices"):
	"""Draw df as a line chart with "Date"/"Price" axis labels and show it."""
	axes = df.plot(title=title, fontsize=12)
	# The two labels are independent of each other.
	axes.set_ylabel("Price")
	axes.set_xlabel("Date")
	plt.show()

def test_run():
	"""Load 2010 prices for GOOG, IBM and GLD, then plot SPY and IBM for March 2010."""
	date_index = pd.date_range('2010-01-01', '2010-12-31')
	prices = get_data(['GOOG', 'IBM', 'GLD'], date_index)

	plot_selected(prices, ['SPY', 'IBM'], '2010-03-01', '2010-04-01')

if __name__ == "__main__":
	test_run()

Pandas dataframe

Problems to solve
-data ranges
-multiple stocks
-align dates
-proper date order

Building a dataframe

'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	"""Build a date range from 2010-01-22 to 2010-01-26 and print it."""
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(dates)

if __name__ == "__main__":
	test_run()
'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)

	#Create an empty dataframe
	df1=pd.DataFrame(index=dates)

	#Read SPY data into temporary dataframe
	dfSPY = pd.read_csv("data/SPY.csv",index_col="Date",parse_dates=True)
	print dfSPY

	#Join the two dataframes using DataFrame.join()
	#df1=df1.join(dfSPY)
	#print df1

if __name__ == "__main__":
	test_run()
"""Utility functions"""

import os
import pandas as pd

def symbol_to_path(symbol, base_dir="data"):
	"""Map a ticker symbol to the location of its CSV file under base_dir."""
	csv_name = "".join([str(symbol), ".csv"])
	csv_path = os.path.join(base_dir, csv_name)
	return csv_path

def get_data(symbols, dates):
	"""Read stock data (adjusted close) for given symbols from csv files

	SPY is added to the front of the symbol list when it is missing, and
	rows for which SPY has no value are dropped.

	Parameters:
		symbols: list of ticker symbols; the caller's list is not modified.
		dates: index (e.g. a pandas date range) for the resulting frame.

	Returns:
		DataFrame indexed by dates with one column per symbol.
	"""
	df = pd.DataFrame(index=dates)
	# Work on a copy so that adding SPY does not alter the caller's list.
	symbols = list(symbols)
	if 'SPY' not in symbols:
		symbols.insert(0, 'SPY')

	for symbol in symbols:
		# Load only the date and adjusted-close columns of this symbol.
		df_temp = pd.read_csv(symbol_to_path(symbol), index_col='Date',
			parse_dates=True, usecols=['Date', 'Adj Close'], na_values=['nan'])
		# Name the price column after the symbol so joined columns stay distinct.
		df_temp = df_temp.rename(columns={'Adj Close': symbol})
		df = df.join(df_temp)
		if symbol == 'SPY':
			df = df.dropna(subset=["SPY"])

	return df

def test_run():
	"""Load a few days of prices for GOOG, IBM and GLD and print the frame."""
	# Define a data range
	dates = pd.date_range('2010-01-22','2010-01-26')

	# Choose stock symbols to read
	symbols = ['GOOG', 'IBM', 'GLD']

	# Get stock data
	df = get_data(symbols, dates)
	# print(...) with one argument behaves the same on Python 2 and 3.
	print(df)

if __name__ == "__main__":
	test_run()

Plotting stock price data

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	"""Print and plot the 'Adj Close' column of the AAPL CSV file."""
	# AAPL is the ticker spelling used by the other snippets in these
	# notes; "APPL" was a misspelling.
	df = pd.read_csv("data/AAPL.csv")
	print(df['Adj Close'])
	df['Adj Close'].plot()
	# matplotlib.pyplot is imported as plt; `plot` is not a defined name.
	plt.show()

if __name__ == "__main__":
	test_run()

Here we go

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	"""Plot the 'High' column of the IBM CSV file."""
	df = pd.read_csv("data/IBM.csv")
	df['High'].plot()
	# matplotlib.pyplot is imported as plt; `plot` is not a defined name.
	plt.show()

if __name__ == "__main__":
	test_run()

Plot two columns; you will see two lines

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	"""Plot the 'Close' and 'Adj Close' columns of the AAPL CSV file on one chart."""
	# AAPL is the ticker spelling used by the other snippets in these
	# notes; "APPL" was a misspelling.
	df = pd.read_csv("data/AAPL.csv")
	# Selecting with a list of labels keeps a DataFrame, so each column
	# is drawn as its own line.
	df[['Close','Adj Close']].plot()
	# matplotlib.pyplot is imported as plt; `plot` is not a defined name.
	plt.show()

if __name__ == "__main__":
	test_run()

unsupported operand type(s) for +: ‘int’ and ‘str’

Compute mean volume

    # First attempt at the mean; it fails with the TypeError quoted right after this fragment.
    df = pd.read_csv("data/{}.csv".format(symbol))  # read in data
    # Iterating a DataFrame yields its column labels (strings), so the
    # built-in sum() starts from 0 and tries 0 + '<label>', which raises.
    s = sum(df)
    l = len(df)
    print(s/l)
unsupported operand type(s) for +: 'int' and 'str'

A TypeError is raised.

We must use the DataFrame's own mean() instead.

import pandas as pd

def get_mean_volume(symbol):
	"""Print the mean of every numeric column of data/<symbol>.csv.

	This version only prints the means of all columns and returns None;
	it does not yet isolate the Volume column.

	Parameters:
		symbol: ticker symbol used to build the CSV file name.
	"""
	# The parameter was misspelled "sympol" while the body read `symbol`.
	df = pd.read_csv("data/{}.csv".format(symbol))
	# numeric_only=True skips text columns such as the date, which
	# cannot be averaged.
	print(df.mean(numeric_only=True))

def test_run():
	"""Print a "Mean Volume" header and the get_mean_volume() result for AAPL and IBM."""
	for symbol in ['AAPL', 'IBM']:
		print "Mean Volume"
		# NOTE(review): the sample output recorded after this snippet shows
		# the symbol, then the text printed inside get_mean_volume(), then
		# "None" (its return value). Kept as a Python 2 print statement so
		# that recorded output still corresponds to this code.
		print symbol, get_mean_volume(symbol)

if __name__ == "__main__": # if run standalone
	test_run()
Mean Volume
AAPL Open         1.363176e+02
High         1.380075e+02
Low          1.344201e+02
Close        1.362885e+02
Volume       2.149143e+07
Adj Close    1.282174e+02
dtype: float64
None
Mean Volume
IBM Open         1.109328e+02
High         1.121182e+02
Low          1.098853e+02
Close        1.110325e+02
Volume       7.103571e+06
Adj Close    1.022113e+02
dtype: float64
None

Here is a solution

import pandas as pd

def get_mean_volume(symbol):
	"""Return the mean of the 'Volume' column of data/<symbol>.csv.

	Parameters:
		symbol: ticker symbol used to build the CSV file name.

	Returns:
		The mean of the 'Volume' column.
	"""
	# The parameter was misspelled "sympol" while the body read `symbol`.
	df = pd.read_csv("data/{}.csv".format(symbol))
	return df['Volume'].mean()

def test_run():
	"""Print the mean trading volume of AAPL and IBM."""
	for symbol in ['AAPL', 'IBM']:
		# str.format inside a single-argument print(...) yields the same
		# text on Python 2 and 3; the print statement is Python 2 only.
		print("Mean Volume")
		print("{} {}".format(symbol, get_mean_volume(symbol)))

if __name__ == "__main__": # if run standalone
	test_run()
Mean Volume
AAPL 21491431.3386
Mean Volume
IBM 7103570.80315

python for stock data

features:
1. strong scientific libraries
2. strongly maintained
3. fast

Install pandas on CentOS

$ sudo easy_install pandas

>>> import numpy
>>> numpy.version.version
'1.13.3'

Print last 5 rows of the data frame

import pandas as pd


def test_run():
    df = pd.read_csv("data/AAPL.csv")
    print(df[-5:])

if __name__ == "__main__":
    test_run()

compute max closing price of Apple and IBM

import pandas as pd

def get_max_close(symbol):
	"""Return the maximum of the 'Close' column of data/<symbol>.csv.

	Parameters:
		symbol: ticker symbol used to build the CSV file name.

	Returns:
		The largest value in the 'Close' column.
	"""
	# The def line lacked its colon, and the parameter was misspelled
	# "sympol" while the body read `symbol`.
	df = pd.read_csv("data/{}.csv".format(symbol))
	return df['Close'].max()

def test_run():
	"""Print the maximum closing price of AAPL and IBM."""
	for symbol in ['AAPL', 'IBM']:
		# str.format inside a single-argument print(...) yields the same
		# text on Python 2 and 3; the print statement is Python 2 only.
		print("Max close")
		print("{} {}".format(symbol, get_max_close(symbol)))

if __name__ == "__main__": # if run standalone
	test_run()

Mapper

def mapper():
	"""Emit "<word>\\t1" on stdout for every word read from stdin.

	Each input line is split on whitespace. Every word has its
	punctuation characters removed and is lower-cased before being
	emitted; words that consist only of punctuation are skipped.
	"""
	# Imported locally because this snippet has no import lines of its own.
	import string
	import sys

	for line in sys.stdin:
		# strip() must be called, and split() with no argument splits on
		# any run of whitespace (an empty separator raises ValueError).
		for word in line.strip().split():
			# Removing punctuation character by character works on both
			# Python 2 and 3; the two-argument str.translate is Python 2 only.
			cleaned_data = "".join(
				ch for ch in word if ch not in string.punctuation).lower()
			if cleaned_data:
				print("{0}\t{1}".format(cleaned_data, 1))


if __name__ == "__main__":
	# Run once as a script instead of recursing from inside the loop.
	mapper()

Reduce stage -> reducer

import sys

def reducer():
	"""Sum the counts of consecutive identical keys read from stdin.

	Every input line is expected to look like "<key>\\t<count>"; lines
	that do not split into exactly two tab-separated fields are ignored.
	Whenever the key changes, "<key>\\t<total>" is printed for the
	previous key, and the last key is printed after the input ends.
	Identical keys must therefore arrive next to each other (the
	pipeline sorts the mapper output first).
	"""
	word_count = 0
	old_key = None

	for line in sys.stdin:
		data = line.strip().split("\t")

		if len(data) != 2:
			continue

		# Unpack the two fields; they were previously never assigned.
		this_key, count = data

		# A different key means the previous key's total is complete.
		if old_key is not None and old_key != this_key:
			print("{0}\t{1}".format(old_key, word_count))
			word_count = 0

		old_key = this_key
		word_count += float(count)

	# Flush the total of the final key, if any input was seen.
	if old_key is not None:
		print("{0}\t{1}".format(old_key, word_count))


if __name__ == "__main__":
	# Without this call, running the file as a script does nothing.
	reducer()
#! /bin/bash

# Word-count pipeline: stream the text file into the mapper script, sort the
# mapper's output, then feed the sorted stream to the reducer script.
cat ../../data/aliceInWorderland.txt | python word_count_mapper.py | sort | python word_count_reducer.py