Python – Page 25 – ソフトウェアエンジニアの技術ブログ：Software engineer tech blog

Pandas dataframe

Problems to solve
-data ranges
-multiple stocks
-align dates
-proper date order

Building a dataframe

'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)
	print dates

if __name__ == "__main__":
	test_run()

'''Build a dataframe in pandas'''
import pandas as pd

def test_run():
	start_date='2010-01-22'
	end_date='2010-01-26'
	dates=pd.date_range(start_date,end_date)

	#Create an empty dataframe
	df1=pd.DataFrame(index=dates)

	#Read SPY data into temporary dataframe
	dfSPY = pd.read_csv("data/SPY.csv",index_col="Date",parse_dates=True)
	print dfSPY

	#Join the two dataframes using DataFrame.join()
	#df1=df1.join(dfSPY)
	#print df1

if __name__ == "__main__":
	test_run()

"""Utility functions"""

import os
import pandas as pd

def symbol_to_path(symbol, base_dir="data"):
	"""Return CSV file path given ticker symbol."""
	return os.path.join(base_dir, "{}.csv".format(str(symbol)))

def get_data(symbols, dates):
	"""Read stock data (adjusted close) for given symbols from csv files"""
	df = pd.DataFrame(index=dates)
	if 'SPY' not in symbols:
		symbols.insert(0, 'SPY')

	for symbol in symbols:

	return df

def test_run():
	# Define a data range
	dates = pd.date_range('2010-01-22','2010-01-26')

	# Choose stock symbols to read
	symbols = ['GOOG', 'IBM', 'GLD']

	# Get stock data
	df = get_data(symbols, dates)
	print df

if __name__ == "__main__":
	test_run()

Plotting stock price data

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	df = pd.read_csv("data/APPL.csv")
	print df['Adj Close']
	df['Adj Close'].plot()
	plot.show()

if __name__ == "__main__":
	test_run()

Here we go

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	df = pd.read_csv("data/IBM.csv")
	df['High'].plot()
	plot.show()

if __name__ == "__main__":
	test_run()

plot two column, you can observe two lines

import pandas as pd
import matplotlib.pyplot as plt

def test_run():
	df = pd.read_csv("data/APPL.csv")
	df[['Close','Adj Close']].plot()
	plot.show()

if __name__ == "__main__":
	test_run()

unsupported operand type(s) for +: ‘int’ and ‘str’

Compute mean volume

    df = pd.read_csv("data/{}.csv".format(symbol))  # read in data
    s = sum(df)
    l = len(df)
    print(s/l)

unsupported operand type(s) for +: 'int' and 'str'

TypeError showed.

We must calculate dataframe mean.

import pandas as pd

def get_mean_volume(sympol):
	df = pd.read_csv("data/{}.csv".format(symbol))
    print(df.mean())

def test_run():
	for symbol in ['AAPL', 'IBM']:
		print "Mean Volume"
		print symbol, get_mean_volume(symbol)

if __name__ == "__main__": # if run standalone
	test_run()

Mean Volume
AAPL Open         1.363176e+02
High         1.380075e+02
Low          1.344201e+02
Close        1.362885e+02
Volume       2.149143e+07
Adj Close    1.282174e+02
dtype: float64
None
Mean Volume
IBM Open         1.109328e+02
High         1.121182e+02
Low          1.098853e+02
Close        1.110325e+02
Volume       7.103571e+06
Adj Close    1.022113e+02
dtype: float64
None

Here is a solution

import pandas as pd

def get_mean_volume(sympol):
	df = pd.read_csv("data/{}.csv".format(symbol))
    return df['Volume'].mean()

def test_run():
	for symbol in ['AAPL', 'IBM']:
		print "Mean Volume"
		print symbol, get_mean_volume(symbol)

if __name__ == "__main__": # if run standalone
	test_run()

Mean Volume
AAPL 21491431.3386
Mean Volume
IBM 7103570.80315

python for stock data

features:
1. strong scientific libraries
2. strongly maintained
3. fast

install pandas into Centos

$ sudo easy_install pandas

>>> import numpy
>>> numpy.version.version
'1.13.3'

Print last 5 rows of the data frame

import pandas as pd


def test_run():
    df = pd.read_csv("data/AAPL.csv")
    print(df[-5:])

if __name__ == "__main__":
    test_run()

compute max closing price of Apple and IBM

import pandas as pd

def get_max_close(sympol)
	
	df = pd.read_csv("data/{}.csv".format(symbol))
	return df['Close'].max()

def test_run():
	for symbol in ['AAPL', 'IBM']:
		print "Max close"
		print symbol, get_max_close(symbol)

if __name__ == "__main__": # if run standalone
	test_run()

Mapper

def mapper():

	for line in sys.stdin:

		data = line.strip.split("")

		for i in data:
			cleaned_data = i.translate(string.maketrans("",""), string.punctuation).lower()
			print "{0}\t{t}".format(cleaned_data,1)

			mapper()

Reduce stage -> reducer

import sys

def reducer():
	word_count = 0
	old_key = None

	for line in sys.stdin:
		data = line.strip().split("\t")

		if len(data) != 2:
			continue

		if old_key and old_key != this_key: 
			print"{0}\t{1}".format(old_key, word_count)
			word_count = 0

		old_key = this_key
		word_count += float(count)

	if old_key != None:
		print "{0}\t{1}".format(old_key, word_count)

#! /bin/bash

cat ../../data/aliceInWorderland.txt | python word_count_mapper.py | sort | python word_count_reducer.py

Groundwork

<title>My Website</title>
<div id="introduction">
	<p>
	welcome to my website!
	</p>
</div>
<div id="image-gallery">
	<p>
	This is my cat!
	<img src="cat.jpg" alt="Meow!">
	<a href="https://en.wikipedia.org/wiki/Cat">Learn more about cats!</a>
	</p>
</div>

while continue_crawl(search_history, target_url):
continue_crawl([‘https://en.wikipedia.org/wiki/Floating_point’], ‘https://en.wikipedia.org/wiki/Philosophy’)

def continue_crawl(search_history, target_url, max_steps=25):
	if search_history[-1] == target_url:
		print("We've found the target article!")
		return False
	elif len(search_history) > max_steps:
		print("The serach has gone on suspiciously long, aborting search!")
		return False
	elif search_history[-1] in search_history[:-1]:
		print("We've arrived at an article we've already seen, aborting search!")
		return False
	else:
		return True

import time
import urllib

import bs4
import requests

start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"

def find_first_link(url):
	response = requests.get(url)
	html = response.text
	soup = bs4.BeautifulSoup(html, "html.parser")

	content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")

	article_link = None

	for element in content_div.find_all("p", recursive=False):

		if element.find("a", recursive=False):
			article_link = element.find("a", recursive=False).get('href')

	if not article_link:
		return

	return first_link

def continue_crawl(search_history, target_url, max_steps=25):
	if search_history[-1] == target_url:
		print("We've found the target article!")
		return False
	elif len(search_history) > max_steps:
		print("The search has gone on suspiciously long, aborting search!")
		return False
	elif search_history[-1] in search_history[:-1]:
		print("We've arrived at an article we've already seen, aborting search!")
		return False
	else:
		return True

article_chain = [start_url]

while continue_crawl(article_chain, target_url):
	print(article_chain[-1])

	first_link = find_first_link(article_chain[-1])
	if not first_link:
		print("We've arrived at an article with no links, aborting search!")
		break

	article_chain.append(first_link)

	time.sleep(2)

Reading from a File

f = open('/my_path/my_file.txt', 'r')

with open('/my_path/my_file.txt', 'r') as f:
	file_data = f.read()

camelot_lines = []
with open("camelot.txt") as f:
	for line in f:
		camelot_lines.append(line.strip())

print(camelot_lines)

def create_cast_list(filename):
	cast_list = []
	with open(filename) as f:

		for line in f:
			line_data = line.split(',')
			cast_list.append(line_data[0])
		return cast_list

The Python Standard Library is organised into parts called modules. Many modules are simply Python files, like the Python scripts you’ve already used and written. In order to be able to use the code contained in a module we must import it, either in the interactive interpreter or in a Python script of our own.

The syntax for importing a module is simply import package_name.

>>> import math
>>> print(math.factorial(3))
6

from module_name import object_name
e.g.
from collections import defaultdict

>>> import multiprocessing as mp
>>> mp.cpu_count()
4

import an individual item from a module and give it a different name
from module_name import object_name as different_name
from csv import reader as csvreader

Python Standard Library
csv, collections, random, string, re, math, os, os.path, sys, json

word_file = “words.txt”
word_list = []

with open(word_file, ‘r’) as words:
for line in words:
word = line.strip().lower()
if 3 < len(word) < 8 word_list.append(word) [/python] [python] def generate_password(): return random.choice(word_list) + random.choice(word_list) + random.choice(word_list) def generate_password(): return str().join(random.sample(word_list,3)) [/python]

Tuple

>>> print(type(AngkorWat))

>>> print("Angkor wat is at latitude: {}".format(AngkorWat[0]))
Angkor wat is at latitude: 13.4125
>>> print("Angkor wat is at longitude: {}".format(AngkorWat[1]))
Angkor wat is at longitude: 103.866667

>>> dimensions = 52, 40, 100
>>> length, width, height = dimensions
>>> print("the dimensions are {}x{}x{}".format(length, width, height))
the dimensions are 52x40x100

world_heritage_locations = {(13.4125, 103.866667): "Angkor Wat",
						(25.73333, 32.6): "Ancident Thebes",
						(30.330556, 35.4433330): "Petra",
						(-13.116667, -72.583333): "Machu Picchu"}

def box(width, height, symbol):
	print(symbol * width)

	for _ in range(height-2):
		print(symbol + " " * (width-2) + symbol)

	print(symbol + width)

def print_list(l, numbered, bullet_character):
	for index, element in enumerate(l):
		if numbered:
			print("{}: {}".format(index+1, element))
		else:
			print("{} {}".format(bullet_character, element))

def word_count(document, search_term):
“”” Count how many times search_term appears in document. “””
words = document.split()
answer = 0
for word in words:
if word == search_term:
answer += 1
return answer

def nearest_square(limit):
“”” Find the largest square number smaller than limit.”””
answer = 0
while (answer+1)**2 < limit: answer += 1 return answer**2 [/python]

Dictionaries

Rather than storing single objects like lists and sets do, dictionaries store pairs of elements: keys and values.

elements = {'hydrogen': 1, 'helium': 2, 'carbon': 6}
>>> print(element['carbon'])
6

>>> elements['lithium'] = 3
>>> print(elements['lithium'])
3

populations = {'Shanghai':17.8, 'Istanbul':13.3, 'Karachi':13.0, 'Mumbai':12.5}

if 'mithril' in elements:
	print("That's a real element!")
else:
	print("There's no such element")

>>> elements.get('dilithium')
>>> elements['dilithium']

Traceback (most recent call last):
  File "", line 1, in 
    elements['dilithium']
KeyError: 'dilithium'
>>> elements.get('kryptonite', 'There\'s no such element!')
"There's no such element!"

colors = set(['Pathalo Blue', 'Indian Yellow', 'Sap Green'])
for color in colors:
	print(color)

>>> elements = {'hydrogen': {'number':1, 'weight':1.00794, 'symbol':'H'},
	    'helium':{'number':2, 'weight':4.002602, 'symbol':'He'}}
>>> print(elements['helium'])
{'symbol': 'He', 'number': 2, 'weight': 4.002602}
>>> print(elements.get('unobtainium', 'There\'s no such element!'))
There's no such element!
>>> print(elements['helium']['weight'])
4.002602

Reorganizing code

Factoring: decomposing a complex problem into simpler parts.
Refactoring: restructuring existing code.

Reading the code, is it clear what each piece does? How could it be be easier?
If you needed to change some part of the functionality, would that be easy? Would you have to change the same thing in several places?
If you break down what the function does into steps, how many steps are there? It’s best to have each function doing only one thing.
Is there unnecessary repetition? Does every piece of code get used? Could anything be more succinct whilst still being readable? This is called the DRY (Don’t Repeat Yourself) principle.

def check_answers(my_answers, answers):
	results = [None, None, None, None, None]
	if my_answers[0] == answers[0]:
		results[0] = True
	elif my_answers[0] != answers[0]:
		result[0] = False
	if my_answers[1] == answers[1]:
		result[1] = True
	elif my_asnwers[1] != anwers[0]:
		results[1] = False
	if my_answers[2] == answers[2]:
        results[2] = True
    elif my_answers[2] != answers[2]:
        results[2] = False
    if my_answers[3] == answers[3]:
        results[3] = True
    elif my_answers[3] != answers[3]:
        results[3] = False
    if my_answers[4] == answers[4]:
        results[4] = True
    elif my_answers[4] != answers[4]:
        results[4] = False
    count_correct = 0
    count_incorrect = 0
    for result in results:
    	if result == True:
    		count_correct += 1
    	if result != True:
    		count_incorrect += 1
    if count_correct/5 > 0.7:
    	return "Congratulations, you passed the test! you scored " + str(count_correct) + " out of 5."
    elif count_incorrect/5 >= 0.3:
    	return "Unfortunately, you did not pass. You scored " + str(count_correct) + " out of 5."

>>> len(countries)
785
>>> countries[:5]
['Angola', 'Maldives', 'India', 'United States', 'India']

def remove_duplicates(source):
	target = []

	for element in source:
		if element not in target:
			target.append(element)

	return target

country_set = set(countries)
len(country_set)

country_set.add("Florin")

squares = set()

def nearest_square(limit):
answer = 0
while (answer+1)**2 < limit: answer += 1 return answer**2 n = 1 while n**2 < 2000: squares.add(n**2) n += 1 [/python]