Range Queries

#!/usr/bin/env python
import pprint

# MongoClient was used below without being imported anywhere in this snippet.
from pymongo import MongoClient

# Connection to the MongoDB server on the default local port.
client = MongoClient("mongodb://localhost:27017")

# Handle to the "examples" database that find() reads from.
db = client.examples

def find():
	"""Pretty-print every city whose population exceeds 250000, then print the count.

	Reads from the ``cities`` collection of the module-level ``db`` handle.
	"""
	query = {"population" : {"$gt" : 250000}}

	num_cities = 0
	for city in db.cities.find(query):
		pprint.pprint(city)
		num_cities += 1

	# The parenthesised single-argument form prints the same text on
	# Python 2 and Python 3; the bare print statement only parses on 2.
	print("\nNumber of cities matching: %d\n" % num_cities)
from datetime import datetime

def range_query():
	"""Build and return the filter document handed to ``db.cities.find``.

	The filter is currently empty, so it places no restriction on the
	documents returned.
	"""
	return dict()

def get_db():
	"""Return the ``examples`` database of the MongoDB server at localhost:27017."""
	# pymongo is imported locally, so only this helper depends on it.
	from pymongo import MongoClient
	return MongoClient('localhost:27017').examples

if __name__ == "__main__":
	import pprint

	db = get_db()
	query = range_query()
	cities = db.cities.find(query)

	# %-formatting yields the same "label value" text on Python 2 and 3,
	# whereas the bare print statement only parses on Python 2.
	print("Found cities: %s" % cities.count())
	pprint.pprint(cities[0])

Multiple Field

from pymongo import MongoClient
import pprint

# Connection to the MongoDB server on the default local port.
client = MongoClient("mongodb://localhost:27017")

# Handle to the "examples" database that find() reads from.
db = client.examples

def find():
	"""Pretty-print every auto made by Toyota in the "mid-size car" class.

	Both conditions sit in a single filter document passed to
	``db.autos.find``.
	"""
	autos = db.autos.find(
		{
			"manufacturer" : "Toyota" , "class": "mid-size car"
		})
	# The loop belongs at function-body level; it used to be indented one
	# level deeper than the statement before it, which is an IndentationError.
	for a in autos:
		pprint.pprint(a)

# Run the query only when this file is executed as a script.
if __name__ == '__main__':
	find()
#!/usr/bin/env python
from autos import process_file

def insert_autos(infile, db):
	"""Load the autos described by ``infile`` into the ``autos`` collection of ``db``.

	``process_file`` (imported from ``autos``) turns the file into the
	documents to store.
	NOTE(review): it is assumed to return a list of dicts -- confirm against
	the ``autos`` module.
	"""
	data = process_file(infile)

	# The parsed documents used to be discarded, leaving the collection
	# untouched.  ``insert`` matches the call style used elsewhere in this
	# file; the call is skipped when there is nothing to store.
	if data:
		db.autos.insert(data)

if __name__ == "__main__":
	from pymongo import MongoClient
	client = MongoClient("mongodb://localhost:27017")
	db = client.examples

	insert_autos('autos-small.csv', db)
	# Show one stored document as a quick check.  The call form of print is
	# valid on both Python 2 and Python 3.
	print(db.autos.find_one())

PyMongo

from pymongo import MongoClient
import pprint

# Connection to the MongoDB server on the default local port.
client = MongoClient('mongodb://localhost:27017/')

# Example document mixing scalar fields, array fields and one nested
# sub-document ("designer").
tesla_s = {
	"manufacturer" : "Tesla Motors",
	"class" : "full-size",
	"body style" : "5-door liftback",
	"production" : [2012, 2013],
	"model years" : [2013],
	"layout" : ["Rear-motor", "rear-wheel drive"],
	"designer" : {
		"firstname" : "Franz",
		"surname" : "von Holzhusen"
	}
}

# Store the document in the "autos" collection of the "examples" database.
db = client.examples
db.autos.insert(tesla_s)

# Print every document currently in the collection.
for a in db.autos.find():
	pprint.pprint(a)

query

from pymongo import MongoClient
import pprint

# Connection to the MongoDB server on the default local port.
client = MongoClient('mongodb://localhost:27017/')

# Handle to the "examples" database that find() reads from.
db = client.examples

def find():
	"""Pretty-print each document in ``db.autos`` whose manufacturer is "Toyota"."""
	toyota_filter = { "manufacturer" : "Toyota" }
	for car in db.autos.find(toyota_filter):
		pprint.pprint(car)

# Run the query only when this file is executed as a script.
if __name__ == '__main__':
	find()
#!/usr/bin/env python

def porsche_query():
	"""Return the filter document selecting autos whose manufacturer is Porsche.

	The previous empty filter matched every document in the collection.
	"""
	query = {"manufacturer": "Porsche"}
	return query

def get_db(db_name):
	"""Return the database called ``db_name`` from the MongoDB server at localhost:27017."""
	# pymongo is imported locally, so only this helper depends on it.
	from pymongo import MongoClient
	connection = MongoClient('localhost:27017')
	return connection[db_name]

def find_porsche(db, query):
	"""Run ``query`` against the ``autos`` collection of ``db`` and return what ``find`` gives back."""
	matches = db.autos.find(query)
	return matches

if __name__ == "__main__":
	import pprint

	db = get_db('examples')
	query = porsche_query()
	results = find_porsche(db, query)

	# Call form of print: same output on Python 2 and Python 3.
	print("Printing first 3 result\n")
	# The loop used to iterate over the misspelled name "rsults", which
	# raised NameError before anything was printed.
	for car in results[:3]:
		pprint.pprint(car)

Data modeling

api.mongodb.com

{
	"manufacturer" : "Tesla Motors",
	"class" : "full-size",
	"body style" : "5-door liftback"
}

production fields

{
	"production" : [2012, 2013],
	"model years" : [2013],
	"layout" : ["Rear-motor", "rear-wheel drive"]
}

designer

{
	"designer" : {
		"firstname" : "Franz",
		"surname" : "von Holzhusen"
	}
}
{
	"assembly" : [
		{
			"country" : "United States",
			"city" : "Fremont",
			"state" : "California"
		},
		{
			"country" : "The Netherlands",
			"city" : "Tilburg"
		}
	]
}
{
	"manufacturer" : "Tesla Motors",
	"class" : "full-size",
	"body style" : "5-door liftback",
	"production" : [2012, 2013],
	"model years" : [2013],
	"layout" : ["Rear-motor", "rear-wheel drive"],
	"designer" : {
		"firstname" : "Franz",
		"surname" : "von Holzhusen"
	},
	"assembly" : [
		{
			"country" : "United States",
			"city" : "Fremont",
			"state" : "California"
		}]
}
def add_city(db):
	"""Insert one document with the name "Chicago" into the ``cities`` collection of ``db``."""
	chicago = {"name": "Chicago"}
	db.cities.insert(chicago)

def get_city(db):
	"""Return whatever ``find_one`` yields for the ``cities`` collection of ``db``."""
	city = db.cities.find_one()
	return city

def get_db():
	"""Return the ``examples`` database of the MongoDB server at localhost:27017."""
	# pymongo is imported locally, so only this helper depends on it.
	from pymongo import MongoClient
	connection = MongoClient('localhost:27017')
	return connection.examples

if __name__ == "__main__":
	# ``db`` was never assigned in this script, so both calls below raised
	# NameError; obtain the handle from get_db() first.
	db = get_db()
	add_city(db)
	# Call form of print: same output on Python 2 and Python 3.
	print(get_city(db))

Fixing the area

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import codecs
import csv
import json
import pprint

# Path of the CSV file that test() passes to process_file().
CITIES = 'cities.csv'

def fix_area(area):
	"""Convert a raw ``areaLand`` CSV value to a float, or None when it has no value.

	* None, "NULL", empty or otherwise unparseable text -> None
	* numeric text (or a number) -> float
	* "{a|b|...}" -> the numeric entry written with the most characters

	The function used to hand the raw string back unchanged, while test() in
	this file expects None and float results.
	NOTE(review): the "{a|b}" list form and the rule of keeping the most
	precisely written entry are assumptions about the data file -- confirm.
	"""
	if area is None:
		return None
	try:
		return float(area)
	except (TypeError, ValueError):
		# Not directly numeric ("NULL", "", "{...}"): handled below.
		pass

	text = str(area).strip()
	if not (text.startswith("{") and text.endswith("}")):
		return None

	best = None
	for part in text[1:-1].split("|"):
		part = part.strip()
		try:
			float(part)
		except ValueError:
			continue
		# Text length stands in for "number of significant digits".
		if best is None or len(part) > len(best):
			best = part
	return float(best) if best is not None else None

def process_file(filename):
	"""Read the CSV ``filename`` and return its rows with ``areaLand`` cleaned.

	The three rows that follow the header are skipped.
	NOTE(review): presumably these are metadata rows -- confirm against the
	data file.

	Returns:
		A list with one dict per remaining row.  When a row has an
		``areaLand`` column, its value is replaced by ``fix_area``'s result.
	"""
	data = []

	with open(filename, "r") as f:
		reader = csv.DictReader(f)

		# The next() builtin works on Python 2 and 3 (reader.next() is
		# Python 2 only); the default keeps a short file from raising.
		for _ in range(3):
			next(reader, None)

		for line in reader:
			if "areaLand" in line:
				line["areaLand"] = fix_area(line["areaLand"])
			# Rows were never collected before, so the function always
			# returned an empty list.
			data.append(line)

	return data

def test():
	"""Run process_file on CITIES, show three sample areas and check known values."""
	data = process_file(CITIES)

	# Call form of print: same output on Python 2 and Python 3.
	print("Printing three example results:")
	for n in range(5, 8):
		pprint.pprint(data[n]["areaLand"])

	# Identity is the reliable comparison for None.
	assert data[3]["areaLand"] is None
	assert data[8]["areaLand"] == 55166700.0
	assert data[20]["areaLand"] == 14581600.0
	assert data[33]["areaLand"] == 20564500.0

# Run the checks only when this file is executed as a script.
if __name__ == "__main__":
	test()

Data Quality

import codecs
import csv
import json
import pprint

# Path of the CSV file that test() passes to audit_file().
CITIES = 'cities.csv'

# Column names that test() passes to audit_file().
FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

def _value_type(value):
	"""Return the Python type that the raw CSV text ``value`` stands for."""
	if value is None:
		return type(None)
	text = value.strip()
	if text == "NULL" or text == "":
		return type(None)
	if text.startswith("{"):
		return list
	# Try the narrower numeric type first.
	for caster in (int, float):
		try:
			caster(text)
		except ValueError:
			continue
		return caster
	return str

def audit_file(filename, fields):
	"""Collect, for each name in ``fields``, the set of value types seen in ``filename``.

	The function used to return an empty dict, so test() in this file failed
	with KeyError.  Each cell is classified as: "NULL"/empty -> NoneType,
	text starting with "{" -> list, integer text -> int, other numeric text
	-> float, anything else -> str.
	NOTE(review): these classification rules and the three skipped rows
	after the header are assumptions about the data file -- confirm.

	Returns:
		A dict mapping every field name to a set of types.
	"""
	fieldtypes = {}
	for field in fields:
		fieldtypes[field] = set()

	with open(filename, "r") as f:
		reader = csv.DictReader(f)
		for _ in range(3):
			next(reader, None)
		for row in reader:
			for field in fields:
				fieldtypes[field].add(_value_type(row.get(field)))

	return fieldtypes

def test():
	"""Audit CITIES for the columns in FIELDS, print the result and check two columns."""
	observed = audit_file(CITIES, FIELDS)
	pprint.pprint(observed)

	none_type = type(None)
	assert observed["areaLand"] == {float, list, none_type}
	assert observed['areaMetro'] == {float, none_type}

# Run the checks only when this file is executed as a script.
if __name__ =="__main__":
	test()

Auditing Uniformity

import csv
import pprint

# Column whose values the script below audits.
fieldname = "wgs84_pos#lat"
# Exclusive lower and upper bounds used by the out-of-range check.
minval = -90
maxval = 90

def skip_lines(input_file, skip):
	"""Advance ``input_file`` past its next ``skip`` items, discarding them."""
	for _unused in range(skip):
		next(input_file)

def is_number(s):
	"""Return True when ``s`` can be converted with ``float``, otherwise False."""
	try:
		float(s)
	except (TypeError, ValueError):
		# TypeError covers non-text input such as None.
		return False
	return True

def audit_float_field(v, counts):
	"""Classify one raw field value and update ``counts`` in place.

	The range check used to sit in an unreachable ``else`` of is_number(),
	where ``v`` was undefined.  It is restored here as the function that the
	script below calls for every row.

	Args:
		v: raw text of the field.
		counts: dict with the keys "nulls", "empties" and "arrays".

	NOTE(review): treating text that starts with "{" as an array, and the
	"Found non number" message, are reconstructions -- confirm.
	"""
	v = v.strip()
	if v == "NULL":
		counts['nulls'] += 1
	elif v == "":
		counts['empties'] += 1
	elif v.startswith("{"):
		counts['arrays'] += 1
	elif not is_number(v):
		print("Found non number: %s" % v)
	else:
		v = float(v)
		# minval and maxval are the module-level bounds (both exclusive).
		if not ((minval < v) and (v < maxval)):
			print("Found out of range value: %s" % v)

if __name__ == "__main__":
	# Tallies updated by audit_float_field().  The first entry used ";"
	# instead of ":", which is a syntax error.
	counts = {"nulls" : 0, "empties" : 0, "arrays" : 0}
	nrows = 0
	# The with-block closes the CSV file once the audit is finished.
	with open("cities3.csv") as csv_file:
		input_file = csv.DictReader(csv_file)
		skip_lines(input_file, 3)
		for row in input_file:
			audit_float_field(row[fieldname], counts)
			nrows += 1
	# %-formatting yields the same "label value" text on Python 2 and 3.
	print("num cities: %s" % nrows)
	print("nulls: %s" % counts['nulls'])
	print("empties: %s" % counts['empties'])
	print("arrays: %s" % counts['arrays'])

Auditing Accuracy

# MongoClient was used below without being imported in this snippet.
from pymongo import MongoClient

# Connection to the MongoDB server on the default local port, and the
# "examples" database that audit_country() queries.
client = MongoClient("mongodb://localhost:27017")
db = client.examples

def skip_lines(input_file, skip):
	"""Discard the next ``skip`` items of ``input_file``."""
	for _unused in range(skip):
		next(input_file)

def audit_country(input_file):
	"""Report each ``country_label`` not matched by exactly one ``db.countries`` document.

	Rows whose label is "NULL" or blank after stripping are skipped.
	"""
	for row in input_file:
		country = row['country_label'].strip()
		if country in ("NULL", ""):
			continue
		# Two fixes: the collection name was misspelled "countires", and the
		# filter used ";" where a dict literal needs ":".
		if db.countries.find({ "name" : country }).count() != 1:
			# %-formatting yields the same text on Python 2 and 3.
			print("Not found: %s" % country)

if __name__ == '__main__':
	# The guard had no body, which is a syntax error.
	# NOTE(review): this driver follows the pattern of the other audit
	# scripts in this file; the input file name is an assumption -- confirm.
	import csv

	with open("cities.csv") as csv_file:
		input_file = csv.DictReader(csv_file)
		skip_lines(input_file, 3)
		audit_country(input_file)

Correcting Validity

import csv
import pprint

# Paths that test() passes to process_file(): the CSV to read, then the two
# CSV files to write.
INPUT_FILE = 'autos.csv'
OUTPUT_GOOD = 'autos-valid.csv'
OUTPUT_BAD = 'FIXME-autos.csv'

def process_file(input_file, output_good, output_bad, is_valid=None):
	"""Split the rows of ``input_file`` into a "good" and a "bad" CSV file.

	Fixes over the previous version: rows are read while the input file is
	still open, the undefined placeholder ``YOURDATA`` is gone, and
	``output_bad`` is actually written.

	Args:
		input_file: path of the CSV file to read.
		output_good: path written with the header plus every accepted row.
		output_bad: path written with the header plus every rejected row.
		is_valid: optional callable that takes a row dict and returns a
			truthy value for rows to accept.
			NOTE(review): the validity rule is not part of this file; when
			omitted, every row is accepted -- supply the real rule here.
	"""
	with open(input_file, "r") as f:
		reader = csv.DictReader(f)
		# An empty input file has no header; fall back to no columns.
		header = reader.fieldnames or []
		rows = list(reader)

	if is_valid is None:
		good_rows, bad_rows = rows, []
	else:
		good_rows, bad_rows = [], []
		for row in rows:
			if is_valid(row):
				good_rows.append(row)
			else:
				bad_rows.append(row)

	# Both outputs are written the same way: header first, then the rows.
	for path, selected in ((output_good, good_rows), (output_bad, bad_rows)):
		with open(path, "w") as g:
			writer = csv.DictWriter(g, delimiter=",", fieldnames = header)
			writer.writeheader()
			for row in selected:
				writer.writerow(row)

	def test():
		process_file(INPUT_FILE, OUTPUT_GOOD, OUTPUT_BAD)

	if __name__ == "__main__":

Population density

def ensure_float(v):
	"""Return ``float(v)`` when ``is_number(v)`` holds; otherwise return None."""
	return float(v) if is_number(v) else None

def audit_population_density(input_file):
	"""Print the name of each row whose stated density is off by more than 10.

	For every row, ``populationTotal / areaLand`` is compared with
	``populationDensity``; the row is reported when the absolute difference
	exceeds 10.
	"""
	# ``math`` is not imported in this part of the file, so fabs() below
	# raised NameError; import it locally.
	import math

	for row in input_file:
		population = ensure_float(row['populationTotal'])
		area = ensure_float(row['areaLand'])
		population_density = ensure_float(row['populationDensity'])
		# Rows with a missing or zero value are skipped, which also avoids
		# dividing by a zero area.
		if population and area and population_density:
			calculated_density = population / area
			if math.fabs(calculated_density - population_density) > 10:
				# Two spaces before %s reproduce the Python 2 print output
				# exactly (trailing space in the text plus the separator).
				print("Possibly bad population density for  %s" % row['name'])

if __name__ == '__main__':
	# The file used to be opened without ever being closed; the with-block
	# closes it once the audit is finished.
	with open("cities.csv") as csv_file:
		input_file = csv.DictReader(csv_file)
		skip_lines(input_file, 3)
		audit_population_density(input_file)