import json
def insert_data(data, db):
    """Insert the parsed documents into the ``arachnid`` collection of *db*.

    Args:
        data: List of documents (dicts) loaded from the JSON file.
        db: PyMongo database handle.

    The collection name matches the ``db.arachnid.find_one()`` check made by
    the script's entry point, and ``Collection.insert`` matches the other
    PyMongo calls in this file.
    """
    # PyMongo raises on an empty bulk insert, so skip the call for no data.
    if data:
        db.arachnid.insert(data)
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    # json.load parses directly from the open file handle.
    with open('arachnid.json') as f:
        data = json.load(f)

    insert_data(data, db)
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(db.arachnid.find_one())
Preparing data
import codecs
import csv
import json
import pprint
import re
# CSV file that test() hands to process_file().
DATAFILE = 'arachnid.csv'

# Source column name -> field name; the values are the keys that appear in
# the expected entry inside test().
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}
def process_file(filename, fields):
    """Read *filename* as CSV and return the list of processed rows.

    Args:
        filename: Path of the CSV file to read.
        fields: Mapping whose keys name the columns of interest.

    Returns:
        list: The processed rows. Per-row processing is still a placeholder,
        so the list is currently always empty.
    """
    process_fields = fields.keys()  # not consumed yet by the placeholder loop
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Discard the three rows that follow the header. The built-in next()
        # works on Python 2.6+ and 3 (reader.next() is Python 2 only), and
        # the default stops a short file from raising StopIteration.
        for _ in range(3):
            next(reader, None)
        for line in reader:
            pass  # placeholder: row processing not implemented
    return data
def parse_array(v):
    """Split a ``{a|b|c}`` style value into a list of stripped items.

    Args:
        v: Raw field value (string).

    Returns:
        list: The ``|``-separated items with surrounding whitespace removed
        when *v* is wrapped in braces; otherwise a one-element list holding
        *v* unchanged.
    """
    # startswith/endswith are safe on the empty string, where indexing v[0]
    # would raise IndexError.
    if v.startswith("{") and v.endswith("}"):
        # Slice off exactly one brace per side; lstrip/rstrip would also eat
        # braces that belong to the first or last item.
        return [item.strip() for item in v[1:-1].split("|")]
    return [v]
def test():
    """Run process_file() on DATAFILE and check a handful of known entries."""
    data = process_file(DATAFILE, FIELDS)
    print("your first entry:")
    pprint.pprint(data[0])
    first_entry = {
        "synonym": None,
        "name": "Argiope",
        "classification": {
            "kingdom": "Animal",
            "family": "Orb-weaver spider",
            "order": "Spider",
            "phylum": "Arthropod",
            "genus": None,
            "class": "Arachnid"
        },
        "uri": "http://dbpedia.org/resource/Argiope_(spider)",
        "label": "Argiope",
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }

    # The original used "=" here, which is a syntax error inside an assert.
    assert len(data) == 76
    assert data[0] == first_entry
    assert data[17]["name"] == "Ogdenia"
    assert data[48]["label"] == "Hydrachnidiae"
    assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
# Run the checks when executed as a script. The guard needs its colon, and
# test must be called: a bare name is a no-op expression.
if __name__ == "__main__":
    test()
Range Queries
#!/usr/bin/env python
import pprint
# Connection to the local MongoDB server; find() below queries the
# "examples" database through this module-level handle.
client = MongoClient("mongodb://localhost:27017")
db = client.examples
def find():
    """Pretty-print each city whose population exceeds 250000, then the count.

    Reads the module-level ``db`` handle and queries its ``cities``
    collection.
    """
    query = {"population": {"$gt": 250000}}
    cities = db.cities.find(query)
    num_cities = 0
    for c in cities:
        pprint.pprint(c)
        num_cities += 1
    # A parenthesised single-argument print runs unchanged on Python 2 and 3.
    print("\nNumber of cities matching: %d\n" % num_cities)
from datetime import datetime
def range_query():
    """Build the filter document that the entry point passes to ``db.cities.find``.

    Returns:
        dict: A new, empty filter document.
    """
    return {}
def get_db():
    """Return the ``examples`` database of the MongoDB server at localhost:27017."""
    # Function-local import, as in the original.
    from pymongo import MongoClient

    return MongoClient('localhost:27017').examples
if __name__ == "__main__":
    import pprint

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    # %-formatting into a single argument prints the same text on Python 2
    # and 3; the two-argument print statement is Python 2 only.
    print("Found cities: %s" % cities.count())
    pprint.pprint(cities[0])
Multiple Field
from pymongo import MongoClient
import pprint
# Connect to the local MongoDB server and select the "examples" database
# that find() below reads from.
client = MongoClient("mongodb://localhost:27017")
db = client.examples
def find():
    """Pretty-print every ``autos`` document matching both fields below.

    Uses the module-level ``db`` handle.
    """
    criteria = {"manufacturer": "Toyota", "class": "mid-size car"}
    for auto in db.autos.find(criteria):
        pprint.pprint(auto)
# Script entry point: run the query only when executed directly.
if __name__ == '__main__':
    find()
#!/usr/bin/env python
from autos import process_file
def insert_autos(infile, db):
    """Parse *infile* and insert the resulting documents into ``db.autos``.

    Args:
        infile: Path handed to ``process_file`` (imported from ``autos``).
        db: PyMongo database handle.

    The original body parsed the file but never used the result or *db*.
    The ``autos`` collection is the one the entry point reads back with
    ``db.autos.find_one()``.
    """
    data = process_file(infile)
    # PyMongo raises on an empty bulk insert, so skip the call for no data.
    if data:
        db.autos.insert(data)
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('autos-small.csv', db)
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(db.autos.find_one())
PyMongo
from pymongo import MongoClient
import pprint
client = MongoClient('mongodb://localhost:27017/')
db = client.examples

# Sample document with scalar fields, array fields and one nested
# sub-document ("designer").
tesla_s = {
    "manufacturer": "Tesla Motors",
    "class": "full-size",
    "body style": "5-door liftback",
    "production": [2012, 2013],
    "model years": [2013],
    "layout": ["Rear-motor", "rear-wheel drive"],
    "designer": {
        "firstname": "Franz",
        "surname": "von Holzhusen",
    },
}

db.autos.insert(tesla_s)

# Print every document currently in the autos collection.
for a in db.autos.find():
    pprint.pprint(a)
query
from pymongo import MongoClient
import pprint
# Connect to the local MongoDB server and select the "examples" database
# used by find() below.
client = MongoClient('mongodb://localhost:27017/')
db = client.examples
def find():
    """Pretty-print every ``autos`` document whose manufacturer is "Toyota".

    Uses the module-level ``db`` handle.
    """
    criteria = {"manufacturer": "Toyota"}
    for auto in db.autos.find(criteria):
        pprint.pprint(auto)
# Script entry point: run the query only when executed directly.
if __name__ == '__main__':
    find()
#!/usr/bin/env python
def porsche_query():
    """Build the filter document that the entry point hands to find_porsche().

    Returns:
        dict: A new, empty filter document.
    """
    return {}
def get_db(db_name):
    """Return the database called *db_name* from the server at localhost:27017."""
    # Function-local import, as in the original.
    from pymongo import MongoClient

    return MongoClient('localhost:27017')[db_name]
def find_porsche(db, query):
    """Run *query* against ``db.autos`` and return whatever ``find`` returns."""
    matches = db.autos.find(query)
    return matches
if __name__ == "__main__":
    import pprint

    db = get_db('examples')
    query = porsche_query()
    results = find_porsche(db, query)

    print("Printing first 3 result\n")
    # The loop previously iterated over the misspelled name "rsults", which
    # raised NameError.
    for car in results[:3]:
        pprint.pprint(car)
Data modeling
{
"manufacturer" : "Tesla Motors",
"class" : "full-size",
"body style" : "5-door liftback"
}
production fields
{
"production" : [2012, 2013],
"model years" : [2013],
"layout" : ["Rear-motor", "rear-wheel drive"]
}
designer
{
"designer" : {
"firstname" : "Franz",
"surname" : "von Holzhusen"
}
}
{
"assembly" : [
{
"country" : "United States",
"city" : "Fremont",
"state" : "California"
},
{
"country" : "The Netherlands",
"city" : "Tilburg"
}
]
}
{
"manufacturer" : "Tesla Motors",
"class" : "full-size",
"body style" : "5-door liftback",
"production" : [2012, 2013],
"model years" : [2013],
"layout" : ["Rear-motor", "rear-wheel drive"],
"designer" : {
"firstname" : "Franz",
"surname" : "von Holzhusen"
},
"assembly" : [
{
"country" : "United States",
"city" : "Fremont",
"state" : "California"
},]
}
def add_city(db):
    """Insert a single ``{"name": "Chicago"}`` document into ``db.cities``."""
    chicago = {"name": "Chicago"}
    db.cities.insert(chicago)
def get_city(db):
    """Return the result of ``find_one()`` on ``db.cities``."""
    city = db.cities.find_one()
    return city
def get_db():
    """Return the ``examples`` database of the MongoDB server at localhost:27017."""
    # Function-local import, as in the original.
    from pymongo import MongoClient

    connection = MongoClient('localhost:27017')
    return connection.examples
if __name__ == "__main__":
    # get_db() was defined but never called, so this snippet had no ``db``
    # of its own to pass on.
    db = get_db()
    add_city(db)
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(get_city(db))
Fixing the area
#!/usr/bin/env python # -*- coding: utf-8 -*- import codecs import csv import json import pprint CITIES = 'cities.csv' def fix_area(area): return area def process_file(filename): data = [] with open(filename, "r") as f: reader = csv.DictReader(f) for i in range(3): l = reader.next() for line in reader: if "areaLand" in line: line["areaLand"] = fix_area(line["areaLand"]) return data def test(): data = process_file(CITIES) print "Printing three example results:" for n in range(5, 8): pprint.pprint(data[n]["areaLand"]) assert data[3]["areaLand"] == None assert data[8]["areaLand"] == 55166700.0 assert data[20]["areaLand"] == 14581600.0 assert data[33]["areaLand"] == 20564500.0 if __name__ == "__main__": test()
Data Quality
import codecs
import csv
import json
import pprint
# CSV file that test() passes to audit_file().
CITIES = 'cities.csv'

# Field names handed to audit_file() alongside CITIES.
FIELDS = [
    "name",
    "timeZone_label",
    "utcOffset",
    "homepage",
    "governmentType_label",
    "isPartOf_label",
    "areaCode",
    "populationTotal",
    "elevation",
    "maximumElevation",
    "minimumElevation",
    "populationDensity",
    "wgs84_pos#lat",
    "wgs84_pos#long",
    "areaLand",
    "areaMetro",
    "areaUrban",
]
def audit_file(filename, fields):
    """Return the audit result for *fields* of *filename*.

    Placeholder: neither argument is read yet and an empty dict is returned.
    test() expects a mapping from field name to a set of types.
    """
    return dict()
def test():
    """Audit CITIES and check the types recorded for two area fields."""
    observed = audit_file(CITIES, FIELDS)
    pprint.pprint(observed)

    none_type = type(None)
    assert observed["areaLand"] == set([float, list, none_type])
    assert observed['areaMetro'] == set([float, none_type])
# Script entry point: run the audit checks only when executed directly.
if __name__ == "__main__":
    test()
Auditing Uniformity
import csv
import pprint
# Column audited by the entry point, and the exclusive lower/upper bounds a
# numeric value in that column must fall between.
fieldname = "wgs84_pos#lat"
minval = -90
maxval = 90
def skip_lines(input_file, skip):
    """Advance the iterator *input_file* past its next *skip* items.

    Raises StopIteration if the iterator runs out first.
    """
    for _ in range(skip):
        next(input_file)
def is_number(s):
    """Return True when ``float(s)`` succeeds, False on ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    return True


def audit_float_field(v, counts):
    """Classify one raw field value and tally the non-numeric categories.

    NOTE(review): only the final ``else`` branch of this function was present
    in the file, fused onto ``is_number`` where it was unreachable and read
    an undefined ``v``. The header and the earlier branches are reconstructed
    from the driver's call ``audit_float_field(row[fieldname], counts)`` and
    its ``nulls`` / ``empties`` / ``arrays`` counters. Confirm them against
    the intended logic.

    Args:
        v: Raw string value of the audited column.
        counts: Dict holding ``nulls``, ``empties`` and ``arrays`` tallies;
            updated in place.
    """
    v = v.strip()
    if v == "NULL":
        counts['nulls'] += 1
    elif v == "":
        counts['empties'] += 1
    elif v.startswith("{"):
        counts['arrays'] += 1
    elif not is_number(v):
        print("Found non number: %s" % v)
    else:
        v = float(v)
        # Bounds are exclusive: a value equal to minval or maxval is reported.
        if not ((minval < v) and (v < maxval)):
            print("Found out of range value: %s" % v)
if __name__ == "__main__":
    # The original literal used ";" after "nulls", which is a syntax error.
    counts = {"nulls": 0, "empties": 0, "arrays": 0}
    nrows = 0

    # The with-block closes the CSV file once the audit has finished.
    with open("cities3.csv") as csv_file:
        input_file = csv.DictReader(csv_file)
        skip_lines(input_file, 3)
        for row in input_file:
            audit_float_field(row[fieldname], counts)
            nrows += 1

    # %-formatting into one argument prints the same text on Python 2 and 3.
    print("num cities: %s" % nrows)
    print("nulls: %s" % counts['nulls'])
    print("empties: %s" % counts['empties'])
    print("arrays: %s" % counts['arrays'])
Auditing Accuracy
# Connect to the local MongoDB server; audit_country() below queries the
# "examples" database through this module-level handle.
client = MongoClient("mongodb://localhost:27017")
db = client.examples
def skip_lines(input_file, skip):
    """Consume and discard the next *skip* items of the iterator *input_file*.

    StopIteration propagates if fewer than *skip* items remain.
    """
    for _unused in range(skip):
        next(input_file)
def audit_country(input_file):
    """Report each country label that does not match exactly one document.

    Args:
        input_file: Iterable of row dicts that have a ``country_label`` key.

    Each label is stripped; "NULL" and empty labels are skipped. Every other
    label is looked up by ``name`` in the ``countries`` collection of the
    module-level ``db``, and printed when the match count is not exactly 1.
    """
    for row in input_file:
        country = row['country_label'].strip()
        if country in ("NULL", ""):
            continue
        # Two fixes: the filter used ";" instead of ":" (a syntax error), and
        # the collection was spelled "countires".
        # NOTE(review): confirm the collection is really named "countries".
        if db.countries.find({"name": country}).count() != 1:
            print("Not found: %s" % country)
if __name__ == '__main__':