def get_db(db_name):
    """Return the database called *db_name* from the server at localhost:27017.

    pymongo is imported inside the function, so it is only needed when the
    function is actually called.
    """
    from pymongo import MongoClient

    return MongoClient('localhost:27017')[db_name]
def make_pipeline():
    """Build the aggregation pipeline.

    No stages are defined yet, so a new empty list is returned on each call.
    """
    return []
def aggregate(db, pipeline):
    """Run *pipeline* on ``db.tweets`` and return what it yields as a list."""
    return list(db.tweets.aggregate(pipeline))
if __name__ == '__main__':
    import pprint

    # Run the pipeline against the local 'twitter' database and check that
    # exactly one document with the expected follower count comes back.
    db = get_db('twitter')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)
    assert len(result) == 1
    assert result[0]["followers"] == 17209
twitter data-set
{
"_id" : ObjectId("xxxx"),
"text" : "Something interesting ...",
"entities" : {
"user_mentions" : [
{
"screen_name" : "somebody_else",
...
}
],
"urls" : [],
"hashtags": []
},
"user" : {
"friends_count" : 544,
"screen_name" : "somebody",
"followers_count" : 100,
}
}
from pymongo import MongoClient
import pprint
# Module-level connection; `db` is the 'twitter' database on the local server.
client = MongoClient("mongodb://localhost:27017")
db = client["twitter"]
def most_tweets():
    """Count tweets per ``user.screen_name``, sorted by count descending.

    Returns whatever ``db.tweets.aggregate`` returns for that pipeline.
    """
    group_by_user = {
        "$group": {
            "_id": "$user.screen_name",
            "count": {"$sum": 1},
        }
    }
    highest_first = {"$sort": {"count": -1}}
    return db.tweets.aggregate([group_by_user, highest_first])
if __name__ == "__main__":
    # Pretty-print the per-user tweet counts.
    result = most_tweets()
    pprint.pprint(result)
Insert into the DB
import json
def insert_data(data, db):
    """Insert *data* into the ``arachnid`` collection of *db*.

    Args:
        data: a document, or a list of documents, handed unchanged to
            ``Collection.insert`` (the insertion call used elsewhere in
            this file).
        db: database handle exposing an ``arachnid`` collection.

    Returns:
        None.
    """
    db.arachnid.insert(data)
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client["examples"]

    # Load the JSON export, store it, then show one stored document.
    with open('arachnid.json') as f:
        data = json.load(f)
    insert_data(data, db)
    print(db.arachnid.find_one())
Preparing data
import codecs
import csv
import json
import pprint
import re
# CSV file read by test() through process_file().
DATAFILE = 'arachnid.csv'

# Source field name -> short field name.
# Presumably the keys are CSV column headers; confirm against the data file.
FIELDS = {
    'rdf-schema#label': 'label',
    'URI': 'uri',
    'rdf-schema#comment': 'description',
    'synonym': 'synonym',
    'name': 'name',
    'family_label': 'family',
    'class_label': 'class',
    'phylum_label': 'phylum',
    'order_label': 'order',
    'kingdom_label': 'kingdom',
    'genus_label': 'genus',
}
def process_file(filename, fields):
    """Read the CSV at *filename* and return the processed rows as a list.

    The three rows that follow the header are skipped. Row processing is not
    implemented yet, so the returned list is currently always empty.

    Args:
        filename: path of the CSV file to read.
        fields: mapping whose keys name the fields of interest.

    Returns:
        list: the processed entries (currently always ``[]``).
    """
    # Kept for the row-processing step that still has to be written.
    process_fields = fields.keys()
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three rows after the header (presumably metadata rows;
        # TODO confirm). next(reader, None) runs on Python 2 and 3, unlike
        # reader.next(), and does not raise on files shorter than three rows.
        for _ in range(3):
            next(reader, None)
        for line in reader:
            # TODO: build an entry from `line` using `process_fields` and
            # append it to `data`.
            pass
    return data
def parse_array(v):
    """Split a ``{a|b|c}``-style string into a list of stripped items.

    A value that starts with "{" and ends with "}" is treated as a
    "|"-separated array. Any other value, including the empty string, is
    returned unchanged as the only element of a one-item list.

    Args:
        v: the string to parse.

    Returns:
        list: the parsed items, or ``[v]`` when *v* is not brace-wrapped.
    """
    # startswith/endswith rather than v[0]/v[-1]: indexing an empty string
    # raises IndexError.
    if v.startswith("{") and v.endswith("}"):
        inner = v.lstrip("{").rstrip("}")
        return [item.strip() for item in inner.split("|")]
    return [v]
def test():
    """Check the output of ``process_file(DATAFILE, FIELDS)``.

    Prints the first entry, then asserts the number of entries and a few
    known values. Raises AssertionError when a check fails.
    """
    data = process_file(DATAFILE, FIELDS)
    print("your first entry:")
    pprint.pprint(data[0])
    first_entry = {
        "synonym": None,
        "name": "Argiope",
        "classification": {
            "kingdom": "Animal",
            "family": "Orb-weaver spider",
            "order": "Spider",
            "phylum": "Arthropod",
            "genus": None,
            "class": "Arachnid"
        },
        "uri": "http://dbpedia.org/resource/Argiope_(spider)",
        "label": "Argiope",
        "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced."
    }

    # Comparison needs `==`; the original `=` was a syntax error.
    assert len(data) == 76
    assert data[0] == first_entry
    assert data[17]["name"] == "Ogdenia"
    assert data[48]["label"] == "Hydrachnidiae"
    assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"]
if __name__ == "__main__":
    # Call test(); a bare `test` only names the function without running it.
    test()
Range Queries
#!/usr/bin/env python
import pprint
# Module-level connection; `db` is the 'examples' database on the local server.
# NOTE(review): this section only imports pprint; MongoClient has to be in
# scope from an earlier `from pymongo import MongoClient`.
client = MongoClient("mongodb://localhost:27017")
db = client["examples"]
def find():
    """Pretty-print every city with a population above 250000, then the count."""
    query = {"population": {"$gt": 250000}}
    num_cities = 0  # stays 0 when the cursor yields nothing
    for num_cities, city in enumerate(db.cities.find(query), 1):
        pprint.pprint(city)
    print("\nNumber of cities matching: %d\n" % num_cities)
from datetime import datetime
def range_query():
    """Return the filter document for the cities query.

    No conditions are defined yet, so a new empty dict is returned.
    """
    return {}
def get_db():
    """Return the ``examples`` database from the server at localhost:27017."""
    from pymongo import MongoClient

    return MongoClient('localhost:27017').examples
if __name__ == "__main__":
    import pprint

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    # Report how many cities matched, then show the first one.
    print("Found cities: %s" % cities.count())
    pprint.pprint(cities[0])
Multiple Field
from pymongo import MongoClient
import pprint
# Module-level connection; `db` is the 'examples' database on the local server.
client = MongoClient("mongodb://localhost:27017")
db = client["examples"]
def find():
    """Pretty-print every auto whose manufacturer is Toyota and class is mid-size car."""
    criteria = {"manufacturer": "Toyota", "class": "mid-size car"}
    for auto in db.autos.find(criteria):
        pprint.pprint(auto)
if __name__ == "__main__":
    # Run the Toyota mid-size query when executed as a script.
    find()
#!/usr/bin/env python
from autos import process_file
def insert_autos(infile, db):
    """Load the autos from *infile* and insert them into ``db.autos``.

    Args:
        infile: path handed to ``process_file``.
        db: database handle exposing an ``autos`` collection.

    Returns:
        None.
    """
    # NOTE(review): assumes process_file returns a document or a list of
    # documents, which is what Collection.insert accepts -- confirm against
    # the autos module.
    data = process_file(infile)
    db.autos.insert(data)
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client["examples"]

    # Load the sample file, then show one document from the collection.
    insert_autos('autos-small.csv', db)
    print(db.autos.find_one())
PyMongo
from pymongo import MongoClient
import pprint
# Connect to the local server and select the 'examples' database.
client = MongoClient('mongodb://localhost:27017/')
db = client.examples

# Example document with scalar, array and embedded-document fields.
tesla_s = {
    "manufacturer": "Tesla Motors",
    "class": "full-size",
    "body style": "5-door liftback",
    "production": [2012, 2013],
    "model years": [2013],
    "layout": ["Rear-motor", "rear-wheel drive"],
    "designer": {
        "firstname": "Franz",
        "surname": "von Holzhusen",
    },
}

# Store the document, then dump everything in the collection.
db.autos.insert(tesla_s)
for a in db.autos.find():
    pprint.pprint(a)
query
from pymongo import MongoClient
import pprint
# Module-level connection; `db` is the 'examples' database on the local server.
client = MongoClient('mongodb://localhost:27017/')
db = client["examples"]
def find():
    """Pretty-print every auto whose manufacturer is Toyota."""
    toyota_only = {"manufacturer": "Toyota"}
    for auto in db.autos.find(toyota_only):
        pprint.pprint(auto)
if __name__ == "__main__":
    # Run the Toyota query when executed as a script.
    find()
#!/usr/bin/env python
def porsche_query():
    """Return the filter document for the autos query.

    No conditions are defined yet, so a new empty dict is returned.
    """
    return {}
def get_db(db_name):
    """Return the database called *db_name* from the server at localhost:27017."""
    from pymongo import MongoClient

    connection = MongoClient('localhost:27017')
    return connection[db_name]
def find_porsche(db, query):
    """Return the result of running *query* through ``db.autos.find``."""
    autos = db.autos
    return autos.find(query)
if __name__ == "__main__":
    import pprint

    db = get_db('examples')
    query = porsche_query()
    results = find_porsche(db, query)

    print("Printing first 3 result\n")
    # Iterate `results`; the original `rsults` was an undefined name.
    for car in results[:3]:
        pprint.pprint(car)
Data modeling
{
"manufacturer" : "Tesla Motors",
"class" : "full-size",
"body style" : "5-door liftback"
}
production fields
{
"production" : [2012, 2013],
"model years" : [2013],
"layout" : ["Rear-motor", "rear-wheel drive"]
}
designer
{
"designer" : {
"firstname" : "Franz",
"surname" : "von Holzhusen"
}
}
{
"assembly" : [
{
"country" : "United States",
"city" : "Fremont",
"state" : "California"
},
{
"country" : "The Netherlands",
"city" : "Tilburg"
}
]
}
{
"manufacturer" : "Tesla Motors",
"class" : "full-size",
"body style" : "5-door liftback",
"production" : [2012, 2013],
"model years" : [2013],
"layout" : ["Rear-motor", "rear-wheel drive"],
"designer" : {
"firstname" : "Franz",
"surname" : "von Holzhusen"
},
"assembly" : [
{
"country" : "United States",
"city" : "Fremont",
"state" : "California"
}]
}
def add_city(db):
    """Insert a document with the name "Chicago" into ``db.cities``."""
    chicago = {"name": "Chicago"}
    db.cities.insert(chicago)
def get_city(db):
    """Return what ``db.cities.find_one()`` gives back."""
    cities = db.cities
    return cities.find_one()
def get_db():
    """Return the ``examples`` database from the server at localhost:27017."""
    from pymongo import MongoClient

    connection = MongoClient('localhost:27017')
    return connection.examples
if __name__ == "__main__":
    # Obtain the handle from get_db(); this section never assigned `db`
    # before using it.
    db = get_db()
    add_city(db)
    print(get_city(db))
Fixing the area
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'


def fix_area(area):
    """Placeholder for cleaning one ``areaLand`` value.

    Currently returns *area* unchanged.
    NOTE(review): test() expects ``None`` and float results, so the actual
    conversion still has to be implemented here.
    """
    return area


def process_file(filename):
    """Read the CSV at *filename* and return its rows as a list of dicts.

    The three rows that follow the header are skipped. For every remaining
    row that has an "areaLand" key, the value is passed through
    ``fix_area``.

    Args:
        filename: path of the CSV file to read.

    Returns:
        list: one dict per data row, in file order.
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # Skip the three rows after the header (presumably metadata rows;
        # TODO confirm). next(reader, None) runs on Python 2 and 3 and does
        # not raise on files shorter than three rows.
        for _ in range(3):
            next(reader, None)
        for line in reader:
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            # Keep the row; without this the function always returned [].
            data.append(line)
    return data


def test():
    """Print three sample ``areaLand`` values and check four known ones."""
    data = process_file(CITIES)

    print("Printing three example results:")
    for n in range(5, 8):
        pprint.pprint(data[n]["areaLand"])

    assert data[3]["areaLand"] is None
    assert data[8]["areaLand"] == 55166700.0
    assert data[20]["areaLand"] == 14581600.0
    assert data[33]["areaLand"] == 20564500.0


if __name__ == "__main__":
    test()
Data Quality
import codecs
import csv
import json
import pprint
# CSV file audited by test().
CITIES = 'cities.csv'

# Field names handed to audit_file().
FIELDS = [
    "name",
    "timeZone_label",
    "utcOffset",
    "homepage",
    "governmentType_label",
    "isPartOf_label",
    "areaCode",
    "populationTotal",
    "elevation",
    "maximumElevation",
    "minimumElevation",
    "populationDensity",
    "wgs84_pos#lat",
    "wgs84_pos#long",
    "areaLand",
    "areaMetro",
    "areaUrban",
]
def audit_file(filename, fields):
    """Return a dict of field types for the file at *filename*.

    Auditing is not implemented yet: neither argument is read and a new
    empty dict is returned.
    """
    return {}
def test():
    """Audit ``CITIES`` and check the types recorded for two area fields."""
    fieldtypes = audit_file(CITIES, FIELDS)
    pprint.pprint(fieldtypes)

    none_type = type(None)
    assert fieldtypes["areaLand"] == set([float, list, none_type])
    assert fieldtypes['areaMetro'] == set([float, none_type])
if __name__ == "__main__":
    # Run the audit checks when executed as a script.
    test()