def get_db(db_name):
    """Return a handle to the database named `db_name` on localhost:27017."""
    # Imported inside the function, so pymongo is only needed when a
    # connection is actually requested.
    from pymongo import MongoClient

    connection = MongoClient('localhost:27017')
    return connection[db_name]


def make_pipeline():
    """Return the aggregation pipeline: a list of stages, currently empty."""
    stages = []
    return stages


def aggregate(db, pipeline):
    """Run `pipeline` on ``db.tweets`` and return every result as a list."""
    return list(db.tweets.aggregate(pipeline))


if __name__ == '__main__':
    import pprint

    db = get_db('twitter')
    pipeline = make_pipeline()
    result = aggregate(db, pipeline)

    # NOTE(review): presumably these checks only pass once make_pipeline()
    # returns real stages -- confirm against the intended exercise.
    assert len(result) == 1
    assert result[0]["followers"] == 17209
Category: Python
Insert into the DB
import json


def insert_data(data, db):
    """Insert `data` into the ``arachnid`` collection of `db`.

    Args:
        data: the document(s) to store. Presumably a list of dicts as
            loaded from ``arachnid.json`` -- confirm against that file.
        db: a PyMongo database handle exposing an ``arachnid`` collection.

    Returns:
        None.
    """
    # Nothing to insert for empty input, so no insert call is made.
    if data:
        # Legacy PyMongo ``Collection.insert`` (removed in PyMongo 4, where
        # ``insert_one`` / ``insert_many`` replace it).
        db.arachnid.insert(data)


if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open('arachnid.json') as f:
        data = json.load(f)
    insert_data(data, db)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(db.arachnid.find_one())
Preparing data
import codecs import csv import json import pprint import re DATAFILE = 'arachnid.csv' FIELDS ={'rdf-schema#label': 'label', 'URI': 'uri', 'rdf-schema#comment': 'description', 'synonym': 'synonym', 'name': 'name', 'family_label': 'family', 'class_label': 'class', 'phylum_label': 'phylum', 'order_label': 'order', 'kingdom_label': 'kingdom', 'genus_label': 'genus'} def process_file(filename, fields): process_fields = fields.keys() data = [] with open(filename, "r") as f: reader = csv.DictReader(f) for i in range(3): l = reader.next() for line in reader: pass return data def parse_array(v): if(v[0] == "{") and (v[-1] == "}"): v = v.lstrip("{") v = v.rstrip("}") v_array = v.split("|") v_array = [i.strip() for i in v_array] return v_array return [v] def test(): data = process_file(DATAFILE, FIELDS) print "your first entry:" pprint.pprint(data[0]) first_entry = { "synonym": None, "name": "Argiope", "classification" : { "kingdom":"Animal", "family":"Orb-weaver spider", "order": "Spider", "phylum": "Arthropod", "genus": None, "class": "Arachnid" }, "uri": "http://dbpedia.org/resource/Argiope_(spider)", "label":"Argiope", "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly coloured abdomen. These spiders are distributed throughout the world. Most countries in tropical or temperate climates host one or more species that are similar in appearance. The etymology of the name is from a Greek name meaning silver-faced." } assert len(data) = 76 assert data[0] == first_entry assert data[17]["name"] == "Ogdenia" assert data[48]["label"] == "Hydrachnidiae" assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"] if __name__ == "__main__" test
Range Queries
#!/usr/bin/env python
from pymongo import MongoClient
import pprint

client = MongoClient("mongodb://localhost:27017")
db = client.examples


def find():
    """Pretty-print every city with population > 250000, then the count.

    Reads from the ``cities`` collection of the module-level `db`.
    """
    # Range query: "$gt" matches values strictly greater than the operand.
    query = {"population": {"$gt": 250000}}
    cities = db.cities.find(query)

    num_cities = 0
    for c in cities:
        pprint.pprint(c)
        num_cities += 1

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("\nNumber of cities matching: %d\n" % num_cities)


if __name__ == '__main__':
    find()
from datetime import datetime


def range_query():
    """Return the query document passed to ``db.cities.find``.

    Currently an empty document; no range condition is applied yet.
    """
    query = {}
    return query


def get_db():
    """Return a handle to the ``examples`` database on localhost:27017."""
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":
    import pprint

    db = get_db()
    query = range_query()
    cities = db.cities.find(query)

    num_found = cities.count()
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Found cities: %s" % num_found)
    # Indexing an empty cursor raises IndexError, so only show the first
    # match when there is one.
    if num_found:
        pprint.pprint(cities[0])
Multiple Field
from pymongo import MongoClient
import pprint

client = MongoClient("mongodb://localhost:27017")
db = client.examples


def find():
    """Pretty-print every auto that matches both field conditions."""
    # Both key/value pairs must match for a document to be returned.
    criteria = {"manufacturer": "Toyota", "class": "mid-size car"}
    for auto in db.autos.find(criteria):
        pprint.pprint(auto)


if __name__ == '__main__':
    find()
#!/usr/bin/env python
from autos import process_file


def insert_autos(infile, db):
    """Process `infile` and insert the result into ``db.autos``.

    Args:
        infile: path handed to ``process_file`` from the ``autos`` module.
        db: a PyMongo database handle exposing an ``autos`` collection.

    Returns:
        None.
    """
    # NOTE(review): process_file is assumed to return a list of documents
    # ready for insertion -- confirm against the autos module.
    data = process_file(infile)
    # Nothing to insert for an empty result, so no insert call is made.
    if data:
        # Legacy PyMongo ``Collection.insert`` (removed in PyMongo 4, where
        # ``insert_one`` / ``insert_many`` replace it).
        db.autos.insert(data)


if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    insert_autos('autos-small.csv', db)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(db.autos.find_one())
PyMongo
from pymongo import MongoClient
import pprint

client = MongoClient('mongodb://localhost:27017/')

# Example document: scalar fields, array fields and one nested sub-document.
tesla_s = {
    "manufacturer": "Tesla Motors",
    "class": "full-size",
    "body style": "5-door liftback",
    "production": [2012, 2013],
    "model years": [2013],
    "layout": ["Rear-motor", "rear-wheel drive"],
    "designer": {
        "firstname": "Franz",
        "surname": "von Holzhusen",
    },
}

db = client.examples
db.autos.insert(tesla_s)

# Show everything now stored in the autos collection.
for auto in db.autos.find():
    pprint.pprint(auto)
query
from pymongo import MongoClient
import pprint

client = MongoClient('mongodb://localhost:27017/')
db = client.examples


def find():
    """Pretty-print every auto whose manufacturer is "Toyota"."""
    criteria = {"manufacturer": "Toyota"}
    for auto in db.autos.find(criteria):
        pprint.pprint(auto)


if __name__ == '__main__':
    find()
#!/usr/bin/env python


def porsche_query():
    """Return the query document used by find_porsche.

    Currently an empty document; no condition is applied yet.
    """
    query = {}
    return query


def get_db(db_name):
    """Return a handle to the database named `db_name` on localhost:27017."""
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db


def find_porsche(db, query):
    """Return the result of running `query` on ``db.autos``."""
    return db.autos.find(query)


if __name__ == "__main__":
    import pprint

    db = get_db('examples')
    query = porsche_query()
    results = find_porsche(db, query)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Printing first 3 result\n")
    # Fixed: the loop previously iterated over the undefined name `rsults`.
    for car in results[:3]:
        pprint.pprint(car)
Data modeling
{ "manufacturer" : "Tesla Motors", "class" : "full-size", "body style" : "5-door liftback" }
production fields
{ "production" : [2012, 2013], "model years" : [2013], "layout" : ["Rear-motor", "rear-wheel drive"] }
designer
{ "designer" : { "firstname" : "Franz", "surname" : "von Holzhusen" } }
{ "assembly" : [ { "country" : "United States", "city" : "Fremont", "state" : "California" }, { "country" : "The Netherlands", "city" : "Tilburg" } ] }
{ "manufacturer" : "Tesla Motors", "class" : "full-size", "body style" : "5-door liftback", "production" : [2012, 2013], "model years" : [2013], "layout" : ["Rear-motor", "rear-wheel drive"], "designer" : { "firstname" : "Franz", "surname" : "von Holzhusen" }, "assembly" : [ { "country" : "United States", "city" : "Fremont", "state" : "California" } ] }
def add_city(db):
    """Insert the document ``{"name": "Chicago"}`` into ``db.cities``."""
    db.cities.insert({"name": "Chicago"})


def get_city(db):
    """Return whatever ``db.cities.find_one()`` yields."""
    return db.cities.find_one()


def get_db():
    """Return a handle to the ``examples`` database on localhost:27017."""
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client.examples
    return db


if __name__ == "__main__":
    # Fixed: `db` was used here without ever being created.
    db = get_db()
    add_city(db)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(get_city(db))
Fixing the area
#!/usr/bin/env python # -*- coding: utf-8 -*- import codecs import csv import json import pprint CITIES = 'cities.csv' def fix_area(area): return area def process_file(filename): data = [] with open(filename, "r") as f: reader = csv.DictReader(f) for i in range(3): l = reader.next() for line in reader: if "areaLand" in line: line["areaLand"] = fix_area(line["areaLand"]) return data def test(): data = process_file(CITIES) print "Printing three example results:" for n in range(5, 8): pprint.pprint(data[n]["areaLand"]) assert data[3]["areaLand"] == None assert data[8]["areaLand"] == 55166700.0 assert data[20]["areaLand"] == 14581600.0 assert data[33]["areaLand"] == 20564500.0 if __name__ == "__main__": test()
Data Quality
import codecs import csv import json import pprint CITIES = 'cities.csv' FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label", "isPartOf_label", "areaCode", "populationTotal", "elevation", "maximumElevation", "minimumElevation", "populationDensity", "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"] def audit_file(filename, fields): fieldtypes = {} return fieldtypes def test(): fieldtypes = audit_file(CITIES, FIELDS) pprint.pprint(fieldtypes) assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)]) assert fieldtypes['areaMetro'] == set([type(1.1), type(None)]) if __name__ =="__main__": test()
Auditing Uniformity
import csv
import pprint

fieldname = "wgs84_pos#lat"
minval = -90
maxval = 90


def skip_lines(input_file, skip):
    """Advance the iterator `input_file` past its next `skip` items."""
    for i in range(0, skip):
        next(input_file)


def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise."""
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True


def audit_float_field(v, counts):
    """Classify one raw field value and report values that are not valid.

    Nulls, empties and arrays are tallied in `counts`; anything else must
    parse as a float strictly between `minval` and `maxval`, otherwise a
    message is printed.

    Args:
        v: the raw string value of the field.
        counts: dict with integer entries 'nulls', 'empties' and 'arrays',
            updated in place.
    """
    v = v.strip()
    # NOTE(review): assumes missing values are spelled "NULL" and array
    # values start with "{" -- confirm against the data file.
    if v == "NULL":
        counts['nulls'] += 1
    elif v == "":
        counts['empties'] += 1
    elif v.startswith("{"):
        counts['arrays'] += 1
    elif not is_number(v):
        print("Found non number: %s" % v)
    else:
        v = float(v)
        # Bounds are exclusive: a value equal to minval/maxval is reported.
        if not ((minval < v) and (v < maxval)):
            print("Found out of range value: %s" % v)


if __name__ == "__main__":
    counts = {"nulls": 0, "empties": 0, "arrays": 0}
    nrows = 0

    # The with-block closes the file once the audit is finished.
    with open("cities3.csv") as csv_file:
        input_file = csv.DictReader(csv_file)
        skip_lines(input_file, 3)
        for row in input_file:
            audit_float_field(row[fieldname], counts)
            nrows += 1

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("num cities: %s" % nrows)
    print("nulls: %s" % counts['nulls'])
    print("empties: %s" % counts['empties'])
    print("arrays: %s" % counts['arrays'])