import sys
import string
import logging
from util import mapper_logfile
# Configure the root logger for the mapper script: records at INFO level and
# above are written, message text only, to the file named by
# util.mapper_logfile. filemode='w' truncates that file on every run.
logging.basicConfig(filename=mapper_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def mapper():
    """Emit a tab-separated (field 3, field 8) pair per valid CSV row on stdin.

    Reads comma-separated lines from ``sys.stdin``. A row is skipped when it
    does not split into exactly 12 fields, or when its first field is the
    literal ``'Register'`` (treated as the header row). For every other row
    the fields at indices 3 and 8 are written to stdout as ``key<TAB>value``.

    Lines are split on bare commas; quoted fields containing commas are not
    handled specially.

    Returns:
        None. All output goes to stdout.
    """
    for line in sys.stdin:
        data = line.strip().split(",")
        # NOTE(review): the header sentinel is 'Register'; confirm it matches
        # the first header cell of the real input file, otherwise the header
        # row is emitted as if it were data.
        if len(data) != 12 or data[0] == 'Register':
            continue
        # presumably index 3 is the grouping key and index 8 the count that
        # the reducer sums -- verify against the dataset schema.
        # A parenthesised single-argument print produces the same text under
        # the Python 2 print statement and the Python 3 print function.
        print("{0}\t{1}".format(data[3], data[8]))
# Run the mapper over standard input. This call sits at module top level, so
# it executes whenever the file is run (or imported).
mapper()
import sys
import logging
from util import reducer_logfile
# Configure the root logger for the reducer script: records at INFO level and
# above are written, message text only, to the file named by
# util.reducer_logfile. filemode='w' truncates that file on every run.
logging.basicConfig(filename=reducer_logfile, format='%(message)s',
level=logging.INFO, filemode='w')
def reducer():
    """Sum the numeric value for each key in tab-separated lines from stdin.

    Each input line is expected to look like ``key<TAB>count``. Lines with
    the same key must arrive consecutively (for example, input sorted by
    key): a total is emitted as soon as the key changes, and once more after
    the last line. Totals are accumulated as floats and written to stdout as
    ``key<TAB>total``.

    Lines that do not split into exactly two tab-separated fields are
    skipped. Lines whose count is not parseable by ``float`` (such as a
    header row) are logged and skipped rather than aborting the run.

    Returns:
        None. All output goes to stdout.
    """
    aadhaar_generated = 0
    old_key = None
    for line in sys.stdin:
        data = line.strip().split("\t")
        if len(data) != 2:
            continue
        this_key, count = data
        # Parse before touching any state so a bad line cannot flush or
        # reset the running total of the current key.
        try:
            value = float(count)
        except ValueError:
            logging.warning("skipping line with non-numeric count: %r", line)
            continue
        # Key changed: flush the finished key's total and start a new one.
        if old_key is not None and old_key != this_key:
            print("{0}\t{1}".format(old_key, aadhaar_generated))
            aadhaar_generated = 0
        old_key = this_key
        aadhaar_generated += value
    # Flush the final key; nothing is printed when no valid line was seen.
    if old_key is not None:
        print("{0}\t{1}".format(old_key, aadhaar_generated))
# Run the reducer over standard input. This call sits at module top level, so
# it executes whenever the file is run (or imported).
reducer()
MapReduce programming model -> Hadoop!
(1) Hive, (2) Pig
Mahout, Giraph, Cassandra
Using MapReduce with subway data