def mapper():
    """Emit one ``word<TAB>1`` record per word read from standard input.

    Each input line is split on runs of whitespace. Every token has its
    ASCII punctuation (``string.punctuation``) removed and is lower-cased
    before being written to standard output as ``<word>\\t1``. Tokens that
    consist solely of punctuation produce no record.

    Relies on the module-level ``sys`` and ``string`` imports.
    """
    # Build the lookup set once instead of scanning the punctuation string
    # for every character of every token.
    punctuation = frozenset(string.punctuation)

    for line in sys.stdin:
        # split() with no argument collapses repeated whitespace and drops
        # the trailing newline, so no separate strip() is required.
        for token in line.split():
            word = "".join(ch for ch in token if ch not in punctuation).lower()
            if not word:
                # e.g. "--" or "***": nothing left to count.
                continue
            # Single parenthesised argument keeps this valid on both
            # Python 2 (print statement) and Python 3 (print function).
            print("{0}\t{1}".format(word, 1))


if __name__ == "__main__":
    # Only consume stdin when executed as a script, not when imported.
    mapper()
Reduce stage -> reducer
import sys


def reducer():
    """Sum the counts of consecutive identical keys read from standard input.

    Input lines are expected in the form ``<key>\\t<count>`` and must already
    be grouped by key (for example by piping the mapper output through
    ``sort``). For each run of identical keys one line ``<key>\\t<total>`` is
    written to standard output. Lines that do not contain exactly two
    tab-separated fields are ignored.

    Raises:
        ValueError: if a count field is not an integer literal.
    """
    word_count = 0
    old_key = None

    for line in sys.stdin:
        data = line.strip().split("\t")
        if len(data) != 2:
            # Malformed record (missing or extra tab): skip it.
            continue

        this_key, count = data

        # A change of key means the previous run is complete: flush it.
        if old_key is not None and old_key != this_key:
            print("{0}\t{1}".format(old_key, word_count))
            word_count = 0

        old_key = this_key
        # Counts are whole numbers, so accumulate as int to avoid "2.0".
        word_count += int(count)

    # Flush the final run, if any input was seen at all.
    if old_key is not None:
        print("{0}\t{1}".format(old_key, word_count))


if __name__ == "__main__":
    # Only consume stdin when executed as a script, not when imported.
    reducer()
#!/bin/bash
# Local run of the word-count job: feed the input text to the mapper, sort
# the mapper output so identical keys are adjacent, then feed it to the reducer.
#
# NOTE(review): the input file name is kept exactly as spelled in the
# original script; confirm it matches the file on disk.

# Make the pipeline's exit status reflect a failure in ANY stage, not just
# the last one, so a crashing mapper is not masked by the reducer.
set -o pipefail

# Redirect the file into the mapper directly instead of spawning `cat`.
python word_count_mapper.py < ../../data/aliceInWorderland.txt \
    | sort \
    | python word_count_reducer.py