Tf – term frequency
Idf – inverse document frequency
make everything as simple as possible, but no simpler – Albert Einstein
#!/usr/bin/python
import sys
import reader
import poi_emails
def getToFromStrings(f):
f.seek(0)
to_string, from_string, cc_string = reader.getAddresses(f)
to_emails = reader.parseAddresses( to_string )
from_emails = reader.parseAddresses( from_string )
cc_emails = reader.parseAddresses( cc_string )
return to_emails, from_emails, cc_emails
def poiFlagEmail(f):
to_emails, from_emails, cc_emails = getToFromStrings(f)
poi_email_list = poi_emails.poiEmails()
to_poi = False
from_poi = False
cc_poi = False
if to_emails:
ctr = 0
while not to_poi and ctr < len(to_emails):
if to_emails[ctr] in poi_email_list:
to_poi = True
ctr += 1
if cc_emails:
ctr = 0
while not to_poi and ctr < len(cc_emails):
if cc_emails[ctr] in poi_email_list:
cc_poi = True
ctr += 1
return to poi, from poi, cc poi
[/python]
[python]
#!/usr/bin/python
import os
import sys
import zipfile
from poi_flag_email import poiFlagEmail, getToFromStrings
data_dict = {}
with zipfile.ZipFile('emails.zip', "r") as z:
z.extractall()
for email_message in os.listdir("emails"):
if email_message == ".DS_Store":
continue
message = open(os.getcwd()+"/emails/"+email_message, "r")
to_addresses, from_addresses, cc_addresses = getToFromStrings(message)
to_poi, from_poi, cc_poi = poiFlagEmail(message)
for recipient in to_addresses:
if recipient not in data_dict:
data_dict[recipient] = {"from_poi_to_this_person":0}
if from_poi:
data_dict[recipient]["from_poi_to_this_person"] += 1
message.close()
for item in data_dict:
print item, data_dict[item]
def submitData():
return data dict
[/python]