I am trying to explore the enron email dataset using python Jupyter notebook. But I am getting this attribute error. I am trying to read the emails and convert them into csv format so that I can further apply Ml for sentiment analysis.
import tarfile
import re
from datetime import datetime
from collections import namedtuple, Counter
import pandas as pd
import altair as alt
tar =tarfile.open(r"C:\Users\nikip\Documents\2021\Interview Preparation\sentiment analysis\enron_mail_20150507.tar.gz", "r")
items = tar.getmembers()
Email = namedtuple('Email', 'Date, From, To, Subject, Cc, Bcc, Message')
def get_msg(item_number):
f = tar.extractfile(items[item_number])
try:
date = from_ = to = subject = cc= bcc = message= ''
in_to = False
in_message = False
to = []
message = []
item = f.read().decode()
item = item.replace('\r', '').replace('\t', '')
lines = item.split('\n')
for num, line in enumerate(lines):
if line.startswith('Date:') and not date:
date = datetime.strptime(' '.join(line.split('Date: ')[1].split()[:-2]), '%a, %d %b %Y %H:%M:%S')
elif line.startswith('From:') and not from_:
from_ = line.replace('From:', '').strip()
elif line.startswith('To:')and not to:
in_to = True
to = line.replace('To:', '').replace(',', '').replace(',', '').split()
elif line.startswith('Subject:') and not subject:
in_to = False
subject = line.replace('Subject:', '').strip()
elif line.startswith('Cc:') and not cc:
cc = line.replace('Cc:', '').replace(',', '').replace(',', '').split()
elif line.startswith('Bcc:') and not bcc:
bcc = line.replace('Bcc:', '').replace(',', '').replace(',', '').split()
elif in_to:
to.extend(line.replace(',', '').split())
elif line.statswith('Subject:') and not subject:
in_to =False
elif line.startswith('X-FileName'):
in_message = True
elif in_message:
message.append(line)
to = '; '.join(to).strip()
cc = '; '.join(cc).strip()
bcc = '; '.join(bcc).strip()
message = ' '.join(message).strip()
email = Email(date, from_, to, subject, cc, bcc, message)
return email
except Exception as e:
return e
msg = get_msg(3002)
msg.date
I am getting error message like below:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-11-e1439579a8e7> in <module>
----> 1 msg.To
AttributeError: 'AttributeError' object has no attribute 'To'
Can someone help ?thanks in advance
The problem is that you are return an exception in your get_msg function, which broadly looks like this:
def get_msg(item_number):
try:
...do some stuff...
except Exception as e:
return e
It looks like you're triggering an AttributeError exception somewhere in your code, and you're returning that exception, rather than an Email object.
You almost never want to have an except statement that suppresses all exceptions like that, because it will hide errors in your code (as we see here). It is generally better practice to catch specific exceptions, or at least log the error if your code will continue despite the exception.
As a first step, I would suggest removing the entire try/except block and get your code working without it.
Related
I am working to extract issues data from a repo on Github using Github3.py.
The following is a part of my code to extract issues from a repo:
I used these libraries in the main code:
from github3 import login
from mysql.connector import IntegrityError
import config as cfg
import project_list
from github3.exceptions import NotFoundError
from github3.exceptions import GitHubException
import datetime
from database import Database
import sys
import re
import time
Then the main code is:
DEBUG = False
def process(url, start):
re_pattern = re.compile(u'[^\u0000-\uD7FF\uE000-\uFFFF]', re.UNICODE)
splitted = url.split("/")
org_name = splitted[3]
repo_name = splitted[4]
while True:
try:
gh = login(token = cfg.TOKEN)
repo = gh.repository(org_name, repo_name)
print("{} =====================".format(repo))
if start is None:
i = 1
else:
i = int(start)
if start is None:
j = 1
else:
j = int(start)
Database.connect()
while True:
try:
issue = repo.issue(i)
issue_id = issue.id
issue_number = issue.number
status_issue = str(issue.state)
close_author = str(issue.closed_by)
com_count = issue.comments_count
title = re_pattern.sub(u'\uFFFD', issue.title)
created_at = issue.created_at
closed_at = issue.closed_at
now = datetime.datetime.now()
reporter = str(issue.user)
body_text = issue.body_text
body_html = issue.body_html
if body_text is None:
body_text = ""
if body_html is None:
body_html = ""
body_text = re_pattern.sub(u'\uFFFD', body_text)
body_html = re_pattern.sub(u'\uFFFD', body_html)
Database.insert_issues(issue_id, issue_number, repo_name,status_issue , close_author, com_count, title, reporter, created_at, closed_at, now, body_text, body_html)
print("{} inserted.".format(issue_id))
if DEBUG == True:
break;
except NotFoundError as e:
print("Exception # {}: {}".format(i, str(e)))
except IntegrityError as e:
print("Data was there # {}".format(str(e)))
i += 1
j += 1
except GitHubException as e:
print("Exception: {}".format(str(e)))
time.sleep(1000)
i -= 1
j -= 1
if __name__ == "__main__":
if len(sys.argv) == 1:
sys.exit("Please specify project name: python issue-github3.py <project name>")
if len(sys.argv) == 2:
start = None
print("Start from the beginning")
else:
start = sys.argv[2]
project = sys.argv[1]
url = project_list.get_project(project)
process(url, start)
With the above code, everything is ok for me and I can extract issues from a repo on GitHub.
Problem: Exception: 410 Issues are disabled for this repo occurs after 100 successful issues extraction from a repo.
How could I solve this problem?
As mentioned in the main code, I fixed the exception 404 (i.e., Not found issues) with the library of from github3.exceptions import NotFoundError and the below code:
except NotFoundError as e:
print("Exception # {}: {}".format(i, str(e)))
Given the main code, what library and code should I use to fix exception 410?
I found an easy way to fix it but it doesn't solve the problem completely.
As I mentioned before, the exception occurs after 100 successful issue number (i.e., issue number of 101 is a problem), and as #LhasaDad said above, there is no issue number of 101 in the repo (I checked it manually). So we just need to put 102 instead of None where start = None, then execute the code again.
I'm using the following code (got it from StackOverflow :)) to get all unread e-mail from specific email adresses. It works perfect!
I would however like to get the actual recived (or sent) date for each e-mail I'm getting an attached file from. But I dont know how to do that?
import email
import imaplib
import os
import sys
import random
import string
import glob
import unicodedata
def remove_accents(s):
nkfd_form = unicodedata.normalize('NFKD', s)
return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
def remove_non_ascii(text):
return unidecode(unicode(text, encoding = "cp865"))
def replace_non_ascii(x): return ''.join(i if ord(i) < 128 else '_' for i in x)
detach_dir = r'\\1.1.1.1\xxx\xxx\drop_folder'
try:
imapSession = imaplib.IMAP4_SSL('outlook.office365.com')
typ, accountDetails = imapSession.login('xxxx', 'xxxx')
if typ != 'OK':
print ('Not able to sign in!')
raise
imapSession.select('Inbox')
typ, data = imapSession.search(None, '(UNSEEN FROM "#xxxx.xxx")')
if typ != 'OK':
print ('Error searching Inbox.')
raise
# Iterating over all emails
for msgId in data[0].split():
typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
if typ != 'OK':
print ('Error fetching mail.')
raise
emailBody = messageParts[0][1]
mail = email.message_from_string(emailBody)
for part in mail.walk():
if part.get_content_maintype() == 'multipart':
# print part.as_string()
continue
if part.get('Content-Disposition') is None:
# print part.as_string()
continue
fileName = part.get_filename().encode('utf-8')
if bool(fileName):
filePath = os.path.join(detach_dir, 'EXP-' + fileName + '.xls')
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
imapSession.close()
imapSession.logout()
except :
print ('Not able to download all attachments.')
You could fetch the INTERNALDATE instead of, or in addition to the RFC822 item; It is (generally) time the server received the message.
You will have to do some parsing of the return item, since imaplib does no parsing of FETCH results. It will be easier to parse if it's the only thing you fetch.
The response will look something like
* 5 FETCH (INTERNALDATE "17-Jul-2018 02:44:25 -0700")
I have this code to download emails and save the text body. It saves the text body as a text file using the text in the subject field as the name of the file. The subject SHOULD basically be a student number + school week number, like:
1234567891week8
pathToFiles = '/home/pedro/getEmailtexts/emailTexts17BE/'
server = IMAPClient(HOST, use_uid=True, ssl=True)
server.login(USERNAME, PASSWORD)
select_info = server.select_folder('Inbox')
unseenMessages = server.search(['UNSEEN'])
print('Number of unseen messages is ' + str(len(unseenMessages)))
for uid, message_data in server.fetch(unseenMessages, 'RFC822').items():
email_message = email.message_from_bytes(message_data[b'RFC822'])
print(' message UID is ' + str(uid))
print(email_message.get('Subject'))
messageSubject = email_message.get('Subject')
file = messageSubject + '.txt'
theFile = open(pathToFiles + file, 'w')
rawMessage = server.fetch(unseenMessages, ['BODY[]', 'FLAGS'])
message = pyzmail.PyzMessage.factory(rawMessage[uid][b'BODY[]'])
text = message.text_part.get_payload().decode(message.text_part.charset)
saveText = text.rstrip()
theFile.write(saveText)
theFile.close()
However some students, or their email programs, are putting something weird in the subject field, causing a breakdown. Here is a sample output from my bash terminal:
1725010108week8
message UID is 33
1725010135week8
message UID is 34
1725010126 week8
message UID is 35
������������������1725010118week8
Traceback (most recent call last):
File "./getAnswersFromEmail17BEv2.py", line 45, in <module>
file = messageSubject + '.txt'
TypeError: unsupported operand type(s) for +: 'Header' and 'str'
pedro#pedro-newssd:~/getEmailtexts/python$
I have to go to the email, delete the offending email, and start again.
I think I may be able to insert a try ... except ... in there somehow, but I can't see exactly how. Or maybe there is some other way to deal with a dodgy subject
Do you have any tips for an amateur on how to get round this? How to make the program go to the next email on this kind of error?
My first suggestion would be converting the messageSubject variable to a string.
i.e
file = str(messageSubject) + '.txt'
To use the try except clause, the following snippet will allow the code to move onto the next email if it cannot create the file variable for the email
try:
file = messageSubject + '.txt'
except TypeError:
continue
# Alexis Lucattini: Thank you very much!
This solved crazy subject problems:
messageSubject = str(email_message.get('Subject'))
Then I got " AttributeError: 'NoneType' object has no attribute 'get_payload'"
In the end, I used both suggestions.
At least 1 student must have sent an empty email, which caused problems like this:
UID is 421
1825010336week8
Message subject is 1825010336week8
UID is 424
1825010334Week
Message subject is 1825010334Week
UID is 425
=?gb18030?B?MTgyNTAxMDIzNyDA7s7Ex78=?=
Message subject is =?gb18030?B?MTgyNTAxMDIzNyDA7s7Ex78=?=
UID is 426
=?gb18030?B?ufnT7ubDIDE4MjUwMTAyNDQ=?=
Message subject is =?gb18030?B?ufnT7ubDIDE4MjUwMTAyNDQ=?=
UID is 430
=?gb18030?B?MTgyNTAxMDExMyxBLEIsQSxCLEEsQyxELEcsQSxD?=
=?gb18030?B?LEYsSCxBLEQsQixBLEgsRyxDLEEsQSxCLEEsQSxC?=
=?gb18030?B?LEIsRixELEosRSxBLEcsQyxILEmBMIQyCgoK?=
Message subject is =?gb18030?B?MTgyNTAxMDExMyxBLEIsQSxCLEEsQyxELEcsQSxD?=
=?gb18030?B?LEYsSCxBLEQsQixBLEgsRyxDLEEsQSxCLEEsQSxC?=
=?gb18030?B?LEIsRixELEosRSxBLEcsQyxILEmBMIQyCgoK?=
UID is 431
=?gb18030?B?MTgyNTAxMDEzMSxBLEIsQSxCLEEsQyxELEcsQSxD?=
=?gb18030?B?LEYsSCxBLEQsQixBLEgsRyxDLEEsQSxCLEEsQSxC?=
=?gb18030?B?LEIsRixELEosRSxBLEksQyxILEcKCgo=?=
Message subject is =?gb18030?B?MTgyNTAxMDEzMSxBLEIsQSxCLEEsQyxELEcsQSxD?=
=?gb18030?B?LEYsSCxBLEQsQixBLEgsRyxDLEEsQSxCLEEsQSxC?=
=?gb18030?B?LEIsRixELEosRSxBLEksQyxILEcKCgo=?=
UID is 432
1825010207week8
Message subject is 1825010207week8
UID is 434
������������������1825010136week7
Message subject is ������������������1825010136week7
Traceback (most recent call last):
File "./getAnswersFromEmail18BEv2.py", line 52, in <module>
text = message.text_part.get_payload().decode(message.text_part.charset)
AttributeError: 'NoneType' object has no attribute 'get_payload'
pedro#pedro-newssd:~/getEmailtexts/python$ ^C
pedro#pedro-newssd:~/getEmailtexts/python$
So I put a try except in for that error:
for uid, message_data in server.fetch(unseenMessages, 'RFC822').items():
email_message = email.message_from_bytes(message_data[b'RFC822'])
print('UID is ' + str(uid))
print(email_message.get('Subject'))
messageSubject = str(email_message.get('Subject'))
print('Message subject is ' + messageSubject)
if messageSubject == None:
messageSubject = 'idiot'
file = messageSubject + '.txt'
theFile = open(pathToFiles + file, 'w')
rawMessage = server.fetch(unseenMessages, ['BODY[]', 'FLAGS'])
try:
message = pyzmail.PyzMessage.factory(rawMessage[uid][b'BODY[]'])
text = message.text_part.get_payload().decode(message.text_part.charset)
saveText = text.rstrip()
theFile.write(saveText)
theFile.close()
except AttributeError:
continue
After that, all emails downloaded without a problem.
That is good, because now I can use this routine and the routine which marks the answers and writes the scores to an Excel file all together in 1 program.
Thanks for the advice!
I wrote a script to send an email if the values match a certain criteria. I'm wanting to send 1 email instead of multiple emails upon every check. I thought I can mitigate by throwing in another function but I can't figure out how to do it. Any ideas on how to accomplish this?
import csv, requests, xmltodict, smtplib, email.utils
from email.mime.text import MIMEText
def sendEmail(host, value, devicename):
# Create the message
msg = MIMEText('This is the body of the message.')
msg['To'] = email.utils.formataddr(('Recipient', 'XXXXXX'))
msg['From'] = email.utils.formataddr(('Author', 'XXXXX'))
msg['Subject'] = 'Simple test message'
server = smtplib.SMTP('XXXXXXX')
server.set_debuglevel(True) # show communication with the server
try:
server.sendmail('XXXXXX', ['XXXXXX'], msg.as_string())
finally:
server.quit()
def check(hostIP, value):
xml = """<?xml version="1.0" encoding="iso-8859-1"?>"""
headers = {'Content-Type': 'application/xml'}
response = requests.post('http://' + hostIP + '/RPC2', data=xml, headers=headers).text
doc = xmltodict.parse(response)
uptime = str(doc['response'])
maxtime = '300'
time = str(uptimeValue)
day = time // (24 * 3600)
if day >= maxtime:
print 'it is'
sendEmail(str(hostIP), str(value), str(devicename))
else:
print "it is not!"
def main():
try:
with open('list.csv', 'r') as file:
reader = csv.DictReader(file)
for row in reader:
check(row['Host'], row['Value'])
except Exception as error:
print ValueError("Could not properly read the csv file")
sys.exit(0)
if __name__ == "__main__":
main()
Replace
def sendEmail(message):
and
if day >= maxtime:
print 'it is'
return (str(hostIP), str(value), str(devicename))
and
device_list = []
for row in reader:
result = check(row['Host'], row['Value'])
if result:
device_list.append(', '.join(result))
sendEmail('\n'.join(device_list))
Instead of using check to send the email, use it to add an item to go into the message body, probably in an array or dictionary. Then, after you are done processing your file, you can use that collected information to build the body of your email to give to a single call to sendEmail.
That means you don't call sendEmail from inside of check; you call it from main once you are done with the file.
Alright so I load the email in from gmail with imaplib and then when I'm trying to parse the email it does not separate anything out in a usable format. I suspect this is because somewhere in the process '<' or '>' are being added to the raw email.
Here is what the debugger is showing me after I have called the method:
As you can see it hasn't really parsed anything into a usable format.
Here is the code I'm using: (NOTE: the .replace('>', '') seems to have no effect on the end result.)
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login('myEmail#gmail.com', 'password')
mail.list()
mail.select('inbox')
typ, data = mail.search(None, 'ALL')
ids = data[0]
id_list = ids.split()
# get the most recent email id
latest_email_id = int( id_list[-1] )
# iterate through 15 messages in descending order starting with latest_email_id
# the '-1' dictates reverse looping order
for i in range( latest_email_id -10, latest_email_id-15, -1 ):
typ, data = mail.fetch( str(i), '(RFC822)' )
for response_part in data:
if isinstance(response_part, tuple):
msg = str(response_part[1]).replace('<', '')
msg = msg.replace('>', '')
msg = email.message_from_string(msg)
#msg = feedparser.parse(response_part[1])
varSubject = msg['subject']
varFrom = msg['from']
python email.message_from_string() parse problems and Parsing email with Python both had very similar and identical problems to me (I think) and they solved it by altering their email, however I'm reading my email straight from Google's servers so I'm not sure exactly what to do to the email to fix it up since removing all '<' and '>' obviously won't work.
So, how do I fix the email that is read from imaplib so that it can be easily read with email.message_from_string()? (Or any other improvements/possible solutions as I'm not 100% certain the '<' and '>' are actually the problem, I'm only guessing based off of those other questions asked.)
Cheers
You shouldn't parse <, > and data between them - it is like parsing HTML, but much more complicated. There are existing solutions to do it.
Here is my code that can read mail with attachments, extract data that can be used for further use and process it to human and code readable format. As you can see, all tasks are being made by third-party modules.
from datetime import datetime
import imaplib
import email
import html2text
from os import path
class MailClient(object):
def __init__(self):
self.m = imaplib.IMAP4_SSL('your.server.com')
self.Login()
def Login(self):
result, data = self.m.login('login#domain.com', 'p#s$w0rd')
if result != 'OK':
raise Exception("Error connecting to mailbox: {}".format(data))
def ReadLatest(self, delete = True):
result, data = self.m.select("inbox")
if result != 'OK':
raise Exception("Error reading inbox: {}".format(data))
if data == ['0']:
return None
latest = data[0].split()[-1]
result, data = self.m.fetch(latest, "(RFC822)")
if result != 'OK':
raise Exception("Error reading email: {}".format(data))
if delete:
self.m.store(latest, '+FLAGS', '\\Deleted')
message = email.message_from_string(data[0][1])
res = {
'From' : email.utils.parseaddr(message['From'])[1],
'From name' : email.utils.parseaddr(message['From'])[0],
'Time' : datetime.fromtimestamp(email.utils.mktime_tz(email.utils.parsedate_tz(message['Date']))),
'To' : message['To'],
'Subject' : email.Header.decode_header(message["Subject"])[0][0],
'Text' : '',
'File' : None
}
for part in message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get_content_maintype() == 'text':
# reading as HTML (not plain text)
_html = part.get_payload(decode = True)
res['Text'] = html2text.html2text(_html)
elif part.get_content_maintype() == 'application' and part.get_filename():
fname = path.join("your/folder", part.get_filename())
attachment = open(fname, 'wb')
attachment.write(part.get_payload(decode = True))
attachment.close()
if res['File']:
res['File'].append(fname)
else:
res['File'] = [fname]
return res
def __del__(self):
self.m.close()