Python3 reading body text from gmail - python

I'm trying to use Python 3 to parse an email and output the contents of its body as a list of strings. The contents of the body always follow this pattern:
string \n string \n string \n etc.
The error I currently get is: "initial_value must be str or None, not bytes".
import imaplib
import email
import time
import smtplib
# Account and server settings. Despite the smtp_ prefix, these two values are
# the IMAP host and port that the script connects to.
from_email = "someemail#gmail.com"
from_pwd = "somepass"
smtp_server = "imap.gmail.com"
smtp_port= 993
# Intended to log in over IMAP, fetch the most recent message in the inbox and
# return its payload. As written it fails on the parsing step (see the notes
# below), and the broad except clause only prints the error.
def readmail(from_email,from_pwd,smtp_server,smtp_port ):
try:
# NOTE(review): smtp_port is accepted but never passed to IMAP4_SSL.
mail = imaplib.IMAP4_SSL(smtp_server)
mail.login(from_email,from_pwd)
mail.select('inbox')
result, data = mail.search(None, 'ALL')
# data[0] is split on whitespace to get the individual message ids.
mail_ids = data[0]
id_list = mail_ids.split()
# first_email_id is assigned but not used again in this function.
first_email_id =id_list[0]
latest_email_id = id_list[-1] #most recent email
result,data = mail.fetch(latest_email_id, "(RFC822)")
raw_email = data[0][1]
#read the email
# raw_email is bytes, but message_from_string() expects str; this is the call
# that raises "initial_value must be str or None, not bytes".
email_message = email.message_from_string(raw_email)
# email_message_instance is never assigned in this function (the parsed
# message is bound to email_message), so this line would raise NameError.
return email_message_instance.get_payload()
# Broad catch: the exception is printed and no value is returned explicitly.
except Exception as e:
print(e)
print(readmail(from_email,from_pwd,smtp_server,smtp_port))
I know the error lies after the "#read the email" comments because I was able to print out the raw_email

email.message_from_string() expects a str argument, but raw_email is a bytes object. Use email.message_from_bytes() instead to fix this issue.
Alternatively, you can convert raw_email to a string as follows:
mail_content = raw_email.decode('utf-8')
and pass mail_content in email.message_from_string()

Related

How can I read the mail body of a mail with Python?

Logging in and reading the subject works, but an error occurs when reading the body. What is causing the error? In the answers I found online, the problem was always in the part "email.message_from_bytes(data[0][1].decode())", but I think that part of my code is correct.
# Script: log in over IMAP, fetch up to five unseen messages by UID, decode
# the From/Subject headers and try to extract the plain-text body.
# Connection settings
HOST = 'imap.host'
USERNAME = 'name#domain.com'
PASSWORD = 'password'
m = imaplib.IMAP4_SSL(HOST, 993)
m.login(USERNAME, PASSWORD)
m.select('INBOX')
result, data = m.uid('search', None, "UNSEEN")
if result == 'OK':
# Only the first five UIDs returned by the search are processed.
for num in data[0].split()[:5]:
result, data = m.uid('fetch', num, '(RFC822)')
if result == 'OK':
# The raw bytes are parsed into a Message object here.
email_message_raw = email.message_from_bytes(data[0][1])
# NOTE(review): make_header/decode_header are called unqualified here but as
# email.header.* two lines below; the unqualified names need an import that
# this snippet does not show.
email_from = str(make_header(decode_header(email_message_raw['From'])))
# by Edward Chapman -> https://stackoverflow.com/questions/7314942/python-imaplib-to-get-gmail-inbox-subjects-titles-and-sender-name
subject = str(email.header.make_header(email.header.decode_header(email_message_raw['Subject'])))
# content = email_message_raw.get_payload(decode=True)
# by Todor Minakov -> https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
# email_message_raw is already a Message, not a str, so this second parse is
# what raises "TypeError: initial_value must be str or None, not Message".
b = email.message_from_string(email_message_raw)
body = ""
if b.is_multipart():
for part in b.walk():
ctype = part.get_content_type()
cdispo = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments
if ctype == 'text/plain' and 'attachment' not in cdispo:
body = part.get_payload(decode=True) # decode
break
# not multipart - i.e. plain text, no attachments, keeping fingers crossed
else:
body = b.get_payload(decode=True)
m.close()
m.logout()
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# the lines below run once per message or once after the loop - confirm.
txt = body
regarding = subject
print("###########################################################")
print(regarding)
print("###########################################################")
print(txt)
print("###########################################################")
Error message:
TypeError: initial_value must be str or None, not Message
Thanks for the comments and reply
You have everything in place. Just have to understand a few concepts.
"email" library allows you to convert typical email bytes into an easily usable object called Message using its parser APIs, such as message_from_bytes(), message_from_string(), etc.
This kind of error is typically caused by passing the wrong input type to one of these parser functions, for example:
email.message_from_bytes(data[0][1].decode())
The function above, message_from_bytes, takes bytes as input, not str, so decoding data[0][1] before handing it to this parser is wrong as well as redundant.
In short, you are trying to parse the original email message twice: once with message_from_bytes(data[0][1]) and again with message_from_string(email_message_raw). The second call fails because email_message_raw is already a Message object, not a str. Get rid of one of them and you will be all set!
Try this approach:
import email
import email.header
import imaplib

# Script: log in over IMAP, fetch up to five unseen messages by UID and print
# the decoded subject and the plain-text body of each one.

# Connection settings
HOST = 'imap.host'
USERNAME = 'name#domain.com'
PASSWORD = 'password'

m = imaplib.IMAP4_SSL(HOST, 993)
m.login(USERNAME, PASSWORD)
m.select('INBOX')

try:
    result, data = m.uid('search', None, "UNSEEN")
    if result == 'OK':
        # Only the first five unseen UIDs are processed.
        for num in data[0].split()[:5]:
            result, fetched = m.uid('fetch', num, '(RFC822)')
            if result != 'OK':
                continue

            # Parse the raw bytes exactly once. The result is already a
            # Message object, so it must not be fed to message_from_string().
            email_message = email.message_from_bytes(fetched[0][1])

            # .get(..., '') keeps decode_header() from receiving None when a
            # header is missing.
            email_from = str(email.header.make_header(
                email.header.decode_header(email_message.get('From', ''))))
            # by Edward Chapman -> https://stackoverflow.com/questions/7314942/python-imaplib-to-get-gmail-inbox-subjects-titles-and-sender-name
            subject = str(email.header.make_header(
                email.header.decode_header(email_message.get('Subject', ''))))

            # by Todor Minakov -> https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
            # email_message is a Message object, which has many methods
            # (i.e. is_multipart(), walk(), etc.)
            b = email_message
            body = ""
            if b.is_multipart():
                for part in b.walk():
                    ctype = part.get_content_type()
                    cdispo = str(part.get('Content-Disposition'))
                    # skip any text/plain (txt) attachments
                    if ctype == 'text/plain' and 'attachment' not in cdispo:
                        # decode=True undoes the transfer encoding; the
                        # result is a bytes object
                        body = part.get_payload(decode=True)
                        break
            # not multipart - i.e. plain text, no attachments, keeping fingers crossed
            else:
                body = b.get_payload(decode=True)

            # Print inside the loop so every fetched message is shown and
            # nothing is referenced when the search returns no messages.
            txt = body
            regarding = subject
            print("###########################################################")
            print(regarding)
            print("###########################################################")
            print(txt)
            print("###########################################################")
finally:
    # Close the mailbox and log out exactly once, even if a fetch fails.
    m.close()
    m.logout()
from imap_tools import MailBox, AND

# Alternative using the third-party imap_tools package.
# get email bodies from INBOX
with MailBox('imap.mail.com').login('test#mail.com', 'password', 'INBOX') as mailbox:
    for msg in mailbox.fetch():
        # Use the plain-text body; fall back to the HTML body when the
        # plain-text one is empty.
        body = msg.text or msg.html
https://github.com/ikvk/imap_tools

Read body from raw email

How can I read the body of an email? It does not come out properly when I do it
this way.
I tried this:
# Script: log in to Gmail over IMAP, fetch every message in the inbox and
# print each parsed message in full.
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(email_user, email_pass)
mail.select('Inbox')
# NOTE(review): `type` shadows the Python builtin of the same name.
type, data = mail.search(None, 'ALL')
mail_ids = data[0]
# id_list is built but the loop below iterates data[0].split() directly.
id_list = mail_ids.split()
for num in data[0].split():
typ, data = mail.fetch(num, '(RFC822)' )
raw_email = data[0][1] # raw message bytes from the fetch response
# The bytes are turned into a str on this line, assuming UTF-8.
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# The subject is cut out of the stringified message: the text between
# "Subject: " and the following "\nTo:".
subject = str(email_message).split("Subject: ", 1)[1].split("\nTo:", 1)[0]
#body = str(email_message).split("body: ", 1)[1].split("\nTo:", 1)[0]
# Prints the whole message object, not just its body.
print(email_message);
its showing
If you simply want to parse the email and access the body, then consider using mail-parser. It's a simple mail-parser that takes as input a raw email and generates a parsed object.
import mailparser
# Four parsing entry points are listed; each line rebinds `mail`.
# NOTE(review): presumably only the one matching your input form is needed.
# f, fp, raw_mail and byte_mail are placeholders not defined in this snippet.
mail = mailparser.parse_from_file(f)
mail = mailparser.parse_from_file_obj(fp)
mail = mailparser.parse_from_string(raw_mail)
mail = mailparser.parse_from_bytes(byte_mail)
How to Use:
mail.body #use this to access the body contents
mail.to

IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode

After 5 hours of trying, time to get some help. Sifted through all the stackoverflow questions related to this but couldn't find the answer.
The code is a gmail parser - works for most emails but some emails cause the UnicodeDecodeError. The problem is "raw_email.decode('utf-8')" but changing it (see comments) causes a different problem down below.
# Source: https://stackoverflow.com/questions/7314942/python-imaplib-to-get-gmail-inbox-subjects-titles-and-sender-name
import datetime
import time
import email
import imaplib
import mailbox
from vars import *
import re # to remove links from str
import string
# Script: walk every message in the Gmail inbox and, for messages with one
# specific date, sender and subject, print the plain-text body as a word list.
EMAIL_ACCOUNT = 'gmail_login'
PASSWORD = 'gmail_psswd'
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(EMAIL_ACCOUNT, PASSWORD)
mail.list()
mail.select('inbox')
result, data = mail.uid('search', None, "ALL") # (ALL/UNSEEN)
id_list = data[0].split()
# The UID list is reversed before iterating (presumably to visit the newest
# messages first - confirm).
email_rev = reversed(id_list) # Returns a type list.reverseiterator, which is not list
email_list = list(email_rev)
i = len(email_list)
# todays_date is assigned but not used in the visible code.
todays_date = time.strftime("%m/%d/%Y")
for x in range(i):
latest_email_uid = email_list[x]
result, email_data = mail.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1] # Returns a byte
# This is the line that raises UnicodeDecodeError for messages whose raw
# bytes are not valid UTF-8.
raw_email_str = raw_email.decode('utf-8') # Returns a str
#raw_email_str = base64.b64decode(raw_email_str1) # Tried this but didn't work.
#raw_email_str = raw_email.decode('utf-8', errors='ignore') # Tried this but caused a TypeError down where var subject is created because something there is expecting a str or byte-like
email_message = email.message_from_string(raw_email_str)
# date_short is built as month/day/year from the parsed Date header, with no
# zero padding.
# NOTE(review): parsedate_tz() returns None for an unparseable date, which
# would make the indexing on the next line fail.
date_tuple = email.utils.parsedate_tz(email_message['Date'])
date_short = f'{date_tuple[1]}/{date_tuple[2]}/{date_tuple[0]}'
# Header Details
# Only messages dated 12/23/2019 are processed (hard-coded filter).
if date_short == '12/23/2019':
#if date_tuple:
# local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
# local_message_date = "%s" %(str(local_date.strftime("%a, %d %b %Y %H:%M:%S")))
email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
#print(subject)
if email_from.find('restaurants#uber.com') != -1:
print('yay')
# Body details
if email_from.find('restaurants#uber.com') != -1 and subject.find('Payment Summary') != -1:
for part in email_message.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
# Assumes the text/plain part is UTF-8 encoded.
body = body.decode("utf-8") # Convert byte to str
body = body.replace("\r\n", " ")
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body) # removes url links
# Strip punctuation, then split on non-word characters into a list of words.
text2 = text.translate(str.maketrans('', '', string.punctuation))
body_list = re.sub("[^\w]", " ", text2).split()
print(body_list)
print(date_short)
# NOTE(review): indentation was lost in this listing, so which `if` this
# `else` belongs to cannot be determined from the text - confirm.
else:
continue
Here is an example how to retrieve and read mail parts with imapclient and the email.* modules from the python standard libs:
from imapclient import IMAPClient
import email
from email import policy
def walk_parts(part, level=0):
    """Print the content type of *part* and, recursively, of all its sub-parts.

    Each nesting level is indented by four additional spaces, so the output
    shows the MIME tree of the message.

    Args:
        part: A message object providing ``get_content_type()``,
            ``is_multipart()`` and ``get_payload()``.
        level: Current nesting depth; callers normally leave it at 0.
    """
    print(' ' * 4 * level + part.get_content_type())
    # do something with part content (applies encoding by default)
    # part.get_content()
    if part.is_multipart():
        # For a multipart message get_payload() yields the sub-parts.
        # Recurse into this same function, using a separate loop name so the
        # `part` parameter is not shadowed.
        for subpart in part.get_payload():
            walk_parts(subpart, level + 1)
# context manager ensures the session is cleaned up
with IMAPClient(host="your_mail_host") as client:
client.login('user', 'password')
# select some folder
client.select_folder('INBOX')
# do something with folder, e.g. search & grab unseen mails
messages = client.search('UNSEEN')
for uid, message_data in client.fetch(messages, 'RFC822').items():
email_message = email.message_from_bytes(
message_data[b'RFC822'], policy=policy.default)
print(uid, email_message.get('From'), email_message.get('Subject'))
# alternatively search for specific mails
msgs = client.search(['SUBJECT', 'some subject'])
#
# do something with a specific mail:
#
# fetch a single mail with UID 12345
raw_mails = client.fetch([12345], 'RFC822')
# parse the mail (very expensive for big mails with attachments!)
mail = email.message_from_bytes(
raw_mails[12345][b'RFC822'], policy=policy.default)
# Now you have a python object representation of the mail and can dig
# into it. Since a mail can be composed of several subparts we have
# to walk the subparts.
# walk all parts at once
for part in mail.walk():
# do something with that part
print(part.get_content_type())
# or recurse yourself into sub parts until you find the interesting part
walk_parts(mail)
See the docs for email.message.EmailMessage. There you find all needed bits to read into a mail message.
use 'ISO 8859-1' instead of 'utf-8'
I had the same issue, and after a lot of research I realized that I simply needed to use the message_from_bytes function from the email module rather than message_from_string.
So for your code, simply replace:
raw_email_str = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_str)
to
email_message = email.message_from_bytes(raw_email)
should work like a charm :)

Python Email body content Base64 not able to decode

While I am trying to parse the body of an email, the body comes out like this:
VCBJTkZPUk1BVElPTjwvdGQ+Cgk8L3RyPgoJPHRyPjx0ZD4mbmJzcDs8L3RkPjwvdHI+Cgk8dHI+
And when I try to decode it separately, it works successfully
import base64

# Base64 text exactly as it appears in the mail body.
encoded_body = "VCBJTkZPUk1BVElPTjwvdGQ+Cgk8L3RyPgoJPHRyPjx0ZD4mbmJzcDs8L3RkPjwvdHI+Cgk8dHI+"
# Decode it on its own and show the resulting bytes.
decoded_body = base64.b64decode(encoded_body)
print(decoded_body)
Output:
b'T INFORMATION</td>\n\t</tr>\n\t<tr><td> </td></tr>\n\t<tr>'
But when I try the same thing in my mail parsing script, it doesn't work:
# Script: fetch every message in the Gmail inbox and Base64-decode what each
# fetch returns. Any exception is caught and printed.
try:
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(FROM_EMAIL,FROM_PWD)
mail.select('inbox')
# NOTE(review): `type` shadows the Python builtin of the same name.
type, data = mail.search(None, 'ALL')
mail_ids = data[0]
id_list = mail_ids.split()
# The first and last ids are only printed; the loop below visits every id.
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
print(id_list)
print(first_email_id)
print(latest_email_id)
for i in data[0].decode().split(' '):
print(i)
typ, data = mail.fetch(i, '(RFC822)' )
data=(data[0][1])
# NOTE(review): `data` here is the whole RFC822 message as fetched, not just
# the Base64-encoded body part, so b64decode() is applied to headers as well.
# That is the likely reason the printed output is unreadable.
print(base64.b64decode(data))
except Exception as e:
print(str(e))
The output is getting as follows:
b"\r\xe9b\xbd\xea\xdeu:-\xa2|\xa9\xae\x8b^rH&j)\\\"
Is there any way to decode this ?

TypeError: initial_value must be str or none, not bytes in python 3?

Here is my code:
import imaplib
from email.parser import HeaderParser
# Script: fetch only the header block of message 1 and parse it with
# HeaderParser.
conn = imaplib.IMAP4_SSL('imap.gmail.com')
conn.login('example#gmail.com', 'password')
conn.select()
conn.search(None, 'ALL')
# The whole return value of fetch() is kept in `data`, which is why the
# payload is reached through data[1][0][1] below.
data = conn.fetch('1', '(BODY[HEADER])')
header_data = data[1][0][1]
parser = HeaderParser()
# header_data is bytes while parsestr() expects str; this is the call that
# raises "TypeError: initial_value must be str or none, not bytes".
msg = parser.parsestr(header_data)
From this i get the error message:
TypeError: initial_value must be str or none, not bytes
I'm using Python 3, which apparently decodes automatically. So why am I still getting this error message?
You can try:
header_data = data[1][0][1].decode('utf-8')
I would suggest doing this (Python 3):
# NOTE(review): this snippet uses the `email` module, so `import email` is
# needed in addition to imaplib.
typ, data = conn.fetch('1', '(RFC822)') # will read the first email
email_content = data[0][1]
msg = email.message_from_bytes(email_content) # parses the raw bytes directly, so no .decode() call is needed
# Headers are read from the parsed message by name.
emailDate = msg["Date"]
emaiSubject = msg["Subject"]
You probably want to use a BytesHeaderParser in this case.

Categories