IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode

IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode - python

After 5 hours of trying, time to get some help. Sifted through all the stackoverflow questions related to this but couldn't find the answer.
The code is a gmail parser - works for most emails but some emails cause the UnicodeDecodeError. The problem is "raw_email.decode('utf-8')" but changing it (see comments) causes a different problem down below.
# Source: https://stackoverflow.com/questions/7314942/python-imaplib-to-get-gmail-inbox-subjects-titles-and-sender-name
import datetime
import time
import email
import imaplib
import mailbox
from vars import *
import re # to remove links from str
import string
EMAIL_ACCOUNT = 'gmail_login'
PASSWORD = 'gmail_psswd'
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(EMAIL_ACCOUNT, PASSWORD)
mail.list()
mail.select('inbox')
result, data = mail.uid('search', None, "ALL") # (ALL/UNSEEN)
id_list = data[0].split()
email_rev = reversed(id_list) # Returns a type list.reverseiterator, which is not list
email_list = list(email_rev)
i = len(email_list)
todays_date = time.strftime("%m/%d/%Y")
for x in range(i):
latest_email_uid = email_list[x]
result, email_data = mail.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1] # Returns a byte
raw_email_str = raw_email.decode('utf-8') # Returns a str
#raw_email_str = base64.b64decode(raw_email_str1) # Tried this but didn't work.
#raw_email_str = raw_email.decode('utf-8', errors='ignore') # Tried this but caused a TypeError down where var subject is created because something there is expecting a str or byte-like
email_message = email.message_from_string(raw_email_str)
date_tuple = email.utils.parsedate_tz(email_message['Date'])
date_short = f'{date_tuple[1]}/{date_tuple[2]}/{date_tuple[0]}'
# Header Details
if date_short == '12/23/2019':
#if date_tuple:
# local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
# local_message_date = "%s" %(str(local_date.strftime("%a, %d %b %Y %H:%M:%S")))
email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
#print(subject)
if email_from.find('restaurants#uber.com') != -1:
print('yay')
# Body details
if email_from.find('restaurants#uber.com') != -1 and subject.find('Payment Summary') != -1:
for part in email_message.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
body = body.decode("utf-8") # Convert byte to str
body = body.replace("\r\n", " ")
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body) # removes url links
text2 = text.translate(str.maketrans('', '', string.punctuation))
body_list = re.sub("[^\w]", " ", text2).split()
print(body_list)
print(date_short)
else:
continue

Here is an example how to retrieve and read mail parts with imapclient and the email.* modules from the python standard libs:
from imapclient import IMAPClient
import email
from email import policy
def walk_parts(part, level=0):
print(' ' * 4 * level + part.get_content_type())
# do something with part content (applies encoding by default)
# part.get_content()
if part.is_multipart():
for part in part.get_payload():
get_parts(part, level + 1)
# context manager ensures the session is cleaned up
with IMAPClient(host="your_mail_host") as client:
client.login('user', 'password')
# select some folder
client.select_folder('INBOX')
# do something with folder, e.g. search & grab unseen mails
messages = client.search('UNSEEN')
for uid, message_data in client.fetch(messages, 'RFC822').items():
email_message = email.message_from_bytes(
message_data[b'RFC822'], policy=policy.default)
print(uid, email_message.get('From'), email_message.get('Subject'))
# alternatively search for specific mails
msgs = client.search(['SUBJECT', 'some subject'])
#
# do something with a specific mail:
#
# fetch a single mail with UID 12345
raw_mails = client.fetch([12345], 'RFC822')
# parse the mail (very expensive for big mails with attachments!)
mail = email.message_from_bytes(
raw_mails[12345][b'RFC822'], policy=policy.default)
# Now you have a python object representation of the mail and can dig
# into it. Since a mail can be composed of several subparts we have
# to walk the subparts.
# walk all parts at once
for part in mail.walk():
# do something with that part
print(part.get_content_type())
# or recurse yourself into sub parts until you find the interesting part
walk_parts(mail)
See the docs for email.message.EmailMessage. There you find all needed bits to read into a mail message.

use 'ISO 8859-1' instead of 'utf-8'

I had the same issue And after a lot of research I realized that I simply need to use, message_from_bytes function from email rather than using message_from_string
so for your code simply replace:
raw_email_str = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_str)
to
email_message = email.message_from_bytes(raw_email)
should work like a charm :)

Related

python decode email from base64

hello iam using python script to fetch a message from a specific address mail seems everything work fine but i have a problem with the printable result is a base64 code.
i want to decode the result to get the decode message when do the final result with print, pls help!!
already thanks
the code used.
# Importing libraries
import imaplib, email
user = 'USER_EMAIL_ADDRESS'
password = 'USER_PASSWORD'
imap_url = 'imap.gmail.com'
# Function to get email content part i.e its body part
def get_body(msg):
if msg.is_multipart():
return get_body(msg.get_payload(0))
else:
return msg.get_payload(None, True)
# Function to search for a key value pair
def search(key, value, con):
result, data = con.search(None, key, '"{}"'.format(value))
return data
# Function to get the list of emails under this label
def get_emails(result_bytes):
msgs = [] # all the email data are pushed inside an array
for num in result_bytes[0].split():
typ, data = con.fetch(num, 'BODY.PEEK[1]')
msgs.append(data)
return msgs
# this is done to make SSL connnection with GMAIL
con = imaplib.IMAP4_SSL(imap_url)
# logging the user in
con.login(user, password)
# calling function to check for email under this label
con.select('Inbox')
# fetching emails from this user "tu**h*****1#gmail.com"
msgs = get_emails(search('FROM', 'MY_ANOTHER_GMAIL_ADDRESS', con))
# Uncomment this to see what actually comes as data
# print(msgs)
# Finding the required content from our msgs
# User can make custom changes in this part to
# fetch the required content he / she needs
# printing them by the order they are displayed in your gmail
for msg in msgs[::-1]:
for sent in msg:
if type(sent) is tuple:
# encoding set as utf-8
content = str(sent[1], 'utf-8')
data = str(content)
# Handling errors related to unicodenecode
try:
indexstart = data.find("ltr")
data2 = data[indexstart + 5: len(data)]
indexend = data2.find("</div>")
# printtng the required content which we need
# to extract from our email i.e our body
print(data2[0: indexend])
except UnicodeEncodeError as e:
pass
THE RESULT PRINTED
'''
aGVsbG8gd29yZCBpYW0gdGhlIG1lc3NhZ2UgZnJvbSBnbWFpbA==
'''

You could just use the base64 module to decode base64 encoded strings:
import base64
your_string="aGVsbG8gV29ybGQ==" # the base64 encoded string you need to decode
result = base64.b64decode(your_string.encode("utf8")).decode("utf8")
print(result)
Edit: encoding changed from ASCII to utf-8

If you need to find all encoded places (can be Subject, From, To email addresses with names), the code below might be useful. Given contentData is the entire email,
import re, base64
encodedParts=re.findall('(=\?(.+)\?B\?(.+)\?=)', contentData)
for part in encodedParts:
encodedPart = part[0]
charset = part[1]
encodedContent = part[2]
contentData = contentData.replace(encodedPart, base64.b64decode(encodedContent).decode(charset))

Spaces replaced by =20 after extracting text from email

I tried to get the text of a received gmail, using the email and imaplib modules in python. After decoding with utf-8 and after getting the payload of the message, all the spaces are still replaced by =20. Can I use another decoding step in order to fix this?
The code is the following: (I got it from a youtube tutorial - https://youtu.be/Jt8LizzxkPU )
``
import email
import imaplib
username = "abc"
password = "123"
mail = imaplib.IMAP4_SSL("imap.gmail.com")
mail.login(username,password)
mail.select("inbox")
result, data = mail.uid("search", None,"ALL")
inbox_item_list = data[0].split()
for item in inbox_item_list:
#most_recent = inbox_item_list[-1]
#oldest = inbox_item_list[0]
result2, email_data = mail.uid('fetch',item,'(RFC822)')
raw_email = email_data[0][1].decode("utf-8")
email_message = email.message_from_string(raw_email)
to_ = email_message['To']
from_ = email_message['From']
subject_ = email_message['Subject']
counter = 1
for part in email_message.walk():
if part.get_content_maintype() == "multipart":
continue
filename = part.get_filename()
if not filename:
ext = ".html"
filename = "msg-part-%08d%s" %(counter, ext)
counter += 1
#save file
content_type = part.get_content_type()
print(subject_)
print (content_type)
if "plain" in content_type:
print(part.get_payload())
elif "html" in content_type:
print("do some beautiful soup")
else:
print(content_type)
``

Try to import quopri, and then when you get the content of the email body (or whatever text that has the =20s inside), you can use quopri.decodestring()
I do it like this
quopri.decodestring(part.get_payload())
But do keep in mind that this is if you quite specifically want to decode from quoted-printable. Normally I would say the answer of #jfs is neater.

Here's a complete code example of how a simple email (that contains both a literal =20 as well as =20 sequence that should be replaced by a space) could be decoded:
#!/usr/bin/env python3
import email.policy
email_text = """Subject: =?UTF-8?B?dGVzdCDwn5OnID0yMA==?=
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo=
oooooooooooooooooooooooooooooong=20word
=3D20
^ line starts with =3D20
emoji: <=F0=9F=93=A7>"""
msg = email.message_from_string(
email_text, policy=email.policy.default
)
print("Subject: <{subject}>".format_map(msg))
assert not msg.is_multipart()
print(msg.get_content())
Output
Subject: <test 📧 =20>
loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooong word
=20
^ line starts with =20
emoji: <📧>
msg.walk(), part.get_payload(decode=True) could be used to traverse more complex EmailMessage objects. See email Examples.

How to get emails received from gmail python?

I want to get the last 10 received gmails with python.
Currently I have this code but it only returns a limited number of emails and it manipulates pop3 directly, which makes it unnecessary long.
Source of the code: https://www.code-learner.com/python-use-pop3-to-read-email-example/
import poplib
import smtplib, ssl
def guess_charset(msg):
# get charset from message object.
charset = msg.get_charset()
# if can not get charset
if charset is None:
# get message header content-type value and retrieve the charset from the value.
content_type = msg.get('Content-Type', '').lower()
pos = content_type.find('charset=')
if pos >= 0:
charset = content_type[pos + 8:].strip()
return charset
def decode_str(s):
value, charset = decode_header(s)[0]
if charset:
value = value.decode(charset)
return value
# variable indent_number is used to decide number of indent of each level in the mail multiple bory part.
def print_info(msg, indent_number=0):
if indent_number == 0:
# loop to retrieve from, to, subject from email header.
for header in ['From', 'To', 'Subject']:
# get header value
value = msg.get(header, '')
if value:
# for subject header.
if header=='Subject':
# decode the subject value
value = decode_str(value)
# for from and to header.
else:
# parse email address
hdr, addr = parseaddr(value)
# decode the name value.
name = decode_str(hdr)
value = u'%s <%s>' % (name, addr)
print('%s%s: %s' % (' ' * indent_number, header, value))
# if message has multiple part.
if (msg.is_multipart()):
# get multiple parts from message body.
parts = msg.get_payload()
# loop for each part
for n, part in enumerate(parts):
print('%spart %s' % (' ' * indent_number, n))
print('%s--------------------' % (' ' * indent_number))
# print multiple part information by invoke print_info function recursively.
print_info(part, indent_number + 1)
# if not multiple part.
else:
# get message content mime type
content_type = msg.get_content_type()
# if plain text or html content type.
if content_type=='text/plain' or content_type=='text/html':
# get email content
content = msg.get_payload(decode=True)
# get content string charset
charset = guess_charset(msg)
# decode the content with charset if provided.
if charset:
content = content.decode(charset)
print('%sText: %s' % (' ' * indent_number, content + '...'))
else:
print('%sAttachment: %s' % (' ' * indent_number, content_type))
# input email address, password and pop3 server domain or ip address
email = 'yourgmail#gmail.com'
password = 'yourpassword'
# connect to pop3 server:
server = poplib.POP3_SSL('pop.gmail.com')
# open debug switch to print debug information between client and pop3 server.
server.set_debuglevel(1)
# get pop3 server welcome message.
pop3_server_welcome_msg = server.getwelcome().decode('utf-8')
# print out the pop3 server welcome message.
print(server.getwelcome().decode('utf-8'))
# user account authentication
server.user(email)
server.pass_(password)
# stat() function return email count and occupied disk size
print('Messages: %s. Size: %s' % server.stat())
# list() function return all email list
resp, mails, octets = server.list()
print(mails)
# retrieve the newest email index number
#index = len(mails)
index = 3
# server.retr function can get the contents of the email with index variable value index number.
resp, lines, octets = server.retr(index)
# lines stores each line of the original text of the message
# so that you can get the original text of the entire message use the join function and lines variable.
msg_content = b'\r\n'.join(lines).decode('utf-8')
# now parse out the email object.
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
import poplib
# parse the email content to a message object.
msg = Parser().parsestr(msg_content)
print(len(msg_content))
# get email from, to, subject attribute value.
email_from = msg.get('From')
email_to = msg.get('To')
email_subject = msg.get('Subject')
print('From ' + email_from)
print('To ' + email_to)
print('Subject ' + email_subject)
for part in msg.walk():
if part.get_content_type():
body = part.get_payload(decode=True)
print_info(msg, len(msg))
# delete the email from pop3 server directly by email index.
# server.dele(index)
# close pop3 server connection.
server.quit()
I also tried this code but it didn't work:
import imaplib, email, base64
def fetch_messages(username, password):
messages = []
conn = imaplib.IMAP4_SSL("imap.gmail.com", 993)
conn.login(username, password)
conn.select()
typ, data = conn.uid('search', None, 'ALL')
for num in data[0].split():
typ, msg_data = conn.uid('fetch', num, '(RFC822)')
for response_part in msg_data:
if isinstance(response_part, tuple):
messages.append(email.message_from_string(response_part[1]))
typ, response = conn.store(num, '+FLAGS', r'(\Seen)')
return messages
and this also didn't work for me...
import poplib
from email import parser
pop_conn = poplib.POP3_SSL('pop.gmail.com')
pop_conn.user('#gmail.com')
pop_conn.pass_('password')
messages = [pop_conn.retr(i) for i in range(1, len(pop_conn.list()[1]) + 1)]
# Concat message pieces:
messages = ["\n".join(mssg[1]) for mssg in messages]
#Parse message intom an email object:
messages = [parser.Parser().parsestr(mssg) for mssg in messages]
for message in messages:
print(message['subject'])
print(message['body'])

I managed to solve it, the only issue is that it marks as read every unread email, here is the code I used:
import imaplib
mail = imaplib.IMAP4_SSL('imap.gmail.com')
email = input('Email: ')
password = input('Password: ')
mail.login(email+'#gmail.com', password)
mail.list()
# Out: list of "folders" aka labels in gmail.
mail.select("inbox") # connect to inbox.
result, data = mail.search(None, "ALL")
ids = data[0] # data is a list.
id_list = ids.split() # ids is a space separated string
latest_email_id = id_list[-1] # get the latest
# fetch the email body (RFC822) for the given ID
result, data = mail.fetch(latest_email_id, "(RFC822)")
raw_email = data[0][1] # here's the body, which is raw text of the whole email
# including headers and alternate payloads
import email
email_message = email.message_from_string(str(raw_email))
print (email_message['To'])
print (email.utils.parseaddr(email_message['From'])) # for parsing "Yuji Tomita" <yuji#grovemade.com>
print (email_message.items()) # print all headers
# note that if you want to get text content (body) and the email contains
# multiple payloads (plaintext/ html), you must parse each message separately.
# use something like the following: (taken from a stackoverflow post)
def get_first_text_block(self, email_message_instance):
maintype = email_message_instance.get_content_maintype()
if maintype == 'multipart':
for part in email_message_instance.get_payload():
if part.get_content_maintype() == 'text':
return part.get_payload()
elif maintype == 'text':
return email_message_instance.get_payload()

https://developers.google.com/gmail/api/quickstart/python is the preferred way:
from gmail.gmail import gmail_auth, ListThreadsMatchingQuery
service = gmail_auth()
threads = ListThreadsMatchingQuery(service, query=query)
where:
def ListThreadsMatchingQuery(service, user_id='me', query=''):
"""List all Threads of the user's mailbox matching the query.
Args:
service: Authorized Gmail API service instance.
user_id: User's email address. The special value "me"
can be used to indicate the authenticated user.
query: String used to filter messages returned.
Eg.- 'label:UNREAD' for unread messages only.
Returns:
List of threads that match the criteria of the query. Note that the returned
list contains Thread IDs, you must use get with the appropriate
ID to get the details for a Thread.
"""
try:
response = service.users().threads().list(userId=user_id, q=query).execute()
threads = []
if 'threads' in response:
threads.extend(response['threads'])
while 'nextPageToken' in response:
page_token = response['nextPageToken']
response = service.users().threads().list(userId=user_id, q=query,
pageToken=page_token).execute()
threads.extend(response['threads'])
return threads
except errors.HttpError as error:
raise error

You should try easyimap lib to get a list of e-mails, I'm not sure if works with pop3.
Code example:
import easyimap
host = 'imap.gmail.com'
user = 'you#example.com'
password = 'secret'
mailbox = 'INBOX.subfolder'
imapper = easyimap.connect(host, user, password, mailbox)
email_quantity = 10
emails_from_your_mailbox = imapper.listids(limit=email_quantity)
imapper.quit()

How do I know the "From" of an email? (Using IMAP)

In this case I am downloading plain text from an email with a criteria,
but how do I know the #gmail.com adress who sent it.
I am using Python 3.5.4
import imaplib
import email
mail = imaplib.IMAP4_SSL('imap.gmail.com')
#imaplib module implements connection based on IMAPv4 protocol
mail.login('myemail', 'mypassword')
mail.list() # Lists all labels in GMail
mail.select('inbox') # Connected to inbox.
result, data = mail.uid('search', None, '(HEADER Subject "[News]")')
#search and return uids instead
i = len(data[0].split()) # data[0] is a space separate string
for x in range(i):
latest_email_uid = data[0].split()[x] # unique ids wrt label selected
result, email_data = mail.uid('fetch', latest_email_uid, '(RFC822)')
# fetch the email body (RFC822) for the given ID
raw_email = email_data[0][1]
#From = email.utils.parseaddr(email_data['From'])
#continue inside the same for loop as above
raw_email_string = raw_email.decode('utf-8')
# converts byte literal to string removing b''
email_message = email.message_from_string(raw_email_string)
#this will loop through all the available multiparts in mail
for part in email_message.walk():
if part.get_content_type() == "text/plain": # ignore attachments/html
enter code here`body = part.get_payload(decode=True)
save_string = str("Llave de amigo" + str(x) + str("a"))
# location on disk
myfile = open(save_string, 'a')
myfile.write(body.decode('utf-8'))
# body is again a byte literal
myfile.close()

This is perhaps not obvious from the documentation (assuming Python 2.7), but the email_message object acts like a dict, by implementing the __getitem__ function. Since you fetched and parsed the entirety of the message, you should be able to access it simply as:
email_message['from']
Note, this gives you a raw representation of the header, which is probably okay in a lot of cases.
You may then want to use email.utils.parseaddr to break it into constituent parts:
realname, addr = email.utils.parseaddr(email_message['from')).
email.utils.getaddresses might be useful if you then parse To or Cc headers with more than one recipient.
If you need to deal with internationalized headers in older versions of Python, email.header.decode_header and email.header.make_header can be used.
In Python3.6, this has changed significantly, and should be more straightforward.

email.message_from_string() and imaplib adding '<' '>'

Alright so I load the email in from gmail with imaplib and then when I'm trying to parse the email it does not separate anything out in a usable format. I suspect this is because somewhere in the process '<' or '>' are being added to the raw email.
Here is what the debugger is showing me after I have called the method:
As you can see it hasn't really parsed anything into a usable format.
Here is the code I'm using: (NOTE: the .replace('>', '') seems to have no effect on the end result.)
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login('myEmail#gmail.com', 'password')
mail.list()
mail.select('inbox')
typ, data = mail.search(None, 'ALL')
ids = data[0]
id_list = ids.split()
# get the most recent email id
latest_email_id = int( id_list[-1] )
# iterate through 15 messages in descending order starting with latest_email_id
# the '-1' dictates reverse looping order
for i in range( latest_email_id -10, latest_email_id-15, -1 ):
typ, data = mail.fetch( str(i), '(RFC822)' )
for response_part in data:
if isinstance(response_part, tuple):
msg = str(response_part[1]).replace('<', '')
msg = msg.replace('>', '')
msg = email.message_from_string(msg)
#msg = feedparser.parse(response_part[1])
varSubject = msg['subject']
varFrom = msg['from']
python email.message_from_string() parse problems and Parsing email with Python both had very similar and identical problems to me (I think) and they solved it by altering their email, however I'm reading my email straight from Google's servers so I'm not sure exactly what to do to the email to fix it up since removing all '<' and '>' obviously won't work.
So, how do I fix the email that is read from imaplib so that it can be easily read with email.message_from_string()? (Or any other improvements/possible solutions as I'm not 100% certain the '<' and '>' are actually the problem, I'm only guessing based off of those other questions asked.)
Cheers

You shouldn't parse <, > and data between them - it is like parsing HTML, but much more complicated. There are existing solutions to do it.
Here is my code that can read mail with attachments, extract data that can be used for further use and process it to human and code readable format. As you can see, all tasks are being made by third-party modules.
from datetime import datetime
import imaplib
import email
import html2text
from os import path
class MailClient(object):
def __init__(self):
self.m = imaplib.IMAP4_SSL('your.server.com')
self.Login()
def Login(self):
result, data = self.m.login('login#domain.com', 'p#s$w0rd')
if result != 'OK':
raise Exception("Error connecting to mailbox: {}".format(data))
def ReadLatest(self, delete = True):
result, data = self.m.select("inbox")
if result != 'OK':
raise Exception("Error reading inbox: {}".format(data))
if data == ['0']:
return None
latest = data[0].split()[-1]
result, data = self.m.fetch(latest, "(RFC822)")
if result != 'OK':
raise Exception("Error reading email: {}".format(data))
if delete:
self.m.store(latest, '+FLAGS', '\\Deleted')
message = email.message_from_string(data[0][1])
res = {
'From' : email.utils.parseaddr(message['From'])[1],
'From name' : email.utils.parseaddr(message['From'])[0],
'Time' : datetime.fromtimestamp(email.utils.mktime_tz(email.utils.parsedate_tz(message['Date']))),
'To' : message['To'],
'Subject' : email.Header.decode_header(message["Subject"])[0][0],
'Text' : '',
'File' : None
}
for part in message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get_content_maintype() == 'text':
# reading as HTML (not plain text)
_html = part.get_payload(decode = True)
res['Text'] = html2text.html2text(_html)
elif part.get_content_maintype() == 'application' and part.get_filename():
fname = path.join("your/folder", part.get_filename())
attachment = open(fname, 'wb')
attachment.write(part.get_payload(decode = True))
attachment.close()
if res['File']:
res['File'].append(fname)
else:
res['File'] = [fname]
return res
def __del__(self):
self.m.close()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode - python

use 'ISO 8859-1' instead of 'utf-8'

Related

python decode email from base64

Spaces replaced by =20 after extracting text from email

How to get emails received from gmail python?

How do I know the "From" of an email? (Using IMAP)

email.message_from_string() and imaplib adding '<' '>'

Categories

Resources