Facing issues in downloading a particular attachment from gmail through Python

Facing issues in downloading a particular attachment from gmail through Python - python

I have the below piece of code which works fine for downloading CSV files. But I'm trying to download a file without any extension name where this is failing. The part.get_filename() is not fetching anything and hence the code is failing with error NameError: name 'fileName' is not defined. The search is working correctly and identifying the particular email.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import os
import imaplib
import email
#from email.header import decode_header
#import webbrowser
import os
import datetime
import time
import glob
import shutil
today = datetime.date.today()
yday = today - datetime.timedelta(days=5)
# account credentials
username = "xyz#gmail.com"
with open(r'C:\Users\xyz\Downloads\Google sheet key\gmail_app_pwd.txt','r') as pwd:
password=pwd.read()
# create an IMAP4 class with SSL
mailBox = imaplib.IMAP4_SSL("imap.gmail.com")
# authenticate
mailBox.login(username, password)
svdir = r'C:\Users\xyz\Downloads\Work'
boxList = mailBox.list()
# print(boxList)
mailBox.select()
searchQuery = '(SUBJECT "Mailer as on ' + str(yday) +'")'
result, data = mailBox.uid('search', None, searchQuery)
ids = data[0]
# list of uids
id_list = ids.split()
i = len(id_list)
#x=0
for x in range(i):
latest_email_uid = id_list[x]
# fetch the email body (RFC822) for the given ID
result, email_data = mailBox.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
fileName = part.get_filename()
if bool(fileName):
filePath = os.path.join(svdir, fileName)
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
subject = str(email_message).split("Subject: ", 1)[1].split("\nTo:", 1)[0]
print('Downloaded "{file}" from email titled "{subject}" with UID {uid}.'.format(file=fileName, subject=subject, uid=latest_email_uid.decode('utf-8')))
mailBox.close()
mailBox.logout()

It worked after I removed the checks. Updated code:
for x in range(i):
latest_email_uid = id_list[x]
# fetch the email body (RFC822) for the given ID
result, email_data = mailBox.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
**fileName = "file.csv"**
if bool(fileName):
filePath = os.path.join(svdir, fileName)
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()

Related

Error while decoding UTF-8 encoded csv file

first time posting here :)
I have an issue while trying to download all the attachments from an email Inbox.
I download them, then write them into a file, which I specify the path to.
It works perfectly well for .png files, which are directly dowloaded to the file, but when it comes to a .csv file, it gives me this error message :
OSError: [Errno 22] Invalid argument: 'C:\Users\antoi\OneDrive\Bureau\python_secge\=?UTF-8?B?RXh0cmFjdCBzZXggZ8OpIHB1YmxpYy0yMDIyLTAzLTI2LTAwLTAwLTI2LmNzdg==?='
I think it does not decode well the name of the csv file, but I don't know why.
Thanks for your help!
If you want to look at my code below :
import smtplib
import imaplib
import base64
import os
import email
smtp_address = 'smtp.gmail.com'
smtp_port = 465
email_user = 'XXXX'
email_pass = 'XXXXX'
mail = imaplib.IMAP4_SSL('imap.gmail.com',993)
mail.login(email_user, email_pass)
mail.select('Inbox')
type, data = mail.search(None, 'ALL')
mail_ids=data[0]
idlist=mail_ids.split()
for num in data[0].split():
typ, data = mail.fetch(num, '(RFC822)' )
raw_email = data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
fileName = part.get_filename()
if bool(fileName):
filePath = os.path.join(r'C:\Users\antoi\OneDrive\Bureau\python_secge', fileName)
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
subject = str(email_message).split("Subject: ", 1)
I tried to change the name of the csv file, which downloaded well, but the content of it was as if it was not decoded :
# #*%%*525EE\ÿÂ ¿ A" ÿÄ7 ÿÚ iÙWúßóÓ ÅIq«‚ÙÊŸ§ˆ˜²‚6`Ø¶ p²#áíîŸÐà ¼ïDù÷.ŽéCÅ >ªþ®|…dÕË' <å8
!õÑàäH¬

Stolen from Encoded-word Syntax1:
import re
import base64
import quopri
def encoded_words_to_text(encoded_words):
encoded_word_regex = r'=\?{1}(.+)\?{1}([B|Q])\?{1}(.+)\?{1}='
charset, encoding, encoded_text = re.match(encoded_word_regex,
encoded_words).groups()
if encoding == 'B':
byte_string = base64.b64decode(encoded_text)
elif encoding == 'Q':
byte_string = quopri.decodestring(encoded_text)
return byte_string.decode(charset)
Apply as follows:
filename = '=?UTF-8?B?RXh0cmFjdCBzZXggZ8OpIHB1YmxpYy0yMDIyLTAzLTI2LTAwLTAwLTI2LmNzdg==?='
encoded_words_to_text(filename)
'Extract sex gé public-2022-03-26-00-00-26.csv'
1 and eliminated SyntaxWarning: "is" with a literal. Did you mean "=="?

IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode

After 5 hours of trying, time to get some help. Sifted through all the stackoverflow questions related to this but couldn't find the answer.
The code is a gmail parser - works for most emails but some emails cause the UnicodeDecodeError. The problem is "raw_email.decode('utf-8')" but changing it (see comments) causes a different problem down below.
# Source: https://stackoverflow.com/questions/7314942/python-imaplib-to-get-gmail-inbox-subjects-titles-and-sender-name
import datetime
import time
import email
import imaplib
import mailbox
from vars import *
import re # to remove links from str
import string
EMAIL_ACCOUNT = 'gmail_login'
PASSWORD = 'gmail_psswd'
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(EMAIL_ACCOUNT, PASSWORD)
mail.list()
mail.select('inbox')
result, data = mail.uid('search', None, "ALL") # (ALL/UNSEEN)
id_list = data[0].split()
email_rev = reversed(id_list) # Returns a type list.reverseiterator, which is not list
email_list = list(email_rev)
i = len(email_list)
todays_date = time.strftime("%m/%d/%Y")
for x in range(i):
latest_email_uid = email_list[x]
result, email_data = mail.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1] # Returns a byte
raw_email_str = raw_email.decode('utf-8') # Returns a str
#raw_email_str = base64.b64decode(raw_email_str1) # Tried this but didn't work.
#raw_email_str = raw_email.decode('utf-8', errors='ignore') # Tried this but caused a TypeError down where var subject is created because something there is expecting a str or byte-like
email_message = email.message_from_string(raw_email_str)
date_tuple = email.utils.parsedate_tz(email_message['Date'])
date_short = f'{date_tuple[1]}/{date_tuple[2]}/{date_tuple[0]}'
# Header Details
if date_short == '12/23/2019':
#if date_tuple:
# local_date = datetime.datetime.fromtimestamp(email.utils.mktime_tz(date_tuple))
# local_message_date = "%s" %(str(local_date.strftime("%a, %d %b %Y %H:%M:%S")))
email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
#print(subject)
if email_from.find('restaurants#uber.com') != -1:
print('yay')
# Body details
if email_from.find('restaurants#uber.com') != -1 and subject.find('Payment Summary') != -1:
for part in email_message.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
body = body.decode("utf-8") # Convert byte to str
body = body.replace("\r\n", " ")
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body) # removes url links
text2 = text.translate(str.maketrans('', '', string.punctuation))
body_list = re.sub("[^\w]", " ", text2).split()
print(body_list)
print(date_short)
else:
continue

Here is an example how to retrieve and read mail parts with imapclient and the email.* modules from the python standard libs:
from imapclient import IMAPClient
import email
from email import policy
def walk_parts(part, level=0):
print(' ' * 4 * level + part.get_content_type())
# do something with part content (applies encoding by default)
# part.get_content()
if part.is_multipart():
for part in part.get_payload():
get_parts(part, level + 1)
# context manager ensures the session is cleaned up
with IMAPClient(host="your_mail_host") as client:
client.login('user', 'password')
# select some folder
client.select_folder('INBOX')
# do something with folder, e.g. search & grab unseen mails
messages = client.search('UNSEEN')
for uid, message_data in client.fetch(messages, 'RFC822').items():
email_message = email.message_from_bytes(
message_data[b'RFC822'], policy=policy.default)
print(uid, email_message.get('From'), email_message.get('Subject'))
# alternatively search for specific mails
msgs = client.search(['SUBJECT', 'some subject'])
#
# do something with a specific mail:
#
# fetch a single mail with UID 12345
raw_mails = client.fetch([12345], 'RFC822')
# parse the mail (very expensive for big mails with attachments!)
mail = email.message_from_bytes(
raw_mails[12345][b'RFC822'], policy=policy.default)
# Now you have a python object representation of the mail and can dig
# into it. Since a mail can be composed of several subparts we have
# to walk the subparts.
# walk all parts at once
for part in mail.walk():
# do something with that part
print(part.get_content_type())
# or recurse yourself into sub parts until you find the interesting part
walk_parts(mail)
See the docs for email.message.EmailMessage. There you find all needed bits to read into a mail message.

use 'ISO 8859-1' instead of 'utf-8'

I had the same issue And after a lot of research I realized that I simply need to use, message_from_bytes function from email rather than using message_from_string
so for your code simply replace:
raw_email_str = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_str)
to
email_message = email.message_from_bytes(raw_email)
should work like a charm :)

Python Download Attachment from Outlook w/ Imaplib4 Never downloads the last Attachment

The script I wrote:1) connects to my work Outlook email. The script reads my username and password from a text file which is found in the variable TextFilePath. 2) Looks for attachments based upon a searchterm I choose that would be in the Subject of the email (here, it's "special_search_term_in_email"). 3) Downloads the attachments to a specific folder titled 'DownloadFolderPath'.
The goal for this script is to run everyday and connect to my email and download 4 attachments that will be sent to me everyday. The issue is that the script will sometimes download all 4 attachments, but then sometimes will only download 3 of 4 attachments and then won't even terminate. Appreciate the help.
import email
import imaplib
import os
import datetime
import csv
# these 3 variables you need to set to run the code
mail_server = "imap-mail.outlook.com"
TextFilePath = "C:/Users/FakeName/PycharmProjects/imaplib_script/unpw.txt"
LogFilePath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/logfile.csv'
DownloadFolderPath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/'
# below read_input_return_list function reads input from a text file and returns a list
def read_input_return_list():
textunpw = open(TextFilePath, "r")
lines = textunpw.readlines()
username = lines[0].strip('\n')
password = lines[1]
textunpw.close()
return [username, password]
read_input_variable = read_input_return_list()
username = read_input_variable[0]
password = read_input_variable[1]
script_ran_time=datetime.datetime.today().strftime('%c')
mail = imaplib.IMAP4_SSL(mail_server)
mail.login(username, password)
print("{0} Connecting to mailbox via IMAP...".format(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")))
mail.select()
type, data = mail.search(None, '(SUBJECT "special_search_term_in_email")')
total_count = 0
with open(LogFilePath,newline='', encoding='utf-8', mode='a') as csv_file:
writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for num in data[0].split():
type, data = mail.fetch(num, '(RFC822)')
raw_email = data[0][1]
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
for part1 in part.walk():
c_type = part.get_content_type()
c_disp0 = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments
if c_type == 'text/plain' and 'attachment' not in c_disp0:
body = part1.get_payload(decode=True)
break
attachment = part.get_filename()
if bool(attachment):
filePath = os.path.join(DownloadFolderPath, attachment)
if os.path.isfile(filePath):
filename, file_extension = os.path.splitext(filePath)
FileDownloadAndWriteTime = '__' + datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
while os.path.exists(new_fname):
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
filePath = new_fname
filepathopen = open(filePath, 'wb')
filepathopen.write(part.get_payload(decode=True))
FileDownloadAndWriteTime = datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
total_count += 1
writer.writerow([filePath,FileDownloadAndWriteTime, script_ran_time])
filepathopen.close()
print('Download file attachment name: ', attachment)
print("Total count of downloaded documents: ", total_count)
mail.close()

I can't pinpoint what's wrong but try adopting this code here: https://gist.github.com/cdunklau/9001357
It worked for me.
I updated the find_attachments method like this:
def find_attachments(message):
"""
Return a tuple of parsed content-disposition dict, message object
for each attachment found.
"""
found = []
for part in message.walk():
if 'content-disposition' not in part:
continue
cdisp = part['content-disposition'].split(';')
cdisp = [x.strip() for x in cdisp]
if cdisp[0].lower() != 'attachment':
continue
parsed = {}
for kv in cdisp[1:]:
try:
key, val = kv.split('=')
if val.startswith('"'):
val = val.strip('"')
elif val.startswith("'"):
val = val.strip("'")
parsed[key] = val
except Exception as e:
parsed['filename']=kv.replace('filename=','')
found+=list(parsed.values())
return found

Read email in python 3.7 using imaplib with HTML body and attachments in the email

I would really appreciate if someone can help me with this issue.
I have implemented the below code to read "unread emails from gmail inbox". I need to print "To", "From", "Subject", "Body" and "save attachments in a specified location"
I have 2 issues here.
If there is any email with attachments, it gives the error Body: [<email.message.Message object at 0x026D1050>, <email.message.Message object at 0x02776B70>]. It will print all the required things and saves attachments but DOESN'T print the body.
This works fine if no attachment is included.
If there is an email body with any styling in it like "bold/italic/underline/colour...etc", it doesn't print as it is.
Example : Python is printed as Python=C2=A0i= and sometimes different styling is seperated by "*".
def get_body(email_message):
for payload in email_message.get_payload():
# print('Body:\t', payload.get_payload())
break
return(payload.get_payload())
def read_email(server,uname,pwd):
username = uname
password = pwd
mail = imaplib.IMAP4_SSL(server)
mail.login(username, password)
mail.select("inbox")
try:
result, data = mail.uid('search', None, '(UNSEEN)')
inbox_item_list = data[0].split()
most_recent = inbox_item_list[-1]
result2, email_data = mail.uid('fetch', most_recent, '(RFC822)')
raw_email = email_data[0][1].decode("UTF-8")
email_message = email.message_from_string(raw_email)
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
att_path = os.path.join(location, filename)
if not os.path.isfile(att_path):
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
print('Downloaded file:', filename)
if email_message.is_multipart():
for payload in email_message.get_payload():
print('To:\t\t', email_message['To'])
print('From:\t', email_message['From'])
print('Subject:', email_message['Subject'])
print('Date:\t',email_message['Date'])
print('Body:\t', get_body(email_message))
break
else:
print('Nothing'])
except IndexError:
print("No new email")
while True:
read_email("imap.gmail.com", "s#gmail.com", "spassword")
time.sleep(10)
Many thanks

I new to python and this is the complete working code I have done to read unseen emails. You can print the elements according to your requirements. It works for gmail and office 365. This script runs for every 10 seconds. This might also work for other email providers by passing the credentials. Hope this helps.
import email
import imaplib
import os
import html2text
import time
detach_dir = 'locationWhereYouWantToSaveYourAttachments'
def get_body(email_message):
for payload in email_message.get_payload():
break
return payload.get_payload()
def two_way_email(server,uname,pwd):
username = uname
password = pwd
mail = imaplib.IMAP4_SSL(server)
mail.login(username, password)
mail.select("inbox")
try:
result, data = mail.uid('search', None, '(UNSEEN)')
inbox_item_list = data[0].split()
most_recent = inbox_item_list[-1]
result2, email_data = mail.uid('fetch', most_recent, '(RFC822)')
raw_email = email_data[0][1].decode("UTF-8")
email_message = email.message_from_string(raw_email)
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
att_path = os.path.join(detach_dir, filename)
if not os.path.isfile(att_path):
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
print('Downloaded file:', filename)
if email_message.is_multipart():
for payload in email_message.get_payload():
print('To:\t\t', email_message['To'])
print('From:\t', email_message['From'])
print('Subject:', email_message['Subject'])
print('Date:\t',email_message['Date'])
for part in email_message.walk():
if (part.get_content_type() == 'text/plain') and (part.get('Content-Disposition') is None):
print('Body:\t',part.get_payload())
break
else:
print('To:\t\t', email_message['To'])
print('From:\t', email_message['From'])
print('Subject:', email_message['Subject'])
print('Date:\t', email_message['Date'])
print('Thread-Index:\t', email_message['Thread-Index'])
text = f"{email_message.get_payload(decode=True)}"
html = text.replace("b'", "")
h = html2text.HTML2Text()
h.ignore_links = True
output = (h.handle(f'''{html}''').replace("\\r\\n", ""))
output = output.replace("'", "")
print(output)
except IndexError:
print("No new email")
while True:
two_way_email("outlook.office365.com", "yourOffice365EmailAddressHere", "yourpassword")
two_way_email("imap.gmail.com", "yourGmailAddressHere", "yourPassword")
time.sleep(10)

from imap_tools import MailBox, A
with MailBox('imap.mail.com').login('test#mail.com', 'pwd') as mailbox:
for msg in mailbox.fetch(A(seen=False)):
body = msg.text or msg.html
print(msg.subject, msg.from_, msg.to, len(body))
for att in msg.attachments:
print(att.filename, len(att.payload))
https://github.com/ikvk/imap_tools
I am lib author.
More examples: https://github.com/ikvk/imap_tools/blob/master/examples/idle.py#L19

Retrieve Email Body Text Using imaplib

I'm trying to ensure that I retrieve all of the body text (no matter what format the email is in) from a certain email address. The connection details are omitted in this example (imaplib_connect) since all seems to work, but I don't think the below is sufficient for all email bodies. Any improvement recommendations? I'm writing each email's body to a file.
import imaplib
import imaplib_connect
import uuid
import ConfigParser
import os
import email
c = imaplib_connect.open_connection()
try:
config = ConfigParser.ConfigParser()
config.read([os.path.expanduser('~/reader.config')])
fromAddress = config.get('account', 'fromAddress')
typ, data = c.select('INBOX')
typ, data = c.search(None, '(FROM "' + fromAddress + '")')
print "Processing..."
for num in data[0].split():
typ, data = c.fetch(num, '(RFC822)')
rawMessage = data[0][1]
emailMessage = email.message_from_string(rawMessage)
maintype = emailMessage.get_content_maintype()
fileName = uuid.uuid4().hex
if maintype == 'multipart':
for part in emailMessage.get_payload():
if part.get_content_maintype() == 'text':
with open(fileName + ".txt", "wb") as fo:
fo.write(part.get_payload())
elif maintype == 'text':
with open(fileName + ".txt", "wb") as fo:
fo.write(part.get_payload())
finally:
try:
c.close()
except:
pass
c.logout()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Facing issues in downloading a particular attachment from gmail through Python - python

Related

Error while decoding UTF-8 encoded csv file

IMAP message gets UnicodeDecodeError 'utf-8' codec can't decode

Python Download Attachment from Outlook w/ Imaplib4 Never downloads the last Attachment

Read email in python 3.7 using imaplib with HTML body and attachments in the email

Retrieve Email Body Text Using imaplib

Categories

Resources