Error while decoding UTF-8 encoded csv file

Error while decoding UTF-8 encoded csv file - python

first time posting here :)
I have an issue while trying to download all the attachments from an email Inbox.
I download them, then write them into a file, which I specify the path to.
It works perfectly well for .png files, which are downloaded directly to the folder, but when it comes to a .csv file, it gives me this error message:
OSError: [Errno 22] Invalid argument: 'C:\Users\antoi\OneDrive\Bureau\python_secge\=?UTF-8?B?RXh0cmFjdCBzZXggZ8OpIHB1YmxpYy0yMDIyLTAzLTI2LTAwLTAwLTI2LmNzdg==?='
I think the name of the csv file is not being decoded correctly, but I don't know why.
Thanks for your help!
If you want to look at my code below :
import smtplib
import imaplib
import base64
import os
import email
# Script from the question: log in to Gmail over IMAP, fetch every message in
# the Inbox and write each attachment into a fixed local folder.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
# smtp_address / smtp_port are not used anywhere in this snippet.
smtp_address = 'smtp.gmail.com'
smtp_port = 465
email_user = 'XXXX'
email_pass = 'XXXXX'
mail = imaplib.IMAP4_SSL('imap.gmail.com',993)
mail.login(email_user, email_pass)
mail.select('Inbox')
# NOTE(review): `type` shadows the builtin of the same name.
type, data = mail.search(None, 'ALL')
# mail_ids / idlist are computed but the loop below re-splits data[0] itself.
mail_ids=data[0]
idlist=mail_ids.split()
for num in data[0].split():
# Fetch the full raw message (RFC822) for this message number.
typ, data = mail.fetch(num, '(RFC822)' )
raw_email = data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
# Multipart containers hold other parts and carry no file of their own.
if part.get_content_maintype() == 'multipart':
continue
# Parts without a Content-Disposition header are skipped.
if part.get('Content-Disposition') is None:
continue
# NOTE(review): the file name is used exactly as returned here. The OSError
# quoted in the question shows the name arriving as an RFC 2047 encoded-word
# ('=?UTF-8?B?...?='), which is then joined into the path undecoded.
fileName = part.get_filename()
if bool(fileName):
filePath = os.path.join(r'C:\Users\antoi\OneDrive\Bureau\python_secge', fileName)
# Existing files are never overwritten.
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
# decode=True asks for the payload with its transfer encoding undone.
fp.write(part.get_payload(decode=True))
fp.close()
# The result of this split is a list; it is not used further in this snippet.
subject = str(email_message).split("Subject: ", 1)
I tried to change the name of the csv file, which downloaded well, but the content of it was as if it was not decoded :
# #*%%*525EE\ÿÂ ¿ A" ÿÄ7 ÿÚ iÙWúßóÓ ÅIq«‚ÙÊŸ§ˆ˜²‚6`Ø¶ p²#áíîŸÐà ¼ïDù÷.ŽéCÅ >ªþ®|…dÕË' <å8
!õÑàäH¬

Adapted from "Encoded-word Syntax" [1]:
import re
import base64
import quopri
def encoded_words_to_text(encoded_words):
    """Decode RFC 2047 encoded-words in a header value to plain text.

    Both ``B`` (base64) and ``Q`` (quoted-printable) encoded-words are
    handled, regardless of letter case. In ``Q`` words an underscore decodes
    to a space. A value may contain several encoded-words, and a value with
    no encoded-word at all is returned unchanged.

    Args:
        encoded_words: Header value, e.g. ``'=?UTF-8?B?...?='`` as returned
            by ``Message.get_filename()`` for a non-ASCII attachment name.

    Returns:
        The decoded text as ``str``.

    Raises:
        LookupError: If an encoded-word names a charset Python does not know.
    """
    # Imported locally so the function works wherever it is pasted.
    from email.header import decode_header, make_header

    # decode_header() splits the value into (bytes, charset) chunks and
    # make_header() joins them back into a single unicode string.
    return str(make_header(decode_header(encoded_words)))
Apply as follows:
filename = '=?UTF-8?B?RXh0cmFjdCBzZXggZ8OpIHB1YmxpYy0yMDIyLTAzLTI2LTAwLTAwLTI2LmNzdg==?='
encoded_words_to_text(filename)
'Extract sex gé public-2022-03-26-00-00-26.csv'
[1] The snippet above was also modified to eliminate the warning `SyntaxWarning: "is" with a literal. Did you mean "=="?`

Related

How to extract attachments from base64 encoded files?

I am trying to make a script which will extract attachments from base64 encoded files.
base64 files are on amazon s3. Idea was, to take the data from file on s3, save it in temp file and pass it to the 'extraction' part of the code.
Script below perfectly works with a single file which is stored locally on my pc.
from email.message import EmailMessage
from email.header import decode_header
# Script from the question: parse one locally stored e-mail file, print its
# subject and sender, and save every attachment into the working directory.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
with open('mail2', 'r') as efile: # mail2 is the name of the file that is stored on s3
# NOTE(review): the traceback later in this post shows message_from_file as a
# function of the `email` package; confirm EmailMessage really exposes it.
msg = EmailMessage.message_from_file(efile)
# Only the first (value, charset) chunk of each decoded header is used.
subject, encoding = decode_header(msg["Subject"])[0]
# NOTE(review): unlike From below, subject is printed without a bytes check.
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
# NOTE(review): the bare except silently ignores every failure here.
except:
pass
if "attachment" in content_disposition:
# download attachment
# NOTE(review): the name is used as returned, with no header decoding.
filename = part.get_filename()
if filename:
#filepath = os.path.join(folder_name, filename)
# download attachment and save it
# NOTE(review): the file object is never explicitly closed.
open(filename, "wb").write(part.get_payload(decode=True))
else:
pass
In order to avoid constant uploadings from s3, i decided to get the body of files and transfer them to temp file using boto3.
# Script from the question: for every S3 object whose key matches `substring`,
# copy the object body into a temporary file and run the same attachment
# extraction on it.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
my_bucket = s3.Bucket('bucket_name')
substring = "reestrs"
for obj in my_bucket.objects.all():
# fetching the data
if re.search(substring, obj.key):
raw_data = obj.get()['Body'].read()
temp = tempfile.NamedTemporaryFile()
temp.write(raw_data)
# NOTE(review): there is no flush()/seek() between this write and the reopen
# by name below, so the data may not be on disk yet (TODO confirm).
print(temp)
with open(temp.name, 'r') as efile:
# NOTE(review): this passes temp.name (a str path) rather than the opened
# file object `efile`; the traceback in the question reports
# "'str' object has no attribute 'read'" on exactly this call.
msg = email.message_from_file(temp.name)
# Only the first (value, charset) chunk of each decoded header is used.
subject, encoding = decode_header(msg["Subject"])[0]
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
# NOTE(review): the bare except silently ignores every failure here.
except:
pass
if "attachment" in content_disposition:
# download attachment
# NOTE(review): the name is used as returned, with no header decoding.
filename = part.get_filename()
if filename:
#filepath = os.path.join(folder_name, filename)
# download attachment and save it
open(filename, "wb").write(part.get_payload(decode=True))
# NOTE(review): with the indentation lost it is unclear which branch owns this
# close(); verify the temp file is closed exactly once per S3 object.
temp.close()
else:
pass
Whenever i launch this script, result is
File "rum_fin.py", line 42, in <module>
msg = email.message_from_file(temp.name)
File "/usr/lib/python3.8/email/__init__.py", line 54, in message_from_file
return Parser(*args, **kws).parse(fp)
File "/usr/lib/python3.8/email/parser.py", line 53, in parse
data = fp.read(8192)
AttributeError: 'str' object has no attribute 'read'
I've tested the script above with a single file. Unfortunately, the result is an attachment with a base64-encoded name instead of a normal csv/png/txt name.
# Single-object variant from the question: copy one S3 object into a temporary
# file, parse it as an e-mail and print the subject and sender.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
obj = s3.Object('bucket', 'filename')
raw_data = obj.get()['Body'].read()
temp = tempfile.NamedTemporaryFile()
temp.write(raw_data)
# NOTE(review): there is no flush()/seek() between this write and the reopen
# by name below, so the data may not be on disk yet (TODO confirm).
print(temp)
# In[7]:
with open(temp.name, 'r') as efile: # mail2 is the name of the file that is stored on s3
# Here the opened file object (not the path) is handed to the parser.
msg = email.message_from_file(efile)
# Only the first (value, charset) chunk of each decoded header is used.
subject, encoding = decode_header(msg["Subject"])[0]
# NOTE(review): unlike From below, subject is printed without a bytes check.
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
I suppose, it's something to do with the way of passing the data from actual file to the temp file. Any ideas how to solve it?

Facing issues in downloading a particular attachment from gmail through Python

I have the below piece of code which works fine for downloading CSV files. But I'm trying to download a file without any extension name where this is failing. The part.get_filename() is not fetching anything and hence the code is failing with error NameError: name 'fileName' is not defined. The search is working correctly and identifying the particular email.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import os
import imaplib
import email
#from email.header import decode_header
#import webbrowser
import os
import datetime
import time
import glob
import shutil
# Script from the question: search Gmail for "Mailer as on <date>" messages and
# save their attachments into `svdir`.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
today = datetime.date.today()
# Despite the name, this is the date five days before today.
yday = today - datetime.timedelta(days=5)
# account credentials
username = "xyz#gmail.com"
with open(r'C:\Users\xyz\Downloads\Google sheet key\gmail_app_pwd.txt','r') as pwd:
# NOTE(review): read() keeps any trailing newline that the file contains.
password=pwd.read()
# create an IMAP4 class with SSL
mailBox = imaplib.IMAP4_SSL("imap.gmail.com")
# authenticate
mailBox.login(username, password)
svdir = r'C:\Users\xyz\Downloads\Work'
boxList = mailBox.list()
# print(boxList)
mailBox.select()
searchQuery = '(SUBJECT "Mailer as on ' + str(yday) +'")'
result, data = mailBox.uid('search', None, searchQuery)
ids = data[0]
# list of uids
id_list = ids.split()
i = len(id_list)
#x=0
for x in range(i):
latest_email_uid = id_list[x]
# fetch the email body (RFC822) for the given ID
result, email_data = mailBox.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
# Multipart containers hold other parts and carry no file of their own.
if part.get_content_maintype() == 'multipart':
continue
# Parts without a Content-Disposition header are skipped.
if part.get('Content-Disposition') is None:
continue
# NOTE(review): fileName is only assigned once a part passes both checks
# above; if every part is skipped, the print below raises the NameError
# described in the question.
fileName = part.get_filename()
if bool(fileName):
filePath = os.path.join(svdir, fileName)
# Existing files are never overwritten.
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
# Takes the text between "Subject: " and the following "\nTo:" in the message.
subject = str(email_message).split("Subject: ", 1)[1].split("\nTo:", 1)[0]
print('Downloaded "{file}" from email titled "{subject}" with UID {uid}.'.format(file=fileName, subject=subject, uid=latest_email_uid.decode('utf-8')))
mailBox.close()
mailBox.logout()

It worked after I removed the checks. Updated code:
# Updated loop from the answer: the multipart / Content-Disposition checks are
# removed and a fixed file name is used for whatever part is written.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
for x in range(i):
latest_email_uid = id_list[x]
# fetch the email body (RFC822) for the given ID
result, email_data = mailBox.uid('fetch', latest_email_uid, '(RFC822)')
raw_email = email_data[0][1]
# converts byte literal to string removing b''
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
# downloading attachments
for part in email_message.walk():
# The surrounding ** ** is markdown emphasis from the post, not Python syntax.
**fileName = "file.csv"**
if bool(fileName):
filePath = os.path.join(svdir, fileName)
# NOTE(review): every part maps to the same path, so after the first write
# this check is False and all later parts and messages are skipped.
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
# NOTE(review): multipart container parts are no longer filtered out; verify
# that get_payload(decode=True) returns bytes for the first part walked.
fp.write(part.get_payload(decode=True))
fp.close()

Get e-mail recieved date of IMAP using Python

I'm using the following code (got it from StackOverflow :)) to get all unread e-mail from specific email adresses. It works perfect!
I would, however, like to get the actual received (or sent) date for each e-mail I'm getting an attached file from, but I don't know how to do that.
import email
import imaplib
import os
import sys
import random
import string
import glob
import unicodedata
def remove_accents(s):
    """Return *s* with combining marks (accents) removed.

    The text is first normalized to NFKD, which splits an accented character
    into its base character followed by combining marks; every character that
    ``unicodedata.combining`` reports as a combining mark is then dropped.
    """
    nkfd_form = unicodedata.normalize('NFKD', s)
    return ''.join(c for c in nkfd_form if not unicodedata.combining(c))
# Convert cp865-encoded input to text and pass it through unidecode().
# NOTE(review): neither `unidecode` nor `unicode` is imported or defined in the
# visible snippet; `unicode` is a Python 2 builtin (TODO confirm the runtime).
def remove_non_ascii(text):
return unidecode(unicode(text, encoding = "cp865"))
def replace_non_ascii(x):
    """Return *x* with every character of code point 128 or above replaced by ``_``."""
    pieces = []
    for ch in x:
        pieces.append('_' if ord(ch) >= 128 else ch)
    return ''.join(pieces)
# Script from the question: download attachments of unread messages from a
# given sender on Office 365 into `detach_dir`, saving each as 'EXP-<name>.xls'.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
detach_dir = r'\\1.1.1.1\xxx\xxx\drop_folder'
try:
imapSession = imaplib.IMAP4_SSL('outlook.office365.com')
typ, accountDetails = imapSession.login('xxxx', 'xxxx')
if typ != 'OK':
print ('Not able to sign in!')
# NOTE(review): a bare `raise` with no active exception; whatever it raises is
# caught by the bare `except` at the bottom, which prints a generic message.
raise
imapSession.select('Inbox')
typ, data = imapSession.search(None, '(UNSEEN FROM "#xxxx.xxx")')
if typ != 'OK':
print ('Error searching Inbox.')
raise
# Iterating over all emails
for msgId in data[0].split():
# Only the raw message (RFC822) is requested, so no date item is returned.
typ, messageParts = imapSession.fetch(msgId, '(RFC822)')
if typ != 'OK':
print ('Error fetching mail.')
raise
emailBody = messageParts[0][1]
# NOTE(review): other snippets on this page decode the fetched bytes before
# calling message_from_string; confirm what type emailBody has here.
mail = email.message_from_string(emailBody)
for part in mail.walk():
if part.get_content_maintype() == 'multipart':
# print part.as_string()
continue
if part.get('Content-Disposition') is None:
# print part.as_string()
continue
# NOTE(review): .encode() is called before the emptiness check below, so a
# part without a file name would fail here (TODO confirm get_filename() can
# return None).
fileName = part.get_filename().encode('utf-8')
if bool(fileName):
# NOTE(review): the encoded name is concatenated with str literals; verify
# this on the Python version in use.
filePath = os.path.join(detach_dir, 'EXP-' + fileName + '.xls')
# Existing files are never overwritten.
if not os.path.isfile(filePath) :
fp = open(filePath, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
imapSession.close()
imapSession.logout()
# NOTE(review): the bare except hides the real error behind one message.
except :
print ('Not able to download all attachments.')

You could fetch the INTERNALDATE item instead of, or in addition to, the RFC822 item; it is (generally) the time the server received the message.
You will have to do some parsing of the return item, since imaplib does no parsing of FETCH results. It will be easier to parse if it's the only thing you fetch.
The response will look something like
* 5 FETCH (INTERNALDATE "17-Jul-2018 02:44:25 -0700")

Python Download Attachment from Outlook w/ Imaplib4 Never downloads the last Attachment

The script I wrote: 1) connects to my work Outlook email, reading my username and password from a text file whose path is stored in the variable TextFilePath; 2) looks for attachments based upon a search term I choose that would be in the Subject of the email (here, it's "special_search_term_in_email"); 3) downloads the attachments to the folder given by 'DownloadFolderPath'.
The goal for this script is to run everyday and connect to my email and download 4 attachments that will be sent to me everyday. The issue is that the script will sometimes download all 4 attachments, but then sometimes will only download 3 of 4 attachments and then won't even terminate. Appreciate the help.
import email
import imaplib
import os
import datetime
import csv
# these 3 variables you need to set to run the code
mail_server = "imap-mail.outlook.com"
TextFilePath = "C:/Users/FakeName/PycharmProjects/imaplib_script/unpw.txt"
LogFilePath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/logfile.csv'
DownloadFolderPath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/'
def read_input_return_list():
    """Read the login credentials from the text file at ``TextFilePath``.

    The first line of the file is the username and the second line is the
    password. Trailing newlines are removed from both, so that a file ending
    in a newline does not leak a ``'\\n'`` into the password.

    Returns:
        A two-item list ``[username, password]``.

    Raises:
        IndexError: If the file has fewer than two lines.
    """
    # The context manager closes the file even if reading fails.
    with open(TextFilePath, "r") as credentials_file:
        lines = credentials_file.readlines()
    username = lines[0].strip('\n')
    password = lines[1].strip('\n')
    return [username, password]
# Script from the question: log in to Outlook over IMAP, find messages whose
# subject contains the search term, save each attachment into
# DownloadFolderPath and append one row per saved file to the CSV log.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
read_input_variable = read_input_return_list()
username = read_input_variable[0]
password = read_input_variable[1]
script_ran_time=datetime.datetime.today().strftime('%c')
mail = imaplib.IMAP4_SSL(mail_server)
mail.login(username, password)
print("{0} Connecting to mailbox via IMAP...".format(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")))
mail.select()
# NOTE(review): `type` shadows the builtin of the same name.
type, data = mail.search(None, '(SUBJECT "special_search_term_in_email")')
total_count = 0
with open(LogFilePath,newline='', encoding='utf-8', mode='a') as csv_file:
writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for num in data[0].split():
# `data` is rebound here; the loop iterable above was already evaluated.
type, data = mail.fetch(num, '(RFC822)')
raw_email = data[0][1]
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
for part1 in part.walk():
# NOTE(review): the next two lines inspect `part` (the container), not the
# loop variable `part1`; only the payload line uses `part1`.
c_type = part.get_content_type()
c_disp0 = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments
if c_type == 'text/plain' and 'attachment' not in c_disp0:
body = part1.get_payload(decode=True)
break
attachment = part.get_filename()
if bool(attachment):
filePath = os.path.join(DownloadFolderPath, attachment)
# If the target already exists, build an alternative name with a timestamp.
if os.path.isfile(filePath):
filename, file_extension = os.path.splitext(filePath)
FileDownloadAndWriteTime = '__' + datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
# NOTE(review): the loop body rebuilds new_fname from the same three values,
# so if that name already exists the condition never changes and the loop
# does not terminate. This matches the "won't even terminate" symptom.
while os.path.exists(new_fname):
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
filePath = new_fname
filepathopen = open(filePath, 'wb')
filepathopen.write(part.get_payload(decode=True))
# Reused here as the timestamp written to the log row.
FileDownloadAndWriteTime = datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
total_count += 1
writer.writerow([filePath,FileDownloadAndWriteTime, script_ran_time])
filepathopen.close()
print('Download file attachment name: ', attachment)
print("Total count of downloaded documents: ", total_count)
# NOTE(review): the mailbox is closed but logout() is not called in this snippet.
mail.close()

I can't pinpoint what's wrong but try adopting this code here: https://gist.github.com/cdunklau/9001357
It worked for me.
I updated the find_attachments method like this:
def find_attachments(message):
    """
    Return a flat list of Content-Disposition parameter values (in practice
    the file names) for every attachment part found in ``message``.

    Parts without a Content-Disposition header, or whose disposition is not
    ``attachment``, are ignored. Values are returned exactly as they appear
    in the header with surrounding quotes removed; RFC 2047 encoded-words
    (``=?UTF-8?B?...?=``) are NOT decoded here.
    """
    found = []
    for part in message.walk():
        if 'content-disposition' not in part:
            continue
        # First piece is the disposition type, the rest are key=value params.
        # Quoted values that themselves contain ';' are not supported.
        disposition, *params = (
            piece.strip()
            for piece in str(part['content-disposition']).split(';')
        )
        if disposition.lower() != 'attachment':
            continue
        parsed = {}
        for param in params:
            # Split on the first '=' only: base64 encoded-word file names end
            # in '=' padding, which must stay part of the value.
            key, sep, val = param.partition('=')
            if not sep:
                # Empty piece from a trailing ';' or a bare token: skip it
                # instead of overwriting the real filename.
                continue
            val = val.strip()
            if val[:1] in ('"', "'"):
                val = val.strip(val[0])
            parsed[key.strip()] = val
        found.extend(parsed.values())
    return found

Retrieve Email Body Text Using imaplib

I'm trying to ensure that I retrieve all of the body text (no matter what format the email is in) from a certain email address. The connection details are omitted in this example (imaplib_connect) since all seems to work, but I don't think the below is sufficient for all email bodies. Any improvement recommendations? I'm writing each email's body to a file.
import imaplib
import imaplib_connect
import uuid
import ConfigParser
import os
import email
# Script from the question: for every message from a configured sender, write
# the text body to '<random hex>.txt'.
# The `print` statement below is Python 2 syntax.
# (Indentation was lost when the snippet was pasted; the nesting is implied.)
c = imaplib_connect.open_connection()
try:
config = ConfigParser.ConfigParser()
config.read([os.path.expanduser('~/reader.config')])
fromAddress = config.get('account', 'fromAddress')
typ, data = c.select('INBOX')
typ, data = c.search(None, '(FROM "' + fromAddress + '")')
print "Processing..."
for num in data[0].split():
typ, data = c.fetch(num, '(RFC822)')
rawMessage = data[0][1]
emailMessage = email.message_from_string(rawMessage)
maintype = emailMessage.get_content_maintype()
# One random file name is generated per message.
fileName = uuid.uuid4().hex
if maintype == 'multipart':
# NOTE(review): only the direct children of the message are examined;
# text parts nested inside an inner multipart are not visited.
for part in emailMessage.get_payload():
if part.get_content_maintype() == 'text':
# NOTE(review): every text part of a message is written to the same file
# name in "wb" mode, so a later part replaces an earlier one.
with open(fileName + ".txt", "wb") as fo:
# NOTE(review): get_payload() is called without decode=True here, unlike
# the other snippets on this page; confirm the raw payload is what is wanted.
fo.write(part.get_payload())
elif maintype == 'text':
with open(fileName + ".txt", "wb") as fo:
# NOTE(review): `part` is only bound by the multipart loop above; in this
# branch the message itself (emailMessage) is presumably intended.
fo.write(part.get_payload())
finally:
# Best-effort close: any error from close() is ignored so logout() still runs.
try:
c.close()
except:
pass
c.logout()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Error while decoding UTF-8 encoded csv file - python

Related

How to extract attachments from base64 encoded files?

Facing issues in downloading a particular attachment from gmail through Python

Get e-mail recieved date of IMAP using Python

Python Download Attachment from Outlook w/ Imaplib4 Never downloads the last Attachment

Retrieve Email Body Text Using imaplib

Categories

Resources