How to extract attachments from base64 encoded files? - python

i am trying to make a script which whill extract attachments from base64 encoded files.
base64 files are on amazon s3. Idea was, to take the data from file on s3, save it in temp file and pass it to the 'extraction' part of the code.
Script below perfectly works with a single file which is stored locally on my pc.
from email.message import EmailMessage
from email.header import decode_header
with open('mail2', 'r') as efile: # mail2 - это название файла который на s3 лежит
msg = EmailMessage.message_from_file(efile)
subject, encoding = decode_header(msg["Subject"])[0]
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
except:
pass
if "attachment" in content_disposition:
# download attachment
filename = part.get_filename()
if filename:
#filepath = os.path.join(folder_name, filename)
# download attachment and save it
open(filename, "wb").write(part.get_payload(decode=True))
else:
pass
In order to avoid constant uploadings from s3, i decided to get the body of files and transfer them to temp file using boto3.
my_bucket = s3.Bucket('bucket_name')
substring = "reestrs"
for obj in my_bucket.objects.all():
#получение данных
if re.search(substring, obj.key):
raw_data = obj.get()['Body'].read()
temp = tempfile.NamedTemporaryFile()
temp.write(raw_data)
print(temp)
with open(temp.name, 'r') as efile:
msg = email.message_from_file(temp.name)
subject, encoding = decode_header(msg["Subject"])[0]
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
except:
pass
if "attachment" in content_disposition:
# download attachment
filename = part.get_filename()
if filename:
#filepath = os.path.join(folder_name, filename)
# download attachment and save it
open(filename, "wb").write(part.get_payload(decode=True))
temp.close()
else:
pass
Whenever i launch this script, result is
File "rum_fin.py", line 42, in <module>
msg = email.message_from_file(temp.name)
File "/usr/lib/python3.8/email/__init__.py", line 54, in message_from_file
return Parser(*args, **kws).parse(fp)
File "/usr/lib/python3.8/email/parser.py", line 53, in parse
data = fp.read(8192)
AttributeError: 'str' object has no attribute 'read'
I've tested scipt above with a single file. Unfortunately, result is the attachment with base64 name insted of normal csv/png/txt
obj = s3.Object('bucket', 'filename')
raw_data = obj.get()['Body'].read()
temp = tempfile.NamedTemporaryFile()
temp.write(raw_data)
print(temp)
# In[7]:
with open(temp.name, 'r') as efile: # mail2 - это название файла который на s3 лежит
msg = email.message_from_file(efile)
subject, encoding = decode_header(msg["Subject"])[0]
print(subject)
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print(From)
I suppose, it's something to do with the way of passing the data from actual file to the temp file. Any ideas how to solve it?

Related

How do I implement the imaplib search function?

import imaplib
import email
from email.header import decode_header
import webbrowser
import os
# account credentials
username = "example#stack.com"
password = "exapleforstack"
imap_server = "imap.one.com"
def clean(text):
# clean text for creating a folder
return "".join(c if c.isalnum() else "_" for c in text)
# create an IMAP4 class with SSL
imap = imaplib.IMAP4_SSL(imap_server)
# authenticate
imap.login(username, password)
status, messages = imap.select("INBOX")
imap.search(None, 'SUBJECT', '"exampleforstack"')
# number of top emails to fetch
N = 3
# total number of emails
messages = int(messages[0])
for i in range(messages, messages-N, -1):
# fetch the email message by ID
res, msg = imap.fetch(str(i), "(RFC822)")
for response in msg:
if isinstance(response, tuple):
# parse a bytes email into a message object
msg = email.message_from_bytes(response[1])
# decode the email subject
subject, encoding = decode_header(msg["Subject"])[0]
if isinstance(subject, bytes):
# if it's a bytes, decode to str
subject = subject.decode(encoding)
# decode email sender
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print("Subject:", subject)
print("From:", From)
# if the email message is multipart
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
except:
pass
if content_type == "text/plain" and "attachment" not in content_disposition:
# print text/plain emails and skip attachments
print(body)
elif "attachment" in content_disposition:
# download attachment
filename = part.get_filename()
if filename:
folder_name = clean(subject)
if not os.path.isdir(folder_name):
# make a folder for this email (named after the subject)
os.mkdir(folder_name)
filepath = os.path.join(folder_name, filename)
# download attachment and save it
open(filepath, "wb").write(part.get_payload(decode=True))
else:
# extract content type of email
content_type = msg.get_content_type()
# get the email body
body = msg.get_payload(decode=True).decode()
if content_type == "text/plain":
# print only text email parts
print(body)
if content_type == "text/html":
# if it's HTML, create a new HTML file and open it in browser
folder_name = clean(subject)
if not os.path.isdir(folder_name):
# make a folder for this email (named after the subject)
os.mkdir(folder_name)
filename = "index.html"
filepath = os.path.join(folder_name, filename)
# write the file
open(filepath, "w").write(body)
# open in the default browser
webbrowser.open(filepath)
print("="*100)
# close the connection and logout
imap.close()
imap.logout()
I tried using the search method a couple different ways, like:
res, msg = imap.search(None, 'SUBJECT', "example")
and
res, msg = imap.search(None, 'SUBJECT, "example"')
but my code just gives an error and automatically fetches the most recent 3.
I've tried replacing the line:
res, msg = imap.fetch(str(i), "(RFC822)")
with
res, msg = imap.search(None, 'SUBJECT', '"example"')
but the program returns nothing at all.
how would I go about implementing the search I got this code from pythoncode and altered its credentials.
But I'm not sure why I can't implement the search function.

How to get specific details from email using python beautiful soup and extract to excel

i am trying to get specific data from my email such as check in & checkout date from my email, but i not sure what can i do to get the specific text from the email.
For the code below, it can search for specific hotel booking number and write in html file.
For now only not sure how can i read from those html file and get the specific details and extract to excel
my_mail = imaplib.IMAP4_SSL("imap.gmail.com","993")
my_mail.login(username, password)
my_mail.select('Inbox')
key = 'SUBJECT'
value = "721182693"
_, data = my_mail.search(None, key, value)
mail_id_list = data[0].split()
msgs = []
for num in mail_id_list:
_, data = my_mail.fetch(num, '(RFC822)') #RFC822 returns whole message (BODY fetches just body)
msgs.append(data)
def clean(text):
return "".join(c if c.isalnum() else "_" for c in text)
soup = BeautifulSoup(data)
for msg in msgs[::-1]:
for response in msg:
if isinstance(response, tuple):
msg = email.message_from_bytes(response[1])
subject, encoding = decode_header(msg["Subject"])[0]
if isinstance(subject, bytes):
subject = subject.decode(encoding="utf-8")
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print("Subject:", subject)
print("From:", From)
if msg.is_multipart():
for part in msg.walk():
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
body = part.get_payload(decode=True).decode()
except:
pass
if content_type == "text/plain" and "attachment" not in content_disposition:
print(body)
elif "attachment" in content_disposition:
filename = part.get_filename()
if filename:
folder_name = clean(subject)
if not os.path.isdir(folder_name):
os.mkdir(folder_name)
filename = "index.html"
filepath = os.path.join(folder_name, filename)
open(filepath, "w",encoding="utf-8").write(body)
webbrowser.open(filepath)
else:
content_type = msg.get_content_type()
body = msg.get_payload(decode=True).decode()
if content_type == "text/plain":
print(body)
if content_type == "text/html":
folder_name = clean(subject)
if not os.path.isdir(folder_name):
os.mkdir(folder_name)
filename = "index.html"
filepath = os.path.join(folder_name, filename)
open(filepath, "w",encoding="utf-8").write(body)
webbrowser.open(filepath)
print("="*100)`

Fetch a particular unseen email and print only those emails

To elaborate the question I have a code that searches for UNSEEN emails and stores the ID to a variable.
status, messages = mail.select('Inbox')
messages = int(messages[0])
_, new_mails = mail.search(None, '(UNSEEN)')
recent_mails = len(new_mails[0].split())
print("Total Messages that is New:" , recent_mails)
print(new_mails)
and it prints this:
Total Messages that is New: 2
[b'389 393']
What I want to do is the use these numbers to fetch it's contents like subject, who sent it and the body of the email. Is it possible to this implementation?
I have a previous code that fetches the first and beyond emails by indicating on how emails it will go to using the variable
N code follows:
N = 0
for i in range(messages, messages-N, -1):
# fetch the email message by ID
res, msg = mail.fetch(str(i), "(RFC822)")
for response in msg:
if isinstance(response, tuple):
# parse a bytes email into a message object
msg = email.message_from_bytes(response[1])
# decode the email subject
pre_subject, encoding = decode_header(msg["Subject"])[0]
subject = pre_subject.upper()
if isinstance(subject, bytes):
# if it's a bytes, decode to str
subject = subject.decode(encoding)
# decode email sender
From, encoding = decode_header(msg.get("From"))[0]
if isinstance(From, bytes):
From = From.decode(encoding)
print("Subject:", pre_subject)
print("From:", From)
# if the email message is multipart
if msg.is_multipart():
# iterate over email parts
for part in msg.walk():
# extract content type of email
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
try:
# get the email body
body = part.get_payload(decode=True).decode()
except:
pass
if content_type == "text/plain" and "attachment" not in content_disposition:
# print text/plain emails and skip attachments
print(body)
plain = body
elif "attachment" in content_disposition:
# download attachment
filename = part.get_filename()
if filename:
folder_name = clean(subject)
if not os.path.isdir(folder_name):
# make a folder for this email (named after the subject)
os.mkdir(folder_name)
filepath = os.path.join(folder_name, filename)
# download attachment and save it
open(filepath, "wb").write(part.get_payload(decode=True))
else:
# extract content type of email
content_type = msg.get_content_type()
# get the email body
body = msg.get_payload(decode=True).decode()
if content_type == "text/plain":
# print only text email parts
print(body)
plain = body
print("="*100)
Finally for hours of experimenting I successfully implemented it.
Basically the ID I fetched on UNSEEN emails is converted to string from byte and then pass those number lists to the loop
gmail_host = 'imap.gmail.com'
mail = imaplib.IMAP4_SSL(gmail_host)
mail.login(EMAIL_ADDRESS,EMAIL_PASSWORD)
mail.list()
status, messages = mail.select('Inbox')
messages = int(messages[0])
_, raw_mails = mail.search(None, '(UNSEEN)')
recent_mails = len(raw_mails[0].split())
splited = str(raw_mails[0], 'utf-8')
new_emails = splited.split()
print("Total Messages that is New:" , recent_mails)
print(new_emails)
Output:
['378', '390']
And changed my for loop to this
for i in new_emails:

Python Download Attachment from Outlook w/ Imaplib4 Never downloads the last Attachment

The script I wrote:1) connects to my work Outlook email. The script reads my username and password from a text file which is found in the variable TextFilePath. 2) Looks for attachments based upon a searchterm I choose that would be in the Subject of the email (here, it's "special_search_term_in_email"). 3) Downloads the attachments to a specific folder titled 'DownloadFolderPath'.
The goal for this script is to run everyday and connect to my email and download 4 attachments that will be sent to me everyday. The issue is that the script will sometimes download all 4 attachments, but then sometimes will only download 3 of 4 attachments and then won't even terminate. Appreciate the help.
import email
import imaplib
import os
import datetime
import csv
# these 3 variables you need to set to run the code
mail_server = "imap-mail.outlook.com"
TextFilePath = "C:/Users/FakeName/PycharmProjects/imaplib_script/unpw.txt"
LogFilePath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/logfile.csv'
DownloadFolderPath = 'C:/Users/FakeName/PycharmProjects/imaplib_script/downloaded_attachments/'
# below read_input_return_list function reads input from a text file and returns a list
def read_input_return_list():
textunpw = open(TextFilePath, "r")
lines = textunpw.readlines()
username = lines[0].strip('\n')
password = lines[1]
textunpw.close()
return [username, password]
read_input_variable = read_input_return_list()
username = read_input_variable[0]
password = read_input_variable[1]
script_ran_time=datetime.datetime.today().strftime('%c')
mail = imaplib.IMAP4_SSL(mail_server)
mail.login(username, password)
print("{0} Connecting to mailbox via IMAP...".format(datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")))
mail.select()
type, data = mail.search(None, '(SUBJECT "special_search_term_in_email")')
total_count = 0
with open(LogFilePath,newline='', encoding='utf-8', mode='a') as csv_file:
writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for num in data[0].split():
type, data = mail.fetch(num, '(RFC822)')
raw_email = data[0][1]
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
for part in email_message.walk():
if part.get_content_maintype() == 'multipart':
for part1 in part.walk():
c_type = part.get_content_type()
c_disp0 = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments
if c_type == 'text/plain' and 'attachment' not in c_disp0:
body = part1.get_payload(decode=True)
break
attachment = part.get_filename()
if bool(attachment):
filePath = os.path.join(DownloadFolderPath, attachment)
if os.path.isfile(filePath):
filename, file_extension = os.path.splitext(filePath)
FileDownloadAndWriteTime = '__' + datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
while os.path.exists(new_fname):
new_fname = "{}_{}{}".format(filename, FileDownloadAndWriteTime, file_extension)
filePath = new_fname
filepathopen = open(filePath, 'wb')
filepathopen.write(part.get_payload(decode=True))
FileDownloadAndWriteTime = datetime.datetime.today().strftime('%m_%d_%Y %H_%M_%S')
total_count += 1
writer.writerow([filePath,FileDownloadAndWriteTime, script_ran_time])
filepathopen.close()
print('Download file attachment name: ', attachment)
print("Total count of downloaded documents: ", total_count)
mail.close()
I can't pinpoint what's wrong but try adopting this code here: https://gist.github.com/cdunklau/9001357
It worked for me.
I updated the find_attachments method like this:
def find_attachments(message):
"""
Return a tuple of parsed content-disposition dict, message object
for each attachment found.
"""
found = []
for part in message.walk():
if 'content-disposition' not in part:
continue
cdisp = part['content-disposition'].split(';')
cdisp = [x.strip() for x in cdisp]
if cdisp[0].lower() != 'attachment':
continue
parsed = {}
for kv in cdisp[1:]:
try:
key, val = kv.split('=')
if val.startswith('"'):
val = val.strip('"')
elif val.startswith("'"):
val = val.strip("'")
parsed[key] = val
except Exception as e:
parsed['filename']=kv.replace('filename=','')
found+=list(parsed.values())
return found

adding to an email a zip file

I have the code below that works perfectly fine when I use it to send a txt file, an image or audio. However, it doesn't work when I try to send zip files, rar files, or any other file that doesn't have its own MIME (which is not related to MIMEText, MIMEImage or MIMEAudio).
In conclusion, whenever I reach the else part (the MIMEBase command) I do something wrong and get the error:
e.send_mail(TARGET, SUBJECT, "file.zip")
msg.attach(part) //two lines after the else's end
AttributeError: 'str' object has no attribute 'append'
the code:
def send_mail(self, target, subject, *file_names):
"""
send a mail with files to the target
#param target: send the mail to the target
#param subject: mail's subject
#param file_names= list of files to send
"""
msg = email.MIMEMultipart.MIMEMultipart()
msg['From'] = self.mail
msg['To'] = email.Utils.COMMASPACE.join(target)
msg['Subject'] = subject
for file_name in file_names:
f = open(file_name, 'rb')
ctype, encoding = mimetypes.guess_type(file_name)
if ctype is None or encoding is not None:
ctype = 'application/octet-stream'
maintype, subtype = ctype.split('/', 1)
# in case of a text file
if maintype == 'text':
part = MIMEText(f.read(), _subtype=subtype)
# in case of an image file
elif maintype == 'image':
part = MIMEImage(f.read(), _subtype=subtype)
# in case of an audio file
elif maintype == 'audio':
part = MIMEAudio(f.read(), _subtype=subtype)
# any other file
else:
part = MIMEBase(maintype, subtype)
msg.set_payload(f.read())
part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(file_name))
msg.attach(part)
f.close()
# ssl server doesn't support or need tls, so don't call server_ssl.starttls()
self.server_ssl.sendmail(self.mail, target, msg.as_string())
#server_ssl.quit()
self.server_ssl.close()
I have seen similar codes but I don't understand what is wrong with mine.
could you please explain me what I am messing up?
thank you!
if it helps anyone here is the answer:
the main problem was that I changed the msg payload instead of the zip file's
def send_mail(self, target, subject, body, *file_names):
"""
send a mail with files to the target
#param target: send the mail to the target
#param subject: mail's subject
#param file_names= list of files to send
"""
msg = MIMEMultipart()
msg['From'] = self.mail
msg['To'] = target
msg['Subject'] = subject
body_part = MIMEText(body, 'plain')
msg.attach(body_part)
for file_name in file_names:
f = open(file_name, 'rb')
ctype, encoding = mimetypes.guess_type(file_name)
if ctype is None or encoding is not None:
ctype = 'application/octet-stream'
maintype, subtype = ctype.split('/', 1)
# in case of a text file
if maintype == 'text':
part = MIMEText(f.read(), _subtype=subtype)
# in case of an image file
elif maintype == 'image':
part = MIMEImage(f.read(), _subtype=subtype)
# in case of an audio file
elif maintype == 'audio':
part = MIMEAudio(f.read(), _subtype=subtype)
# any other file
else:
part = MIMEBase(maintype, subtype)
part.set_payload(f.read())
encoders.encode_base64(part)
part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(file_name))
msg.attach(part)
f.close()
# ssl server doesn't support or need tls, so don't call server_ssl.starttls()
self.server_ssl.sendmail(self.mail, target, msg.as_string())
self.server_ssl.quit()

Categories