I do not know how to load an .eml file in Python 3.4.
I want to list all the .eml files in a directory and read each of them in Python.
This is how you get the content of an e-mail, i.e. a *.eml file.
This works perfectly on Python 2.5 - 2.7. Try it on 3; it should work as well.
from email import message_from_file
import os
# Path to directory where attachments will be stored:
path = "./msgfiles"
# To have attachments extracted into memory, change behaviour of 2 following functions:
def file_exists (f):
"""Checks whether extracted file was extracted before."""
return os.path.exists(os.path.join(path, f))
def save_file (fn, cont):
"""Saves cont to a file fn"""
file = open(os.path.join(path, fn), "wb")
file.write(cont)
file.close()
def construct_name (id, fn):
"""Constructs a file name out of messages ID and packed file name"""
id = id.split(".")
id = id[0]+id[1]
return id+"."+fn
def disqo (s):
"""Removes double or single quotations."""
s = s.strip()
if s.startswith("'") and s.endswith("'"): return s[1:-1]
if s.startswith('"') and s.endswith('"'): return s[1:-1]
return s
def disgra (s):
"""Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
s = s.strip()
if s.startswith("<") and s.endswith(">"): return s[1:-1]
return s
def pullout (m, key):
"""Extracts content from an e-mail message.
This works for multipart and nested multipart messages too.
m -- email.Message() or mailbox.Message()
key -- Initial message ID (some string)
Returns tuple(Text, Html, Files, Parts)
Text -- All text from all parts.
Html -- All HTMLs from all parts
Files -- Dictionary mapping extracted file to message ID it belongs to.
Parts -- Number of parts in original message.
"""
Html = ""
Text = ""
Files = {}
Parts = 0
if not m.is_multipart():
if m.get_filename(): # It's an attachment
fn = m.get_filename()
cfn = construct_name(key, fn)
Files[fn] = (cfn, None)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# Not an attachment!
# See where this belongs. Text, Html or some other data:
cp = m.get_content_type()
if cp=="text/plain": Text += m.get_payload(decode=True)
elif cp=="text/html": Html += m.get_payload(decode=True)
else:
# Something else!
# Extract a message ID and a file name if there is one:
# This is some packed file and name is contained in content-type header
# instead of content-disposition header explicitly
cp = m.get("content-type")
try: id = disgra(m.get("content-id"))
except: id = None
# Find file name:
o = cp.find("name=")
if o==-1: return Text, Html, Files, 1
ox = cp.find(";", o)
if ox==-1: ox = None
o += 5; fn = cp[o:ox]
fn = disqo(fn)
cfn = construct_name(key, fn)
Files[fn] = (cfn, id)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# This IS a multipart message.
# So, we iterate over it and call pullout() recursively for each part.
y = 0
while 1:
# If we cannot get the payload, it means we hit the end:
try:
pl = m.get_payload(y)
except: break
# pl is a new Message object which goes back to pullout
t, h, f, p = pullout(pl, key)
Text += t; Html += h; Files.update(f); Parts += p
y += 1
return Text, Html, Files, Parts
def extract (msgfile, key):
"""Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
msgfile -- A file-like readable object
key -- Some ID string for that particular Message. Can be a file name or anything.
Returns dict()
Keys: from, to, subject, date, text, html, parts[, files]
Key files will be present only when message contained binary files.
For more see __doc__ for pullout() and caption() functions.
"""
m = message_from_file(msgfile)
From, To, Subject, Date = caption(m)
Text, Html, Files, Parts = pullout(m, key)
Text = Text.strip(); Html = Html.strip()
msg = {"subject": Subject, "from": From, "to": To, "date": Date,
"text": Text, "html": Html, "parts": Parts}
if Files: msg["files"] = Files
return msg
def caption (origin):
"""Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
origin -- Message() object
Returns tuple(From, To, Subject, Date)
If message doesn't contain one/more of them, the empty strings will be returned.
"""
Date = ""
if origin.has_key("date"): Date = origin["date"].strip()
From = ""
if origin.has_key("from"): From = origin["from"].strip()
To = ""
if origin.has_key("to"): To = origin["to"].strip()
Subject = ""
if origin.has_key("subject"): Subject = origin["subject"].strip()
return From, To, Subject, Date
# Usage:
f = open("message.eml", "rb")
print extract(f, f.name)
f.close()
I programmed this for my mail group using mailbox, which is why it is so convoluted.
It has never failed me, and never produced any junk. If the message is multipart, the output
dictionary will contain a key "files" (a sub-dict) with the filenames of all extracted files that were not text or HTML.
That was a way of extracting attachments and other binary data.
You may change this in pullout(), or just change the behaviour of file_exists() and save_file().
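For example, a minimal sketch of the in-memory variant (my assumption of what that change could look like, not part of the original answer):

# Hypothetical in-memory replacements for the two functions above:
# extracted attachments are kept in a dict instead of being written to disk.
extracted = {}

def file_exists(f):
    """Checks whether this attachment was already extracted (into memory)."""
    return f in extracted

def save_file(fn, cont):
    """Keeps cont (the decoded payload bytes) in memory under the name fn."""
    extracted[fn] = cont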
construct_name() constructs a filename out of the message ID and the packed file name of the multipart part, if there is one.
In pullout() the Text and Html variables are strings. For an online mail group it was OK to get all the text or HTML packed into a multipart message (that wasn't an attachment) at once.
If you need something more sophisticated, change Text and Html to lists, append to them, and join them as needed.
Nothing problematic.
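A rough sketch of that list-based change (hypothetical; it assumes joining once at the end is acceptable):

text_parts = []
text_parts.append("first plain-text part")   # instead of Text += part
text_parts.append("second plain-text part")
Text = "\n".join(text_parts)  # join once at the end, or keep the list as-is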
Maybe there are some errors here, because it is intended to work with mailbox.Message(),
not with email.Message(). I tried it on email.Message() and it worked fine.
You said you "wish to list them all". From where? If you mean a POP3 mailbox, or the mailbox of some nice open-source mailer, then you do it using the mailbox module; a sketch follows below.
If you want to list them from other sources, then you have a problem.
For example, to get mails out of MS Outlook, you have to know how to read OLE2 compound files.
Other mailers rarely store messages as *.eml files, so I suspect that is exactly what you are trying to do.
Then search PyPI for the olefile or compoundfiles module and Google around for how to extract an e-mail from an MS Outlook inbox file.
Or save yourself the mess and just export the messages from there to some directory. Once you have them as .eml files, apply this code.
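For the mailbox case, a hedged sketch that iterates a Unix mbox with the standard mailbox module and reuses pullout() from above ("inbox.mbox" is a placeholder path):

import mailbox

mbox = mailbox.mbox("inbox.mbox")
for key, message in mbox.items():
    # construct_name() expects a dotted key, so fake an extension here:
    text, html, files, parts = pullout(message, "{0}.mbox".format(key))
    print(key, parts, list(files))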
I found this code much simpler:
import email
import os

path = './'
listing = os.listdir(path)
for fle in listing:
    if fle.lower().endswith(".eml"):
        with open(fle) as f:
            msg = email.message_from_file(f)
        attachments = msg.get_payload()
        for attachment in attachments:
            try:
                fnam = attachment.get_filename()
                # open(...).write(...) returned an int, so the old f.close() failed silently;
                # a with-block writes and closes the file properly
                with open(fnam, 'wb') as out:
                    out.write(attachment.get_payload(decode=True))
            except Exception as detail:
                # print(detail)
                pass
Posting this here for anyone looking to just extract text from an email and get a list of .eml files; it took me forever to find a good answer to this online. NOTE: this will not get the attachments, just the text of the email.
from email import policy
from email.parser import BytesParser
import glob
import os

path = '/path/to/data/' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp: # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    # preferencelist must be a tuple; ('plain') would just be the string 'plain'
    text = msg.get_body(preferencelist=('plain',)).get_content()
    text = text.split("\n")
    print (name) # Get name of eml file
    print (text) # Get list of all text in email
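If you also need the attachments, a message parsed with policy.default exposes iter_attachments(); a hedged sketch continuing from the msg variable above:

for part in msg.iter_attachments():
    filename = part.get_filename()
    if filename:  # skip inline parts that carry no file name
        with open(filename, 'wb') as out:
            out.write(part.get_payload(decode=True))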
Credit to some of the code from this post: Reading .eml files with Python 3.6 using emaildata 0.3.4
Python 3 version of Dalen's answer. Basically syntax fixes. (I can't comment due to lack of reputation; it is also clearer as an answer.)
import email
import os

# Path to directory where attachments will be stored:
path = "./msgfiles"

# To have attachments extracted into memory, change behaviour of 2 following functions:
def file_exists (f):
"""Checks whether extracted file was extracted before."""
return os.path.exists(os.path.join(path, f))
def save_file (fn, cont):
"""Saves cont to a file fn"""
file = open(os.path.join(path, fn), "wb")
file.write(cont)
file.close()
def construct_name (id, fn):
"""Constructs a file name out of messages ID and packed file name"""
id = id.split(".")
id = id[0]+id[1]
return id+"."+fn
def disqo (s):
"""Removes double or single quotations."""
s = s.strip()
if s.startswith("'") and s.endswith("'"): return s[1:-1]
if s.startswith('"') and s.endswith('"'): return s[1:-1]
return s
def disgra (s):
"""Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
s = s.strip()
if s.startswith("<") and s.endswith(">"): return s[1:-1]
return s
def pullout (m, key):
"""Extracts content from an e-mail message.
This works for multipart and nested multipart messages too.
m -- email.Message() or mailbox.Message()
key -- Initial message ID (some string)
Returns tuple(Text, Html, Files, Parts)
Text -- All text from all parts.
Html -- All HTMLs from all parts
Files -- Dictionary mapping extracted file to message ID it belongs to.
Parts -- Number of parts in original message.
"""
Html = ""
Text = ""
Files = {}
Parts = 0
if not m.is_multipart():
if m.get_filename(): # It's an attachment
fn = m.get_filename()
cfn = construct_name(key, fn)
Files[fn] = (cfn, None)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# Not an attachment!
# See where this belongs. Text, Html or some other data:
cp = m.get_content_type()
if cp=="text/plain":
Text += str(m.get_payload(decode=True))
elif cp=="text/html":
Html += str(m.get_payload(decode=True))
else:
# Something else!
# Extract a message ID and a file name if there is one:
# This is some packed file and name is contained in content-type header
# instead of content-disposition header explicitly
cp = m.get("content-type")
try: id = disgra(m.get("content-id"))
except: id = None
# Find file name:
o = cp.find("name=")
if o==-1: return Text, Html, Files, 1
ox = cp.find(";", o)
if ox==-1: ox = None
o += 5; fn = cp[o:ox]
fn = disqo(fn)
cfn = construct_name(key, fn)
Files[fn] = (cfn, id)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# This IS a multipart message.
# So, we iterate over it and call pullout() recursively for each part.
y = 0
while 1:
# If we cannot get the payload, it means we hit the end:
try:
pl = m.get_payload(y)
except: break
# pl is a new Message object which goes back to pullout
t, h, f, p = pullout(pl, key)
Text += t; Html += h; Files.update(f); Parts += p
y += 1
return Text, Html, Files, Parts
def extract (msgfile, key):
"""Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
msgfile -- A file-like readable object
key -- Some ID string for that particular Message. Can be a file name or anything.
Returns dict()
Keys: from, to, subject, date, text, html, parts[, files]
Key files will be present only when message contained binary files.
For more see __doc__ for pullout() and caption() functions.
"""
m = email.message_from_file(msgfile)
From, To, Subject, Date = caption(m)
Text, Html, Files, Parts = pullout(m, key)
Text = Text.strip(); Html = Html.strip()
msg = {"subject": Subject, "from": From, "to": To, "date": Date,
"text": Text, "html": Html, "parts": Parts}
if Files: msg["files"] = Files
return msg
def caption (origin):
"""Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
origin -- Message() object
Returns tuple(From, To, Subject, Date)
If message doesn't contain one/more of them, the empty strings will be returned.
"""
Date = ""
if origin.__contains__("date"): Date = origin["date"].strip()
From = ""
if origin.__contains__("from"): From = origin["from"].strip()
To = ""
if origin.__contains__("to"): To = origin["to"].strip()
Subject = ""
if origin.__contains__("subject"): Subject = origin["subject"].strip()
return From, To, Subject, Date
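The Python 3 version drops the usage example from the original answer; a minimal equivalent sketch (assuming a message.eml file next to the script):

if __name__ == "__main__":
    with open("message.eml") as f:
        print(extract(f, f.name))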
Try this:
#!python3
# -*- coding: utf-8 -*-
import email
import os
SOURCE_DIR = 'email'
DEST_DIR = 'temp'
def extractattachements(fle, suffix=None):
    # parse the message and close the file handle properly
    with open(fle) as f:
        message = email.message_from_file(f)
    filenames = []
if message.get_content_maintype() == 'multipart':
for part in message.walk():
if part.get_content_maintype() == 'multipart': continue
#if part.get('Content-Disposition') is None: continue
if part.get('Content-Type').find('application/octet-stream') == -1: continue
filename = part.get_filename()
if suffix:
filename = ''.join( [filename.split('.')[0], '_', suffix, '.', filename.split('.')[1]])
filename = os.path.join(DEST_DIR, filename)
fb = open(filename,'wb')
fb.write(part.get_payload(decode=True))
fb.close()
filenames.append(filename)
return filenames
def main():
onlyfiles = [f for f in os.listdir(SOURCE_DIR) if os.path.isfile(os.path.join(SOURCE_DIR, f))]
for file in onlyfiles:
#print path.join(SOURCE_DIR,file)
extractattachements(os.path.join(SOURCE_DIR,file))
return True
if __name__ == "__main__":
main()
Here I am simplifying things for you so that you get cleaner data to process.
Broadly, an .eml file consists of two parts: 1) the headers and 2) the content/body.
(Note: this will discard any attachments if they are present.)
I have also removed http/https links from the .eml body, but I will tell you what to do if you want to keep them.
1) Headers:
I used eml-parser to get the header information. You can install it using:
pip install eml-parser
See their documentation for more on how to get the headers:
https://pypi.org/project/eml-parser/
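A short sketch based on the eml-parser README (EmlParser and decode_email_bytes reflect my reading of their docs and may differ between versions; sample.eml is a placeholder):

import eml_parser

with open('sample.eml', 'rb') as fhdl:
    raw_email = fhdl.read()

ep = eml_parser.EmlParser()
parsed_eml = ep.decode_email_bytes(raw_email)
# header fields such as subject, from, to and date live under the 'header' key:
print(parsed_eml['header']['subject'])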
2) Content/Body: here I modified some older scripts to get the best output:
from email import policy
from email.parser import BytesParser
import glob
import os

path = './' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp: # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    # preferencelist must be a tuple; ('plain') would just be the string 'plain'
    text = msg.get_body(preferencelist=('plain',)).get_content()
    print (name) # Get name of eml file
    # print (text) # Get list of all text in email
This part of the code was already available in many places, and I take no credit for it.
I have added a few conditions to print out the body more cleanly; these lines of code are mine, and you can give me credit for them:
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
if(text[i]=='<'): //to remove hyperlinks
urlFlag = 1
continue
if(text[i]=='>'): //to remove hyperlinks
urlFlag = 0
continue
if(urlFlag==0): //to remove hyperlinks
newText = newText+text[i]
print(newText)
This will strip out escaped whitespace sequences such as \t, \r and \n (a backslash and the character after it are skipped), along with the hyperlink text between < and >.
Moreover, if you want to keep the links (the http/https links present in your .eml file), just remove the three hyperlink conditions, and the new code will look like:
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
newText = newText+text[i]
print(newText)
Final code (with links removed):
from email import policy
from email.parser import BytesParser
import glob
import os

path = './' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
    with open(eml_file, 'rb') as fp: # select a specific email file from the list
        name = fp.name # Get file name
        msg = BytesParser(policy=policy.default).parse(fp)
    # preferencelist must be a tuple; ('plain') would just be the string 'plain'
    text = msg.get_body(preferencelist=('plain',)).get_content()
    print (name) # Get name of eml file
    # print (text) # Get list of all text in email
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
if(text[i]=='<'):
urlFlag = 1
continue
if(text[i]=='>'):
urlFlag = 0
continue
if(urlFlag==0):
newText = newText+text[i]
print(newText)
This is my first answer on StackOverflow; I hope it helps you!
My Python version is 3.8.10.
Related
I have PDF files in the same folder. How do I get all the PDF file names and save an Excel file named after each PDF file?
This is what I have tried:
def get_files(pdf_path):
import os
os.chdir(pdf_path)
files = os.listdir()
files = [x for x in files if x.endswith(".pdf")]
return files
files = get_files(pdf_path)
for i in files:
save_as_excel(pdf_path, i)
As discussed in chat, this is the continuation of your previous question, which I answered. In the previous question I explained how you can extract text from a pdf file containing multiple data entities. Now you want to extract the text, parse the content, and save the data as csv/xlsx for all pdf files present in the folder.
Please go through all the steps below. All you need to change is the path to your directory of pdf files, path_of_pdf_files.
The assumptions and logic remain the same as in my previous answer.
I have moved the data and methods into a class, PdfExtractor.
Please follow the steps below to extract text from the pdfs and save it as xlsx.
Before moving ahead, install the packages pdfplumber and xlsxwriter.
Save the below code with the filename PdfExtractor.py:
import pdfplumber
import xlsxwriter
import re
# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
'Our Ref' : r'Our Ref :(.*?)Name',
'Name' : r'Name:(.*?)Ref 1',
'Ref 1' : r'Ref 1 :(.*?)Ref 2',
'Ref 2' : r'Ref 2:(.*?)$'
}
# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
'Amount' : r'Amount:(.*?)Total Paid',
'Total Paid' : r'Total Paid:(.*?)Balance',
'Balance' : r'Balance:(.*?)Date of A/C',
'Date of A/C' : r'Date of A/C:(.*?)Date Received',
'Date Received' : r'Date Received:(.*?)$'
}
# regex pattern for keys in line3 of data entity
my_regex_dict_line3 ={
'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
'A/C Status': r'A/C\s+Status:(.*?)Collector',
'Collector' : r'Collector :(.*?)$'
}
class PdfExtractor:
data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'
def __init__(self, pdf_path):
self.pdf_path = pdf_path
self.json_data = {}
self.pdf_text = ''
def __preprocess_data(self, data):
return [el.strip() for el in data.splitlines() if el.strip()]
def __get_header_data(self, text):
header_data_list = self.__preprocess_data(text)
# third line in text of header contains Date Created field
self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
# fourth line in text contains Number of Pages, Client Code, Client Name
self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
# fifth line in text contains Client Code and ClientName
self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()
def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
''' For the given pattern of regex_dict, this function iterates through each regex pattern and adds the key value to regex_dict dictionary '''
for key, regex in regex_dict.items():
matched_value = re.search(regex, text)
if matched_value is not None:
data_dict[key] = matched_value.group(1).strip()
def __populate_date_notes(self, data_dict, text):
''' This function populates date and Notes in the data chunk in the form of list to data_dict dictionary '''
data_dict['Date'] = []
data_dict['Notes'] = []
iter = 4
while(iter < len(text)):
date_match = re.search(r'(\d{2}/\d{2}/\d{4})',text[iter])
data_dict['Date'].append(date_match.group(1).strip())
notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$',text[iter])
data_dict['Notes'].append(notes_match.group(1).strip())
iter += 1
def get_pdf_text(self):
data_index = 1
with pdfplumber.open(self.pdf_path) as pdf:
index = 0
while(index < len(pdf.pages)):
page = pdf.pages[index]
self.pdf_text += '\n' + page.extract_text()
index += 1
split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
# first data in the split_on_data_entity list will contain the header information
self.__get_header_data(split_on_data_entity[0])
while(data_index < len(split_on_data_entity)):
data_entity = {}
data_processed = self.__preprocess_data(split_on_data_entity[data_index])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
if(len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
self.__populate_date_notes(data_entity, data_processed)
self.json_data['data_entity' + str(data_index)] = data_entity
data_index += 1
return self.json_data
def save_as_xlsx(self, file_name):
if(not self.json_data):
print("Data was not read from PDF")
return
workbook = xlsxwriter.Workbook(file_name)
worksheet = workbook.add_worksheet("Sheet 1")
row = 0
col = 0
# write column
columns = ['Account History Report', 'All Notes'] + [ key for key in self.json_data.keys() if 'data_entity' not in key ] + list(self.json_data['data_entity1'].keys())
worksheet.write_row(row, col, tuple(columns))
row += 1
column_index_map = {}
for index, col in enumerate(columns):
column_index_map[col] = index
# write the header
worksheet.write(row, column_index_map['Date Created'], self.json_data['Date Created'])
worksheet.write(row, column_index_map['Number of Pages'], self.json_data['Number of Pages'])
worksheet.write(row, column_index_map['Client Code'], self.json_data['Client Code'])
worksheet.write(row, column_index_map['ClientName'], self.json_data['ClientName'])
data_entity_index = 1
#iterate through each data entity and for each key insert the values in the sheet
while True:
data_entity_key = 'data_entity' + str(data_entity_index)
row_size = 1
if(self.json_data.get(data_entity_key) != None):
for key, value in self.json_data.get(data_entity_key).items():
if(type(value) == list):
worksheet.write_column(row, column_index_map[key], tuple(value))
row_size = len(value)
else:
worksheet.write(row, column_index_map[key], value)
else:
break
data_entity_index += 1
row += row_size
workbook.close()
print(file_name + " saved successfully")
Execute the below code; it reads all the pdf files inside the folder path_of_pdf_files and saves the data in an xlsx file in the same directory. Also note that the below code should be executed from the same folder where you saved PdfExtractor.py:
import os
from PdfExtractor import PdfExtractor
path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp' # Directory path for your pdf files
files = os.listdir(path_of_pdf_files)
for file in files:
if(not file.endswith(".pdf")):
continue
filename = os.path.splitext(file)[0]
pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
pdf_text = pdf_obj.get_pdf_text()
pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
Output :
C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
Let's say you have the following pdf files in the directory: sample.pdf, sample2.pdf, sample3.pdf. The xlsx files will be created in the same folder with the filenames sample.xlsx, sample2.xlsx, sample3.xlsx.
Let me know if you have any doubts about the above code.
If you mean saving each filename as an empty excel file, try this :
import os
import openpyxl
pdf_path = '.'
def get_files(pdf_path):
os.chdir(pdf_path)
files = os.listdir()
files = [x for x in files if x.endswith(".pdf")]
return files
files = get_files(pdf_path)
# create an empty workbook (excel file)
wb = openpyxl.workbook.Workbook()
for i in files:
output_path = os.path.join(pdf_path, i).replace('.pdf', '.xlsx')
# save as an excel file with filename
wb.save(output_path)
print(output_path)
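If instead you want a single workbook listing every PDF name (an assumption about the goal, not something the question confirms), a sketch reusing the files list from above:

import openpyxl

wb = openpyxl.Workbook()
ws = wb.active
for name in files:
    ws.append([name])  # one PDF file name per row
wb.save('pdf_filenames.xlsx')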
I made a YouTube video download manager. It downloads a video, but I am facing one issue: when I download the same video again, it doesn't download it at all. How can I download it again under the same title, e.g. saving pic.png a second time as pic1.png? How can I do that?
def Download(self):
video_url = self.lineEdit.text()
save_location = self.lineEdit_2.text()
if video_url == '' or save_location == '':
QMessageBox.warning(self, "Data Error", "Provide a Valid Video URL or save Location")
else:
video = pafy.new(video_url)
video_stream = video.streams
video_quality = self.comboBox.currentIndex()
download = video_stream[video_quality].download(filepath=save_location, callback=self.Handel_Progress, )
Ok, this one is interesting.
The real problem begins here:
download = video_stream[video_quality].download(filepath=save_location, callback=self.Handel_Progress, )
Here, you are calling the download function of the video_stream object, which takes filepath as an argument for the file location but does not take a filename, because, obviously, the file would be saved under its actual name.
Root cause of your problem:
If you look into the definition of the download function, you will find that if a file with the same name already exists, it will not download the file at all.
Now comes the part where you make sure it downloads, no matter what. There are two things you need to do:
First, check whether a file with the same name exists, and if it does, add 1 to the end of the file name, just before the extension. So if abc.mp4 exists, save abc1.mp4.
[I will tell you how to handle the scenario where abc.mp4, abc1.mp4 and so on all exist, but for now, let's get back to the problem.]
Second, pass the new file name (abc1.mp4) to the download method.
The following piece of code handles both.
I have added comments for your understanding.
import os
import re
import pafy
from pafy.util import xenc
# this function is used by pafy to generate file name while saving,
# so im using the same function to get the file name which I will use to check
# if file exists or not
# DO NOT CHANGE IT
def generate_filename(title, extension):
    """ Generate filename. """
    max_length = 251
ok = re.compile(r'[^/]')
if os.name == "nt":
ok = re.compile(r'[^\\/:*?"<>|]')
filename = "".join(x if ok.match(x) else "_" for x in title)
if max_length:
max_length = max_length + 1 + len(extension)
if len(filename) > max_length:
filename = filename[:max_length - 3] + '...'
filename += "." + extension
return xenc(filename)
def get_file_name_for_saving(save_location, full_name):
file_path_with_name = os.path.join(save_location, full_name)
# file exists, add 1 in the end, otherwise return filename as it is
if os.path.exists(file_path_with_name):
split = file_path_with_name.split(".")
file_path_with_name = ".".join(split[:-1]) + "1." + split[-1]
return file_path_with_name
def Download(self):
video_url = self.lineEdit.text()
save_location = self.lineEdit_2.text()
if video_url == '' or save_location == '':
QMessageBox.warning(self, "Data Error", "Provide a Valid Video URL or save Location")
else:
# video file
video = pafy.new(video_url)
# available video streams
video_stream = video.streams
video_quality = self.comboBox.currentIndex()
# video title/name
video_name = video.title
# take out the extension of the file from video stream
extension = video_stream[video_quality].extension
# fullname with extension
full_name = generate_filename(video_name, extension)
final_path_with_file_name = get_file_name_for_saving(save_location, full_name)
download = video_stream[video_quality].download(filepath=final_path_with_file_name,
callback=self.Handel_Progress, )
Let me know if you face any issues.
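For the deferred scenario where abc.mp4, abc1.mp4 and so on already exist, a possible extension (my sketch, not part of the original answer) keeps incrementing a counter until an unused name is found:

import os

def get_unique_file_name(save_location, full_name):
    base, ext = os.path.splitext(full_name)
    candidate = os.path.join(save_location, full_name)
    counter = 1
    while os.path.exists(candidate):
        # abc.mp4 -> abc1.mp4 -> abc2.mp4 -> ...
        candidate = os.path.join(save_location, "{0}{1}{2}".format(base, counter, ext))
        counter += 1
    return candidate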
# coding=utf-8
# Python's regex library.
import re
# Library for filesystem paths.
import os
import csv

# function between: returns the value between two words a and b
def between(value, a, b):
    # Find and validate the before-part.
    pos_a = value.find(a)
    if pos_a == -1: return ""
    # Find and validate the after-part.
    pos_b = value.rfind(b)
    if pos_b == -1: return ""
    # Return the middle part.
    adjusted_pos_a = pos_a + len(a)
    if adjusted_pos_a >= pos_b: return ""
    return value[adjusted_pos_a:pos_b]
# function to scan the DiarioOficial folder
def scan_folder():
    # directory 'path'
    path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
    # counter of files in the path
    count = 0
    # create the csv as csvFile
    with open('All_Companies1.csv', 'a') as csvFile:
        # iterate over all paths in the DiarioOficial folder
        for (path, dirnames, file_names) in os.walk(path):
            # iterate over all the files in the path (+ file_name)
            for file_name in file_names:
                # Add extension that is required
                if file_name.endswith(".txt"):
                    # running count of the files in the DiarioOficial folder
                    count = count + 1
                    # join path + file name
                    file_path = os.path.join(path, file_name)
                    #print(file_path)
                    # open and read the file path
                    mensaje = open(file_path).read()
                    # Remove newlines
                    mensaje = mensaje.replace("\n", "")
                    # Company Name
                    keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
                    # re.escape to solve the problem with metacharacters in keywords_cap;
                    # sorted() by length in descending order (works on Python 2 and 3;
                    # in Python 3, map() returns an iterator, which has no .sort())
                    keywords_cap = sorted(map(re.escape, keywords_cap), key=len, reverse=True)
                    obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
                    # To obtain the first match: obj.search(mensaje).group(1)
                    match_name = obj.search(mensaje)
                    if match_name:
                        company_name = match_name.group(1)
                    else:
                        company_name = "None"
                    # CVE Number of the file
                    regex = r"\s*CVE\s+([^|]*)"
                    matches = re.search(regex, mensaje)
                    if matches:
                        company_cve = matches.group(1).strip()
                    else:
                        company_cve = "None"
                    # Section of diariooficial.interior.gob.cl
                    company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
                    if not company_sect:
                        company_sect = "None"
                    # Name of the person who constitutes the company
                    company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
                    if company_ceo:
                        company_ceo = company_ceo.group(1)
                    else:
                        company_ceo = "None"
                    # File Number from Section
                    num_reg = r'\sNúm.\s+([^|]*)'
                    match_num = re.search(num_reg, mensaje)
                    if match_num:
                        company_numsect = match_num.group(1)
                    else:
                        company_numsect = "None"
                    # Social Capital ($)
                    cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
                    caps = re.search(cap, mensaje)
                    if caps:
                        company_capital = caps.group()
                    else:
                        company_capital = 'None'
                    csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
                    headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']
                    writer = csv.writer(csvFile, delimiter=',')  # create a csv delimited by commas
                    writer.writerow(headers)  # write the header row
                    writer.writerow(csvData)  # write the data row
    # Number of txt files
    print(count)

scan_folder()
I have this script that creates a csv with data extracted from text files in a specific path. Leaving aside whatever errors may remain in the RegEx, it mainly extracts parts of the text, keeps them in variables, and prints them to a csv. Each company must occupy a single line in this csv, so that when the csv is opened, the number of companies and all the information per variable can be visualized.
My problem is that when I look at the CSV, in this case called All_companies1, the data is not kept on the same row; it jumps around.
Also, the titles are repeated, and I do not want them repeated.
First try changing the mode for csvFile from a (append) to w (write), and check whether the editor you're using actually treats the comma as the column delimiter for csv files, since your screenshot suggests the editor sees the comma as a normal character.
Also remove any carriage return characters (\n, \r) from your strings before writing them; this can be done with the following code.
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note:
if by any chance this works, there might still be a problem with empty rows in the csv file between every two elements; this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile
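Putting those suggestions together, and writing the header only once before the loop so the titles are not repeated, a hedged sketch (all_rows is a placeholder for the per-company lists your loop builds):

import csv

headers = ['COMPANY NAME', 'CVE', 'SECTION', 'CEO NAME', 'NUMBER SECTOR', 'COMPANY CAPITAL']
all_rows = []  # fill with one [name, cve, sect, ceo, numsect, capital] list per company

with open('All_Companies1.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',')
    writer.writerow(headers)  # header written once, outside the per-file loop
    for csvData in all_rows:
        # strip stray line breaks so each company stays on one row
        csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
        writer.writerow(csvData)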
I have to send mails through Python. It works and is almost done. The only problem is that I have to keep the formatting too, so either I send the e-mail as HTML (and then rewrite the template in HTML instead of .docx), or I copy the .docx file over with its formatting intact.
Does anybody have any ideas how to do this? Thanks, guys.
import win32com.client as win32
import fileinput as fi
from docx import Document
outlook = win32.Dispatch('outlook.application')
path_in = 'maillist.csv'
input_file = open(path_in, 'r')
document = Document('template.docx')
document_html = open('template.html', 'r')
temp = document_html.read()  # reconstructed: the original snippet used temp without defining it
print(temp)
def filecount(fname):
for line in fi.input(fname):
pass
return fi.lineno()
print("Total mails %s" % (filecount(path_in)))
count = 0
for line in input_file:
if (count>16):
name = line.split(";")[0]
mail_adress = line.split(";")[1]
subject = line.split(";")[2]
print ("%s:%s:%s:" % (name, mail_adress, subject))
mail = outlook.CreateItem(0)
mail.To = mail_adress
mail.Subject = subject
mail.body = temp.replace("XXXNAMEXXX", name)
mail.send
else:
count+=1
Try adding the .RTFBody and/or .HTMLBody properties to the document objects:
document = Document('template.docx').RTFBody
document_html = open('template.html', 'r').HTMLBody
Also, I'm not sure it makes much of a difference but, for convention's sake, I like to capitalize the first letter of each method on the mailItem object.
Let me know if that works.
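If the HTML route is acceptable, a hedged sketch using the Outlook MailItem's HTMLBody property (template.html and the XXXNAMEXXX placeholder come from the question; the address is a dummy):

import win32com.client as win32

outlook = win32.Dispatch('outlook.application')
with open('template.html', 'r') as f:
    template_html = f.read()

mail = outlook.CreateItem(0)
mail.To = 'someone@example.com'  # placeholder address
mail.Subject = 'Subject line'
# HTMLBody preserves the formatting that a plain Body assignment loses:
mail.HTMLBody = template_html.replace('XXXNAMEXXX', 'Name')
mail.Send()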
I somewhat understand how to do looping in Python; it seems easy enough to say "for each file in this directory... do something". I'm now having a hard time figuring out how to loop through a series of .ini files in a directory, read lines from them, and use the text in the ini files as variables in the same Python script. For example, in this script, a single .ini file provides the values for 12 variables in the script. Currently, to run the script multiple times, one has to replace the single ini file with another one that contains a different 12 variables. The script performs routine maintenance of an online mapping service provider... thing is... I have dozens of services I'd like to manage with the script. From the script, it appears that the name of the .ini file is fixed; I'm not sure it's even possible to loop through multiple ini files? The good news is that the script is using ConfigParser. I hope this makes sense!
[FS_INFO]
SERVICENAME = MyMapService
FOLDERNAME = None
MXD = D:\nightly_updates\maps\MyMap.mxd
TAGS = points, dots, places
DESCRIPTION = This is the description text
MAXRECORDS = 1000
[FS_SHARE]
SHARE = True
EVERYONE = true
ORG = true
GROUPS = None
[AGOL]
USER = user_name
PASS = pass_word1
The script below is reading from the ini file above.
# Import system modules
import urllib, urllib2, json
import sys, os
import requests
import arcpy
import ConfigParser
from xml.etree import ElementTree as ET
class AGOLHandler(object):
def __init__(self, username, password, serviceName, folderName):
self.username = username
self.password = password
self.serviceName = serviceName
self.token, self.http = self.getToken(username, password)
self.itemID = self.findItem("Feature Service")
self.SDitemID = self.findItem("Service Definition")
self.folderName = folderName
self.folderID = self.findFolder()
def getToken(self, username, password, exp=60):
referer = "http://www.arcgis.com/"
query_dict = {'username': username,
'password': password,
'expiration': str(exp),
'client': 'referer',
'referer': referer,
'f': 'json'}
query_string = urllib.urlencode(query_dict)
url = "https://www.arcgis.com/sharing/rest/generateToken"
token = json.loads(urllib.urlopen(url + "?f=json", query_string).read())
if "token" not in token:
print token['error']
sys.exit()
else:
httpPrefix = "http://www.arcgis.com/sharing/rest"
if token['ssl'] == True:
httpPrefix = "https://www.arcgis.com/sharing/rest"
return token['token'], httpPrefix
def findItem(self, findType):
#
        # Find the itemID of what's being updated
#
searchURL = self.http + "/search"
query_dict = {'f': 'json',
'token': self.token,
'q': "title:\""+ self.serviceName + "\"AND owner:\"" + self.username + "\" AND type:\"" + findType + "\""}
jsonResponse = sendAGOLReq(searchURL, query_dict)
if jsonResponse['total'] == 0:
print "\nCould not find a service to update. Check the service name in the settings.ini"
sys.exit()
else:
print("found {} : {}").format(findType, jsonResponse['results'][0]["id"])
return jsonResponse['results'][0]["id"]
def findFolder(self):
#
# Find the ID of the folder containing the service
#
if self.folderName == "None":
return ""
findURL = self.http + "/content/users/{}".format(self.username)
query_dict = {'f': 'json',
'num': 1,
'token': self.token}
jsonResponse = sendAGOLReq(findURL, query_dict)
for folder in jsonResponse['folders']:
if folder['title'] == self.folderName:
return folder['id']
print "\nCould not find the specified folder name provided in the settings.ini"
print "-- If your content is in the root folder, change the folder name to 'None'"
sys.exit()
def urlopen(url, data=None):
# monkey-patch URLOPEN
referer = "http://www.arcgis.com/"
req = urllib2.Request(url)
req.add_header('Referer', referer)
if data:
response = urllib2.urlopen(req, data)
else:
response = urllib2.urlopen(req)
return response
def makeSD(MXD, serviceName, tempDir, outputSD, maxRecords):
#
# create a draft SD and modify the properties to overwrite an existing FS
#
arcpy.env.overwriteOutput = True
# All paths are built by joining names to the tempPath
SDdraft = os.path.join(tempDir, "tempdraft.sddraft")
newSDdraft = os.path.join(tempDir, "updatedDraft.sddraft")
arcpy.mapping.CreateMapSDDraft(MXD, SDdraft, serviceName, "MY_HOSTED_SERVICES")
# Read the contents of the original SDDraft into an xml parser
doc = ET.parse(SDdraft)
root_elem = doc.getroot()
    if root_elem.tag != "SVCManifest":
        # NameError fix: the variable is SDdraft, not SDDraft
        raise ValueError("Root tag is incorrect. Is {} a .sddraft file?".format(SDdraft))
# The following 6 code pieces modify the SDDraft from a new MapService
# with caching capabilities to a FeatureService with Query,Create,
# Update,Delete,Uploads,Editing capabilities as well as the ability to set the max
# records on the service.
# The first two lines (commented out) are no longer necessary as the FS
# is now being deleted and re-published, not truly overwritten as is the
# case when publishing from Desktop.
# The last three pieces change Map to Feature Service, disable caching
# and set appropriate capabilities. You can customize the capabilities by
# removing items.
# Note you cannot disable Query from a Feature Service.
#doc.find("./Type").text = "esriServiceDefinitionType_Replacement"
#doc.find("./State").text = "esriSDState_Published"
# Change service type from map service to feature service
for config in doc.findall("./Configurations/SVCConfiguration/TypeName"):
if config.text == "MapServer":
config.text = "FeatureServer"
#Turn off caching
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/" +
"ConfigurationProperties/PropertyArray/" +
"PropertySetProperty"):
if prop.find("Key").text == 'isCached':
prop.find("Value").text = "false"
if prop.find("Key").text == 'maxRecordCount':
prop.find("Value").text = maxRecords
# Turn on feature access capabilities
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/Info/PropertyArray/PropertySetProperty"):
if prop.find("Key").text == 'WebCapabilities':
prop.find("Value").text = "Query,Create,Update,Delete,Uploads,Editing"
# Add the namespaces which get stripped, back into the .SD
root_elem.attrib["xmlns:typens"] = 'http://www.esri.com/schemas/ArcGIS/10.1'
root_elem.attrib["xmlns:xs"] ='http://www.w3.org/2001/XMLSchema'
# Write the new draft to disk
with open(newSDdraft, 'w') as f:
doc.write(f, 'utf-8')
# Analyze the service
analysis = arcpy.mapping.AnalyzeForSD(newSDdraft)
if analysis['errors'] == {}:
# Stage the service
arcpy.StageService_server(newSDdraft, outputSD)
print "Created {}".format(outputSD)
else:
# If the sddraft analysis contained errors, display them and quit.
print analysis['errors']
sys.exit()
def upload(fileName, tags, description):
#
# Overwrite the SD on AGOL with the new SD.
# This method uses 3rd party module: requests
#
updateURL = agol.http+'/content/users/{}/{}/items/{}/update'.format(agol.username, agol.folderID, agol.SDitemID)
filesUp = {"file": open(fileName, 'rb')}
url = updateURL + "?f=json&token="+agol.token+ \
"&filename="+fileName+ \
"&type=Service Definition"\
"&title="+agol.serviceName+ \
"&tags="+tags+\
"&description="+description
response = requests.post(url, files=filesUp);
itemPartJSON = json.loads(response.text)
if "success" in itemPartJSON:
itemPartID = itemPartJSON['id']
print("updated SD: {}").format(itemPartID)
return True
else:
print "\n.sd file not uploaded. Check the errors and try again.\n"
print itemPartJSON
sys.exit()
def publish():
#
# Publish the existing SD on AGOL (it will be turned into a Feature Service)
#
publishURL = agol.http+'/content/users/{}/publish'.format(agol.username)
query_dict = {'itemID': agol.SDitemID,
'filetype': 'serviceDefinition',
'overwrite': 'true',
'f': 'json',
'token': agol.token}
jsonResponse = sendAGOLReq(publishURL, query_dict)
print("successfully updated...{}...").format(jsonResponse['services'])
return jsonResponse['services'][0]['serviceItemId']
def enableSharing(newItemID, everyone, orgs, groups):
#
# Share an item with everyone, the organization and/or groups
#
shareURL = agol.http+'/content/users/{}/{}/items/{}/share'.format(agol.username, agol.folderID, newItemID)
if groups == None:
groups = ''
query_dict = {'f': 'json',
'everyone' : everyone,
'org' : orgs,
'groups' : groups,
'token': agol.token}
jsonResponse = sendAGOLReq(shareURL, query_dict)
print("successfully shared...{}...").format(jsonResponse['itemId'])
def sendAGOLReq(URL, query_dict):
#
# Helper function which takes a URL and a dictionary and sends the request
#
    query_string = urllib.urlencode(query_dict)
    jsonResponse = urllib.urlopen(URL, query_string)
    jsonOutput = json.loads(jsonResponse.read())
    wordTest = ["success", "results", "services", "notSharedWith", "folders"]
    if any(word in jsonOutput for word in wordTest):
        return jsonOutput
    else:
        print "\nfailed:"
        print jsonOutput
sys.exit()
if __name__ == "__main__":
#
# start
#
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
settingsFile = os.path.join(localPath, "settings.ini")
if os.path.isfile(settingsFile):
config = ConfigParser.ConfigParser()
config.read(settingsFile)
else:
print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
sys.exit()
# AGOL Credentials
inputUsername = config.get( 'AGOL', 'USER')
inputPswd = config.get('AGOL', 'PASS')
# FS values
MXD = config.get('FS_INFO', 'MXD')
serviceName = config.get('FS_INFO', 'SERVICENAME')
folderName = config.get('FS_INFO', 'FOLDERNAME')
tags = config.get('FS_INFO', 'TAGS')
description = config.get('FS_INFO', 'DESCRIPTION')
maxRecords = config.get('FS_INFO', 'MAXRECORDS')
# Share FS to: everyone, org, groups
shared = config.get('FS_SHARE', 'SHARE')
everyone = config.get('FS_SHARE', 'EVERYONE')
orgs = config.get('FS_SHARE', 'ORG')
groups = config.get('FS_SHARE', 'GROUPS') #Groups are by ID. Multiple groups comma separated
# create a temp directory under the script
tempDir = os.path.join(localPath, "tempDir")
if not os.path.isdir(tempDir):
os.mkdir(tempDir)
finalSD = os.path.join(tempDir, serviceName + ".sd")
#initialize AGOLHandler class
agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
# Turn map document into .SD file for uploading
makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
# overwrite the existing .SD on arcgis.com
if upload(finalSD, tags, description):
# publish the sd which was just uploaded
newItemID = publish()
# share the item
if shared:
enableSharing(newItemID, everyone, orgs, groups)
print "\nfinished."
If I understand your question correctly, you just want to add another loop to your main and move most of what you have in your main into a new function (in my example, the new function is called process_ini).
So, try replacing everything from your name == main line through the end with:
def process_ini(fileName):
settingsFile = os.path.join(localPath, fileName)
if os.path.isfile(settingsFile):
config = ConfigParser.ConfigParser()
config.read(settingsFile)
else:
print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
sys.exit()
# AGOL Credentials
inputUsername = config.get( 'AGOL', 'USER')
inputPswd = config.get('AGOL', 'PASS')
# FS values
MXD = config.get('FS_INFO', 'MXD')
serviceName = config.get('FS_INFO', 'SERVICENAME')
folderName = config.get('FS_INFO', 'FOLDERNAME')
tags = config.get('FS_INFO', 'TAGS')
description = config.get('FS_INFO', 'DESCRIPTION')
maxRecords = config.get('FS_INFO', 'MAXRECORDS')
# Share FS to: everyone, org, groups
shared = config.get('FS_SHARE', 'SHARE')
everyone = config.get('FS_SHARE', 'EVERYONE')
orgs = config.get('FS_SHARE', 'ORG')
groups = config.get('FS_SHARE', 'GROUPS') #Groups are by ID. Multiple groups comma separated
# create a temp directory under the script
tempDir = os.path.join(localPath, "tempDir")
if not os.path.isdir(tempDir):
os.mkdir(tempDir)
finalSD = os.path.join(tempDir, serviceName + ".sd")
#initialize AGOLHandler class
agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
# Turn map document into .SD file for uploading
makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
# overwrite the existing .SD on arcgis.com
if upload(finalSD, tags, description):
# publish the sd which was just uploaded
newItemID = publish()
# share the item
if shared:
enableSharing(newItemID, everyone, orgs, groups)
print "\nfinished."
if __name__ == "__main__":
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
for fileName in ['settings.ini', 'flurb.ini', 'durf.ini']:
process_ini(fileName)
You'd have to write all the ini filenames in the list found in the penultimate line of my example.
Alternatively, you could identify all the .ini files in the directory via code:
if __name__ == "__main__":
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
fileNames = [os.path.join(localPath, i) for i in os.listdir(localPath) if i.endswith('.ini')]
for fileName in fileNames:
process_ini(fileName)
It also might help to set the working directory (e.g., os.chdir(localPath)), but I'm going off of what you already had.