Python: How to loop through several ini files with ConfigParser?

I somewhat understand how to do looping in Python; it seems easy enough to say "for each file in this directory... do something". I'm now having a hard time figuring out how to loop through a series of .ini files in a directory, read lines from them, and use the text in the ini files as variables in the same Python script. For example, in this script a single .ini file provides the values for 12 variables. Currently, to run the script multiple times, one has to replace the single ini file with another one that contains a different 12 variables. The script performs routine maintenance for an online mapping service provider. The thing is, I have dozens of services I'd like to manage with the script. From the script, it appears that the name of the .ini file is fixed, and I'm not sure it's even possible to loop through multiple ini files. The good news is that the script is using ConfigParser. I hope this makes sense!
[FS_INFO]
SERVICENAME = MyMapService
FOLDERNAME = None
MXD = D:\nightly_updates\maps\MyMap.mxd
TAGS = points, dots, places
DESCRIPTION = This is the description text
MAXRECORDS = 1000
[FS_SHARE]
SHARE = True
EVERYONE = true
ORG = true
GROUPS = None
[AGOL]
USER = user_name
PASS = pass_word1
The script below is reading from the ini file above.
# Import system modules
import urllib, urllib2, json
import sys, os
import requests
import arcpy
import ConfigParser
from xml.etree import ElementTree as ET
class AGOLHandler(object):
def __init__(self, username, password, serviceName, folderName):
self.username = username
self.password = password
self.serviceName = serviceName
self.token, self.http = self.getToken(username, password)
self.itemID = self.findItem("Feature Service")
self.SDitemID = self.findItem("Service Definition")
self.folderName = folderName
self.folderID = self.findFolder()
def getToken(self, username, password, exp=60):
referer = "http://www.arcgis.com/"
query_dict = {'username': username,
'password': password,
'expiration': str(exp),
'client': 'referer',
'referer': referer,
'f': 'json'}
query_string = urllib.urlencode(query_dict)
url = "https://www.arcgis.com/sharing/rest/generateToken"
token = json.loads(urllib.urlopen(url + "?f=json", query_string).read())
if "token" not in token:
print token['error']
sys.exit()
else:
httpPrefix = "http://www.arcgis.com/sharing/rest"
if token['ssl'] == True:
httpPrefix = "https://www.arcgis.com/sharing/rest"
return token['token'], httpPrefix
def findItem(self, findType):
#
# Find the itemID of what's being updated
#
searchURL = self.http + "/search"
query_dict = {'f': 'json',
'token': self.token,
'q': "title:\""+ self.serviceName + "\"AND owner:\"" + self.username + "\" AND type:\"" + findType + "\""}
jsonResponse = sendAGOLReq(searchURL, query_dict)
if jsonResponse['total'] == 0:
print "\nCould not find a service to update. Check the service name in the settings.ini"
sys.exit()
else:
print("found {} : {}").format(findType, jsonResponse['results'][0]["id"])
return jsonResponse['results'][0]["id"]
def findFolder(self):
#
# Find the ID of the folder containing the service
#
if self.folderName == "None":
return ""
findURL = self.http + "/content/users/{}".format(self.username)
query_dict = {'f': 'json',
'num': 1,
'token': self.token}
jsonResponse = sendAGOLReq(findURL, query_dict)
for folder in jsonResponse['folders']:
if folder['title'] == self.folderName:
return folder['id']
print "\nCould not find the specified folder name provided in the settings.ini"
print "-- If your content is in the root folder, change the folder name to 'None'"
sys.exit()
def urlopen(url, data=None):
# monkey-patch URLOPEN
referer = "http://www.arcgis.com/"
req = urllib2.Request(url)
req.add_header('Referer', referer)
if data:
response = urllib2.urlopen(req, data)
else:
response = urllib2.urlopen(req)
return response
def makeSD(MXD, serviceName, tempDir, outputSD, maxRecords):
#
# create a draft SD and modify the properties to overwrite an existing FS
#
arcpy.env.overwriteOutput = True
# All paths are built by joining names to the tempPath
SDdraft = os.path.join(tempDir, "tempdraft.sddraft")
newSDdraft = os.path.join(tempDir, "updatedDraft.sddraft")
arcpy.mapping.CreateMapSDDraft(MXD, SDdraft, serviceName, "MY_HOSTED_SERVICES")
# Read the contents of the original SDDraft into an xml parser
doc = ET.parse(SDdraft)
root_elem = doc.getroot()
if root_elem.tag != "SVCManifest":
raise ValueError("Root tag is incorrect. Is {} a .sddraft file?".format(SDdraft))
# The following 6 code pieces modify the SDDraft from a new MapService
# with caching capabilities to a FeatureService with Query,Create,
# Update,Delete,Uploads,Editing capabilities as well as the ability to set the max
# records on the service.
# The first two lines (commented out) are no longer necessary as the FS
# is now being deleted and re-published, not truly overwritten as is the
# case when publishing from Desktop.
# The last three pieces change Map to Feature Service, disable caching
# and set appropriate capabilities. You can customize the capabilities by
# removing items.
# Note you cannot disable Query from a Feature Service.
#doc.find("./Type").text = "esriServiceDefinitionType_Replacement"
#doc.find("./State").text = "esriSDState_Published"
# Change service type from map service to feature service
for config in doc.findall("./Configurations/SVCConfiguration/TypeName"):
if config.text == "MapServer":
config.text = "FeatureServer"
#Turn off caching
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/" +
"ConfigurationProperties/PropertyArray/" +
"PropertySetProperty"):
if prop.find("Key").text == 'isCached':
prop.find("Value").text = "false"
if prop.find("Key").text == 'maxRecordCount':
prop.find("Value").text = maxRecords
# Turn on feature access capabilities
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/Info/PropertyArray/PropertySetProperty"):
if prop.find("Key").text == 'WebCapabilities':
prop.find("Value").text = "Query,Create,Update,Delete,Uploads,Editing"
# Add the namespaces which get stripped, back into the .SD
root_elem.attrib["xmlns:typens"] = 'http://www.esri.com/schemas/ArcGIS/10.1'
root_elem.attrib["xmlns:xs"] ='http://www.w3.org/2001/XMLSchema'
# Write the new draft to disk
with open(newSDdraft, 'w') as f:
doc.write(f, 'utf-8')
# Analyze the service
analysis = arcpy.mapping.AnalyzeForSD(newSDdraft)
if analysis['errors'] == {}:
# Stage the service
arcpy.StageService_server(newSDdraft, outputSD)
print "Created {}".format(outputSD)
else:
# If the sddraft analysis contained errors, display them and quit.
print analysis['errors']
sys.exit()
def upload(fileName, tags, description):
#
# Overwrite the SD on AGOL with the new SD.
# This method uses 3rd party module: requests
#
updateURL = agol.http+'/content/users/{}/{}/items/{}/update'.format(agol.username, agol.folderID, agol.SDitemID)
filesUp = {"file": open(fileName, 'rb')}
url = updateURL + "?f=json&token="+agol.token+ \
"&filename="+fileName+ \
"&type=Service Definition"\
"&title="+agol.serviceName+ \
"&tags="+tags+\
"&description="+description
response = requests.post(url, files=filesUp);
itemPartJSON = json.loads(response.text)
if "success" in itemPartJSON:
itemPartID = itemPartJSON['id']
print("updated SD: {}").format(itemPartID)
return True
else:
print "\n.sd file not uploaded. Check the errors and try again.\n"
print itemPartJSON
sys.exit()
def publish():
#
# Publish the existing SD on AGOL (it will be turned into a Feature Service)
#
publishURL = agol.http+'/content/users/{}/publish'.format(agol.username)
query_dict = {'itemID': agol.SDitemID,
'filetype': 'serviceDefinition',
'overwrite': 'true',
'f': 'json',
'token': agol.token}
jsonResponse = sendAGOLReq(publishURL, query_dict)
print("successfully updated...{}...").format(jsonResponse['services'])
return jsonResponse['services'][0]['serviceItemId']
def enableSharing(newItemID, everyone, orgs, groups):
#
# Share an item with everyone, the organization and/or groups
#
shareURL = agol.http+'/content/users/{}/{}/items/{}/share'.format(agol.username, agol.folderID, newItemID)
if groups == None:
groups = ''
query_dict = {'f': 'json',
'everyone' : everyone,
'org' : orgs,
'groups' : groups,
'token': agol.token}
jsonResponse = sendAGOLReq(shareURL, query_dict)
print("successfully shared...{}...").format(jsonResponse['itemId'])
def sendAGOLReq(URL, query_dict):
#
# Helper function which takes a URL and a dictionary and sends the request
#
query_string = urllib.urlencode(query_dict)
jsonResponse = urllib.urlopen(URL, query_string)
jsonOuput = json.loads(jsonResponse.read())
wordTest = ["success", "results", "services", "notSharedWith", "folders"]
if any(word in jsonOuput for word in wordTest):
return jsonOuput
else:
print "\nfailed:"
print jsonOuput
sys.exit()
if __name__ == "__main__":
#
# start
#
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
settingsFile = os.path.join(localPath, "settings.ini")
if os.path.isfile(settingsFile):
config = ConfigParser.ConfigParser()
config.read(settingsFile)
else:
print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
sys.exit()
# AGOL Credentials
inputUsername = config.get( 'AGOL', 'USER')
inputPswd = config.get('AGOL', 'PASS')
# FS values
MXD = config.get('FS_INFO', 'MXD')
serviceName = config.get('FS_INFO', 'SERVICENAME')
folderName = config.get('FS_INFO', 'FOLDERNAME')
tags = config.get('FS_INFO', 'TAGS')
description = config.get('FS_INFO', 'DESCRIPTION')
maxRecords = config.get('FS_INFO', 'MAXRECORDS')
# Share FS to: everyone, org, groups
shared = config.get('FS_SHARE', 'SHARE')
everyone = config.get('FS_SHARE', 'EVERYONE')
orgs = config.get('FS_SHARE', 'ORG')
groups = config.get('FS_SHARE', 'GROUPS') #Groups are by ID. Multiple groups comma separated
# create a temp directory under the script
tempDir = os.path.join(localPath, "tempDir")
if not os.path.isdir(tempDir):
os.mkdir(tempDir)
finalSD = os.path.join(tempDir, serviceName + ".sd")
#initialize AGOLHandler class
agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
# Turn map document into .SD file for uploading
makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
# overwrite the existing .SD on arcgis.com
if upload(finalSD, tags, description):
# publish the sd which was just uploaded
newItemID = publish()
# share the item
if shared:
enableSharing(newItemID, everyone, orgs, groups)
print "\nfinished."

If I understand your question correctly, you would just want to add another loop in your main block and move most of what you currently have there into a new function (in my example, the new function is called process_ini).
So, try replacing everything from your if __name__ == "__main__": line through the end with:
def process_ini(fileName):
settingsFile = os.path.join(localPath, fileName)
if os.path.isfile(settingsFile):
config = ConfigParser.ConfigParser()
config.read(settingsFile)
else:
print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
sys.exit()
# AGOL Credentials
inputUsername = config.get( 'AGOL', 'USER')
inputPswd = config.get('AGOL', 'PASS')
# FS values
MXD = config.get('FS_INFO', 'MXD')
serviceName = config.get('FS_INFO', 'SERVICENAME')
folderName = config.get('FS_INFO', 'FOLDERNAME')
tags = config.get('FS_INFO', 'TAGS')
description = config.get('FS_INFO', 'DESCRIPTION')
maxRecords = config.get('FS_INFO', 'MAXRECORDS')
# Share FS to: everyone, org, groups
shared = config.get('FS_SHARE', 'SHARE')
everyone = config.get('FS_SHARE', 'EVERYONE')
orgs = config.get('FS_SHARE', 'ORG')
groups = config.get('FS_SHARE', 'GROUPS') #Groups are by ID. Multiple groups comma separated
# create a temp directory under the script
tempDir = os.path.join(localPath, "tempDir")
if not os.path.isdir(tempDir):
os.mkdir(tempDir)
finalSD = os.path.join(tempDir, serviceName + ".sd")
#initialize AGOLHandler class
agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
# Turn map document into .SD file for uploading
makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
# overwrite the existing .SD on arcgis.com
if upload(finalSD, tags, description):
# publish the sd which was just uploaded
newItemID = publish()
# share the item
if shared:
enableSharing(newItemID, everyone, orgs, groups)
print "\nfinished."
if __name__ == "__main__":
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
for fileName in ['settings.ini', 'flurb.ini', 'durf.ini']:
process_ini(fileName)
You'd have to write all the ini filenames in the list found in the penultimate line of my example.
Alternatively, you could identify all the .ini files in the directory via code:
if __name__ == "__main__":
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
fileNames = [os.path.join(localPath, i) for i in os.listdir(localPath) if i.endswith('.ini')]
for fileName in fileNames:
process_ini(fileName)
It also might help to set the working directory (e.g., os.chdir(localPath)), but I'm going off of what you already had.
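If you would rather not stop the whole run when one configuration is bad, a variation on the same idea is to isolate each file in a try/except. This is only a sketch: it assumes process_ini() (and the helpers it calls) are changed to raise exceptions instead of calling sys.exit(), so one bad ini file doesn't kill the loop.
import glob
import os
import sys

if __name__ == "__main__":
    print("Starting Feature Service publish process")
    localPath = sys.path[0]
    # Process every .ini file found next to the script, in a stable order
    for settingsFile in sorted(glob.glob(os.path.join(localPath, "*.ini"))):
        print("Processing {}".format(settingsFile))
        try:
            process_ini(settingsFile)
        except Exception as e:
            # Keep going with the remaining ini files instead of exiting
            print("Failed on {}: {}".format(settingsFile, e))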

Related

Scraping files from google drive - automated queries prevented by

I wanted to scrape a few PDFs from a great history crash course I used to read a long time ago. Sadly, the old website is down and I only managed to get the old HTML code from archive.org
(the links I got work fine, e.g. https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
This script results in HTML files being downloaded, saying
"We're sorry but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way to bypass this? I tried putting a few random delays into the code, so that might be insufficient, or I might be on Google's blacklist for now.
(The text.txt file can be found here: https://filebin.net/k2qw09embamx05ey )
import requests
import time
import random
def download_file_from_google_drive(id, destination):
URL = "https://docs.google.com/uc?export=download"
session = requests.Session()
response = session.get(URL, params = { 'id' : id }, stream = True)
token = get_confirm_token(response)
time.sleep(random.randrange(1,2))
if token:
params = { 'id' : id, 'confirm' : token }
response = session.get(URL, params = params, stream = True)
save_response_content(response, destination)
def get_confirm_token(response):
for key, value in response.cookies.items():
if key.startswith('download_warning'):
return value
return None
def save_response_content(response, destination):
CHUNK_SIZE = 32768
with open(destination, "wb") as f:
for chunk in response.iter_content(CHUNK_SIZE):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f = open('text.txt')
long_string = f.readlines()
interesting_strings = []
for item in long_string:
if 'drive.google' in item:
interesting_strings.append(item)
print(interesting_strings)
interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')
links = []
for item in interesting_strings:
if 'drive.google' in item:
idx = item.find('"')
links.append(item[:idx])
cntr = 1
for link in links:
print(link)
fname = './data/History_' + str(cntr)
file_id = link.split('/')[-2]
print('id:', file_id)
destination = fname
download_file_from_google_drive(file_id, destination)
print('Getting file #', str(cntr))
cntr += 1
time.sleep(random.randrange(3,15) + random.random())
Use gdown:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
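If you want to keep the batch behaviour from the original script, a sketch of the same loop driven by gdown could look like the following (it assumes the links list and the ./data/ folder built in the question's code; gdown takes care of the confirmation token the manual requests approach was fighting with):
import gdown

cntr = 1
for link in links:
    file_id = link.split('/')[-2]          # same id extraction as in the question
    url = 'https://drive.google.com/uc?id=' + file_id
    destination = './data/History_' + str(cntr)
    gdown.download(url, destination, quiet=False)
    cntr += 1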

YouTube video Downloader python

I made a YouTube video download manager. It downloads a video, but I am facing one issue: when I download the same video again, it doesn't download it. How can I download it again under the same title, like pic.png and then pic1.png? How can I do that?
def Download(self):
video_url = self.lineEdit.text()
save_location = self.lineEdit_2.text()
if video_url == '' or save_location == '':
QMessageBox.warning(self, "Data Error", "Provide a Valid Video URL or save Location")
else:
video = pafy.new(video_url)
video_stream = video.streams
video_quality = self.comboBox.currentIndex()
download = video_stream[video_quality].download(filepath=save_location, callback=self.Handel_Progress, )
Ok, this one is interesting.
The real problem begins here.
download = video_stream[video_quality].download(filepath=save_location, callback=self.Handel_Progress, )
Here, you are calling the download function of the video_stream object, which takes filepath as an argument for the file location but does not take a filename, because, obviously, the file is saved under its actual name.
Root cause of your problem:
If you look into the definition of the download function, you will find that if a file with the same name already exists, it does not download the file at all.
Now comes the part where you make sure it downloads, no matter what.
There are two things you need to do:
Check whether a file with the same name exists, and if it does, add a 1 at the end of the file name just before the extension. So if abc.mp4 exists, save abc1.mp4.
[I will tell you how to handle the scenario when abc.mp4, abc1.mp4 and so on already exist (see the sketch after the code below), but for now, let's get back to the problem.]
How do you pass the file name (abc1.mp4) to the download method?
The following piece of code handles both.
I have added comments for your understanding.
import os
import re
import pafy
from pafy.util import xenc
# this function is used by pafy to generate file name while saving,
# so im using the same function to get the file name which I will use to check
# if file exists or not
# DO NOT CHANGE IT
def generate_filename(title, extension):
max_length = 251
""" Generate filename. """
ok = re.compile(r'[^/]')
if os.name == "nt":
ok = re.compile(r'[^\\/:*?"<>|]')
filename = "".join(x if ok.match(x) else "_" for x in title)
if max_length:
max_length = max_length + 1 + len(extension)
if len(filename) > max_length:
filename = filename[:max_length - 3] + '...'
filename += "." + extension
return xenc(filename)
def get_file_name_for_saving(save_location, full_name):
file_path_with_name = os.path.join(save_location, full_name)
# file exists, add 1 in the end, otherwise return filename as it is
if os.path.exists(file_path_with_name):
split = file_path_with_name.split(".")
file_path_with_name = ".".join(split[:-1]) + "1." + split[-1]
return file_path_with_name
def Download(self):
video_url = self.lineEdit.text()
save_location = self.lineEdit_2.text()
if video_url == '' or save_location == '':
QMessageBox.warning(self, "Data Error", "Provide a Valid Video URL or save Location")
else:
# video file
video = pafy.new(video_url)
# available video streams
video_stream = video.streams
video_quality = self.comboBox.currentIndex()
# video title/name
video_name = video.title
# take out the extension of the file from video stream
extension = video_stream[video_quality].extension
# fullname with extension
full_name = generate_filename(video_name, extension)
final_path_with_file_name = get_file_name_for_saving(save_location, full_name)
download = video_stream[video_quality].download(filepath=final_path_with_file_name,
callback=self.Handel_Progress, )
Let me know if you face any issues.
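For the case mentioned above, where abc.mp4, abc1.mp4 and so on already exist, a counter-based variant of get_file_name_for_saving() might look like this (just a sketch, not part of the original answer):
import os

def get_file_name_for_saving(save_location, full_name):
    file_path_with_name = os.path.join(save_location, full_name)
    base, ext = os.path.splitext(file_path_with_name)
    counter = 1
    # Keep bumping the numeric suffix until an unused name is found:
    # abc.mp4 -> abc1.mp4 -> abc2.mp4 -> ...
    while os.path.exists(file_path_with_name):
        file_path_with_name = "{}{}{}".format(base, counter, ext)
        counter += 1
    return file_path_with_name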

Merge dicttoxml json dumps to one xml file

I have some code that takes an input.xml file, runs server commands, and produces output.xml.
But I need to check two servers, so I have a loop at the end for this; when I run it, I only get the last result inside the output.xml file.
I would need to merge the json.dumps(data) into one dictionary (or whatever xmltodict produces) and then parse that into one XML file.
I have tried some dictionary updates but it did not work.
The code is here:
def get_output_dict():
if vendor == 'HP':
data = xml_to_dict(xml_doc='hp_input.xml')
elif vendor == 'Dell':
data = xml_to_dict(xml_doc='dell_input.xml')
for test in data['platform']['vendor']['tests']:
command = test.get('command') #continue if command is not present
output = remote(command)
str1 = ''.join(str(e) for e in output)
for key in test.keys():
if key == 'command':
test[key] = str1
#Change command key name with result using .pop
test['Result'] = test.pop('command')
return json.loads(json.dumps(data))
def get_output_xml(output_dict):
#dicttoxml.set_debug()
output_xml = dicttoxml.dicttoxml(output_dict,custom_root='output',attr_type=False,root=False)
if vendor == 'HP':
filename = 'hp_output-{}.xml'.format(host)
elif vendor == 'Dell':
filename = 'dell_output-{}.xml'.format(host)
tree = etree.fromstring(output_xml)
output_xml_string = etree.tostring(tree, pretty_print=True)
with open(filename, 'wb') as f:
f.write(output_xml_string)
print('Output for hostname server: {} written to: {}'.format(host, filename))
return output_xml
for x in range(3, len(sys.argv)):
print("Checking Server: %s" % (sys.argv[x]))
remote = myssh(sys.argv[x], username, password)
data = get_output_dict()
xml = get_output_xml(data)
The result should be that at get_output_xml(data) I get the data merged from the two iterations over the two servers, and maybe later on three servers.
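To make the merging idea concrete, here is one possible sketch (it assumes myssh(), get_output_dict(), username and password from the script above, and that dicttoxml and lxml are installed): collect each server's dictionary under its hostname and convert the combined dictionary to XML once, after the loop.
import sys
import dicttoxml
from lxml import etree

merged = {}
for x in range(3, len(sys.argv)):
    host = sys.argv[x]
    print("Checking Server: %s" % host)
    remote = myssh(host, username, password)   # helper from the full script
    merged[host] = get_output_dict()           # one dict per server, keyed by hostname

# Convert the combined dictionary to a single XML document
output_xml = dicttoxml.dicttoxml(merged, custom_root='output', attr_type=False)
tree = etree.fromstring(output_xml)
with open('combined_output.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True))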

How to read eml file in python?

I do not know how to load an .eml file in Python 3.4.
I want to list them all and read all of them in Python.
This is how you get the content of an e-mail, i.e. a *.eml file.
This works perfectly on Python 2.5 - 2.7. Try it on 3; it should work as well.
from email import message_from_file
import os
# Path to directory where attachments will be stored:
path = "./msgfiles"
# To have attachments extracted into memory, change behaviour of 2 following functions:
def file_exists (f):
"""Checks whether extracted file was extracted before."""
return os.path.exists(os.path.join(path, f))
def save_file (fn, cont):
"""Saves cont to a file fn"""
file = open(os.path.join(path, fn), "wb")
file.write(cont)
file.close()
def construct_name (id, fn):
"""Constructs a file name out of messages ID and packed file name"""
id = id.split(".")
id = id[0]+id[1]
return id+"."+fn
def disqo (s):
"""Removes double or single quotations."""
s = s.strip()
if s.startswith("'") and s.endswith("'"): return s[1:-1]
if s.startswith('"') and s.endswith('"'): return s[1:-1]
return s
def disgra (s):
"""Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
s = s.strip()
if s.startswith("<") and s.endswith(">"): return s[1:-1]
return s
def pullout (m, key):
"""Extracts content from an e-mail message.
This works for multipart and nested multipart messages too.
m -- email.Message() or mailbox.Message()
key -- Initial message ID (some string)
Returns tuple(Text, Html, Files, Parts)
Text -- All text from all parts.
Html -- All HTMLs from all parts
Files -- Dictionary mapping extracted file to message ID it belongs to.
Parts -- Number of parts in original message.
"""
Html = ""
Text = ""
Files = {}
Parts = 0
if not m.is_multipart():
if m.get_filename(): # It's an attachment
fn = m.get_filename()
cfn = construct_name(key, fn)
Files[fn] = (cfn, None)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# Not an attachment!
# See where this belongs. Text, Html or some other data:
cp = m.get_content_type()
if cp=="text/plain": Text += m.get_payload(decode=True)
elif cp=="text/html": Html += m.get_payload(decode=True)
else:
# Something else!
# Extract a message ID and a file name if there is one:
# This is some packed file and name is contained in content-type header
# instead of content-disposition header explicitly
cp = m.get("content-type")
try: id = disgra(m.get("content-id"))
except: id = None
# Find file name:
o = cp.find("name=")
if o==-1: return Text, Html, Files, 1
ox = cp.find(";", o)
if ox==-1: ox = None
o += 5; fn = cp[o:ox]
fn = disqo(fn)
cfn = construct_name(key, fn)
Files[fn] = (cfn, id)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# This IS a multipart message.
# So, we iterate over it and call pullout() recursively for each part.
y = 0
while 1:
# If we cannot get the payload, it means we hit the end:
try:
pl = m.get_payload(y)
except: break
# pl is a new Message object which goes back to pullout
t, h, f, p = pullout(pl, key)
Text += t; Html += h; Files.update(f); Parts += p
y += 1
return Text, Html, Files, Parts
def extract (msgfile, key):
"""Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
msgfile -- A file-like readable object
key -- Some ID string for that particular Message. Can be a file name or anything.
Returns dict()
Keys: from, to, subject, date, text, html, parts[, files]
Key files will be present only when message contained binary files.
For more see __doc__ for pullout() and caption() functions.
"""
m = message_from_file(msgfile)
From, To, Subject, Date = caption(m)
Text, Html, Files, Parts = pullout(m, key)
Text = Text.strip(); Html = Html.strip()
msg = {"subject": Subject, "from": From, "to": To, "date": Date,
"text": Text, "html": Html, "parts": Parts}
if Files: msg["files"] = Files
return msg
def caption (origin):
"""Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
origin -- Message() object
Returns tuple(From, To, Subject, Date)
If message doesn't contain one/more of them, the empty strings will be returned.
"""
Date = ""
if origin.has_key("date"): Date = origin["date"].strip()
From = ""
if origin.has_key("from"): From = origin["from"].strip()
To = ""
if origin.has_key("to"): To = origin["to"].strip()
Subject = ""
if origin.has_key("subject"): Subject = origin["subject"].strip()
return From, To, Subject, Date
# Usage:
f = open("message.eml", "rb")
print extract(f, f.name)
f.close()
I programmed this for my mail group using mailbox, which is why it is so convoluted.
It never failed me. Never any junk. If a message is multipart, the output dictionary will contain a
key "files" (a sub-dict) with the filenames of all extracted files that were not text or HTML.
That was a way of extracting attachments and other binary data.
You may change it in pullout(), or just change the behaviour of file_exists() and save_file().
construct_name() constructs a filename out of the message ID and the multipart message
filename, if there is one.
In pullout() the Text and Html variables are strings. For an online mail group it was OK to get all text or HTML packed into a multipart message (as long as it wasn't an attachment) at once.
If you need something more sophisticated, change Text and Html to lists, append to them, and add them as needed.
Nothing problematic.
Maybe there are some errors here, because it is intended to work with mailbox.Message(),
not with email.Message(). I tried it on email.Message() and it worked fine.
You said you "wish to list them all". From where? If you mean a POP3 mailbox or the mailbox of some nice open-source mailer, then you do it using the mailbox module (sketched below).
If you want to list them from other sources, you have a problem.
For example, to get mails out of MS Outlook, you have to know how to read OLE2 compound files.
Other mailers rarely store them as *.eml files, so I think this is exactly what you would like to do.
In that case, search PyPI for the olefile or compoundfiles module and Google around for how to extract an e-mail from an MS Outlook inbox file.
Or save yourself the mess and just export them from there to some directory. When you have them as .eml files, apply this code.
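For the mailbox route mentioned above, a minimal sketch (assuming a local mbox file named inbox.mbox, which is just a placeholder, and the pullout() helper defined earlier) would be:
import mailbox

box = mailbox.mbox("inbox.mbox")
for i, message in enumerate(box):
    # mailbox messages work with pullout() just like email.Message objects;
    # the key needs a dot in it because construct_name() splits on "."
    key = "msg{}.mbox".format(i)
    Text, Html, Files, Parts = pullout(message, key)
    print("message {}: {} parts, {} extracted files".format(i, Parts, len(Files)))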
I found this code much simpler
import email
import os
path = './'
listing = os.listdir(path)
for fle in listing:
if str.lower(fle[-3:])=="eml":
msg = email.message_from_file(open(fle))
attachments=msg.get_payload()
for attachment in attachments:
try:
fnam=attachment.get_filename()
f = open(fnam, 'wb')
f.write(attachment.get_payload(decode=True))
f.close()
except Exception as detail:
#print detail
pass
Posting this here for anyone looking to just extract text from an email and get a list of .eml files - took me forever to find a good answer to this online. NOTE: This will not get attachments to emails, just the text from email.
import email
from email import policy
from email.parser import BytesParser
import glob
import os
path = '/path/to/data/' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
with open(eml_file, 'rb') as fp: # select a specific email file from the list
name = fp.name # Get file name
msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
fp.close()
text = text.split("\n")
print (name) # Get name of eml file
print (text) # Get list of all text in email
Credit to some of the code from this post: Reading .eml files with Python 3.6 using emaildata 0.3.4
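If you also need the attachments (the snippet above only pulls the plain-text body), the EmailMessage objects produced by policy.default expose iter_attachments(); a sketch that could go inside the same loop, reusing the msg object from above:
# Save each attachment next to the script; file names come from the message itself
for part in msg.iter_attachments():
    attachment_name = part.get_filename()
    if attachment_name:
        with open(attachment_name, 'wb') as out:
            out.write(part.get_payload(decode=True))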
Python 3 version of Dalen's answer. Basically syntax issue fixes. (Can't comment due to lack of reputation, also clearer as an answer).
# To have attachments extracted into memory, change behaviour of 2 following functions:
def file_exists (f):
"""Checks whether extracted file was extracted before."""
return os.path.exists(os.path.join(path, f))
def save_file (fn, cont):
"""Saves cont to a file fn"""
file = open(os.path.join(path, fn), "wb")
file.write(cont)
file.close()
def construct_name (id, fn):
"""Constructs a file name out of messages ID and packed file name"""
id = id.split(".")
id = id[0]+id[1]
return id+"."+fn
def disqo (s):
"""Removes double or single quotations."""
s = s.strip()
if s.startswith("'") and s.endswith("'"): return s[1:-1]
if s.startswith('"') and s.endswith('"'): return s[1:-1]
return s
def disgra (s):
"""Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
s = s.strip()
if s.startswith("<") and s.endswith(">"): return s[1:-1]
return s
def pullout (m, key):
"""Extracts content from an e-mail message.
This works for multipart and nested multipart messages too.
m -- email.Message() or mailbox.Message()
key -- Initial message ID (some string)
Returns tuple(Text, Html, Files, Parts)
Text -- All text from all parts.
Html -- All HTMLs from all parts
Files -- Dictionary mapping extracted file to message ID it belongs to.
Parts -- Number of parts in original message.
"""
Html = ""
Text = ""
Files = {}
Parts = 0
if not m.is_multipart():
if m.get_filename(): # It's an attachment
fn = m.get_filename()
cfn = construct_name(key, fn)
Files[fn] = (cfn, None)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# Not an attachment!
# See where this belongs. Text, Html or some other data:
cp = m.get_content_type()
if cp=="text/plain":
Text += str(m.get_payload(decode=True))
elif cp=="text/html":
Html += str(m.get_payload(decode=True))
else:
# Something else!
# Extract a message ID and a file name if there is one:
# This is some packed file and name is contained in content-type header
# instead of content-disposition header explicitly
cp = m.get("content-type")
try: id = disgra(m.get("content-id"))
except: id = None
# Find file name:
o = cp.find("name=")
if o==-1: return Text, Html, Files, 1
ox = cp.find(";", o)
if ox==-1: ox = None
o += 5; fn = cp[o:ox]
fn = disqo(fn)
cfn = construct_name(key, fn)
Files[fn] = (cfn, id)
if file_exists(cfn): return Text, Html, Files, 1
save_file(cfn, m.get_payload(decode=True))
return Text, Html, Files, 1
# This IS a multipart message.
# So, we iterate over it and call pullout() recursively for each part.
y = 0
while 1:
# If we cannot get the payload, it means we hit the end:
try:
pl = m.get_payload(y)
except: break
# pl is a new Message object which goes back to pullout
t, h, f, p = pullout(pl, key)
Text += t; Html += h; Files.update(f); Parts += p
y += 1
return Text, Html, Files, Parts
def extract (msgfile, key):
"""Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
msgfile -- A file-like readable object
key -- Some ID string for that particular Message. Can be a file name or anything.
Returns dict()
Keys: from, to, subject, date, text, html, parts[, files]
Key files will be present only when message contained binary files.
For more see __doc__ for pullout() and caption() functions.
"""
m = email.message_from_file(msgfile)
From, To, Subject, Date = caption(m)
Text, Html, Files, Parts = pullout(m, key)
Text = Text.strip(); Html = Html.strip()
msg = {"subject": Subject, "from": From, "to": To, "date": Date,
"text": Text, "html": Html, "parts": Parts}
if Files: msg["files"] = Files
return msg
def caption (origin):
"""Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
origin -- Message() object
Returns tuple(From, To, Subject, Date)
If message doesn't contain one/more of them, the empty strings will be returned.
"""
Date = ""
if origin.__contains__("date"): Date = origin["date"].strip()
From = ""
if origin.__contains__("from"): From = origin["from"].strip()
To = ""
if origin.__contains__("to"): To = origin["to"].strip()
Subject = ""
if origin.__contains__("subject"): Subject = origin["subject"].strip()
return From, To, Subject, Date
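The Python 3 block above drops the imports, the path setting and the usage lines from the original answer; a small sketch to round it out (message.eml is just a placeholder name):
import email
import os

path = "./msgfiles"   # used by file_exists()/save_file() above for extracted attachments
if not os.path.isdir(path):
    os.makedirs(path)

with open("message.eml", "r") as f:
    print(extract(f, f.name))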
Try this:
#!python3
# -*- coding: utf-8 -*-
import email
import os
SOURCE_DIR = 'email'
DEST_DIR = 'temp'
def extractattachements(fle,suffix=None):
message = email.message_from_file(open(fle))
filenames = []
if message.get_content_maintype() == 'multipart':
for part in message.walk():
if part.get_content_maintype() == 'multipart': continue
#if part.get('Content-Disposition') is None: continue
if part.get('Content-Type').find('application/octet-stream') == -1: continue
filename = part.get_filename()
if suffix:
filename = ''.join( [filename.split('.')[0], '_', suffix, '.', filename.split('.')[1]])
filename = os.path.join(DEST_DIR, filename)
fb = open(filename,'wb')
fb.write(part.get_payload(decode=True))
fb.close()
filenames.append(filename)
return filenames
def main():
onlyfiles = [f for f in os.listdir(SOURCE_DIR) if os.path.isfile(os.path.join(SOURCE_DIR, f))]
for file in onlyfiles:
#print path.join(SOURCE_DIR,file)
extractattachements(os.path.join(SOURCE_DIR,file))
return True
if __name__ == "__main__":
main()
Here I am simplifying things for you so that you get cleaner data to process.
A .eml file consists of two parts at a broad level: 1) headers and 2) content/body.
(Note: this will discard any attachments if they are there.)
Moreover, I've also removed https links from the .eml file, but I'll tell you what to do if you want to keep them.
1) Header:
I used eml-parser to get the header information; you can install it using:
pip install eml-parser
View their documentation to get more info about how to get headers :
https://pypi.org/project/eml-parser/
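For reference, a minimal header-extraction sketch with eml-parser might look like this (it assumes the EmlParser / decode_email_bytes API described in the documentation linked above, and sample.eml is a placeholder file name; check the docs for the version you install):
import eml_parser

with open('sample.eml', 'rb') as f:
    raw_email = f.read()

parser = eml_parser.EmlParser()
parsed = parser.decode_email_bytes(raw_email)
# Header fields come back as a nested dictionary
print(parsed['header']['subject'])
print(parsed['header']['from'])
print(parsed['header']['to'])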
2) Content/Body: Here I modified some older scripts to get the best result in the output:
from email import policy
from email.parser import BytesParser
import glob
import os
path = './' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
with open(eml_file, 'rb') as fp: # select a specific email file from the list
name = fp.name # Get file name
msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
fp.close()
print (name) # Get name of eml file
# print (text) # Get list of all text in email
This is a part of the code that was already available in many places, and I don't take credit for it.
Now I've added a few conditions to print out the body in a prettier way; these lines of code are mine and you can give me credit for them:
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
if(text[i]=='<'): # to remove hyperlinks
urlFlag = 1
continue
if(text[i]=='>'): # to remove hyperlinks
urlFlag = 0
continue
if(urlFlag==0): # to remove hyperlinks
newText = newText+text[i]
print(newText)
Now this will remove all the line breaks, tab spaces and other such characters (\t, \r, \n).
Moreover, if you want to keep the links (the http/https links present in your .eml file), just remove those three conditions and the new code will look like:
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
newText = newText+text[i]
print(newText)
Final code (with links removed):
from email import policy
from email.parser import BytesParser
import glob
import os
path = './' # set this to "./" if in current directory
eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
for eml_file in eml_files:
with open(eml_file, 'rb') as fp: # select a specific email file from the list
name = fp.name # Get file name
msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
fp.close()
print (name) # Get name of eml file
# print (text) # Get list of all text in email
newText = ""
flag = 0
urlFlag = 0
for i in range(len(text)):
if(flag==1):
flag = 0
continue
if(text[i]=="\\"):
flag = 1
continue
if(text[i]=='<'):
urlFlag = 1
continue
if(text[i]=='>'):
urlFlag = 0
continue
if(urlFlag==0):
newText = newText+text[i]
print(newText)
This is my first answer on Stack Overflow; I hope this helps you!
My Python version is 3.8.10.

How do I fix KeyError while parsing instagram?

Everyone,
I have a small script parsing names on Instagram.
Recently it started throwing this error:
Traceback (most recent call last):
File "/home/jpegcoma/vk/posting_instagram.py", line 361, in <module>
main()
File "/home/jpegcoma/vk/posting_instagram.py", line 293, in main
table_of_content = get_stuf_from_url(urls)
File "/home/jpegcoma/vk/posting_instagram.py", line 64, in get_stuf_from_url
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
KeyError: 'ProfilePage'
Currently it is running on a server. However, I tried it on my laptop and the script was working.
Here is the code that does the work:
import requests
import json
import os
import random
from time import sleep
import time
import re
from io import BytesIO
from PIL import Image
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
file_name = 'users_names.txt'
def create_folder_photos():
if os.path.isdir(os.path.join(os.getcwd(), "photos")) == False:
os.makedirs(os.path.join(os.getcwd(), "photos"))
else:
pass
def make_list_of_users_to_scrap(file_name):
'''Opens file with instagram user_names.
Every name should be on a new line.
Prepares full URL for parsing.
Returns list URLs'''
path = os.path.join(os.getcwd(), file_name)
base_url = 'https://www.instagram.com/'
users_url_dic = []
with open(path, 'r') as file:
for name in file:
users_url_dic.append(base_url + name.rstrip() + '/')
return users_url_dic
def parsed_data(shared_data):
'''Get to ["edges"] node in shared_data from instagram'''
return json.loads(shared_data)['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
def get_stuf_from_url(urls):
# Open a request session
with requests.session() as s:
# Add some headers in case
s.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
pool = ThreadPool(5)
d={}
# Go throught all the URLs on instagram
responce = pool.map(requests.get, urls)
pool.close()
pool.join()
for i in responce:
c = i.text
if 30000 < len(c) < 180000:
# Clean html, take only content of 'sharedData' part
shared_data = c.split('window._sharedData = ')[1].split(';</script>')[0]
# Check if the account is private
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
# Go throught all the nodes:
# If video - pass.
# If photo - take {"Id":"URL"}
for node in parsed_data(shared_data)[::]:
if node['node']['is_video'] == False:
d[node['node']['id']] = node['node']['display_url']
else:
continue
else:
continue
else:
continue
return d
def check_for_new(new_data_from_request):
'''Open 'before_log.txt with previous loggs {'id':'url'}
Check if any new data is presented.
Write 2 new files:
"added.txt" - new photos with url from the last time
"before_log.txt" - updated log with all the ids and urls
returns dic with added {'id':'url'} photos'''
# Open a before_log.txt or say that no such file is presented.
if os.path.isfile(os.path.join(os.getcwd(), 'before_log.txt')):
with open(os.path.join(os.getcwd(), 'before_log.txt'), mode='r', encoding='utf8') as f_file:
before_log = json.load(f_file)
else:
print('Need to make "before_log.txt" file to use the script!!!')
# Get new data from "def get_stuf_from_url(urls):"
after = new_data_from_request
# Check if any new photos are available
added = {i:after[i] for i in after if not i in before_log}
# Add new {key:value} to before_log
for key, value in after.items():
if key not in before_log.keys():
before_log[key] = value
# Write added and before_log for future use
with open(os.path.join(os.getcwd(), 'added.txt'), mode='w', encoding='utf8') as add_file:
add_file.write(json.dumps(added) + '\n')
with open(os.path.join(os.getcwd(), 'before_log.txt'), mode='w', encoding='utf8') as out_file:
out_file.write(json.dumps(before_log) + '\n')
print('We got {} new photos.'.format(len(added)))
return added
def createFilename(url, name, folder):
result = re.split(r'.jpg', url)
slashSplit = result[0].split('/')
if name == None:
name = slashSplit[-1]
ext = "jpg"
file = '{}{}.{}'.format(folder, name, ext)
return file
def getImageFast(url, name=None, folder= os.path.join(os.getcwd(), "photos/")):
'''Download new photos from instagram
Creates a photos folder'''
print("Downloading photos.....")
file = createFilename(url, name, folder)
r = requests.get(url, stream=True)
i = Image.open(BytesIO(r.content))
i.save(file)
I guess the problem is somewhere in here
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
Some examples of parsed names on instagram:
_nail_ann_
alena.nails.tallinn
alyne_nails
anna_nails_erbil
aquarelle_nailstudio
cantinhodalara_nails
In a shorter version it does work as intended:
urls = 'https://www.instagram.com/_linails_/'
responce = requests.get(urls)
response_text= responce.text
shared_data = response_text.split('window._sharedData = ')[1].split(';</script>')[0]
# print(shared_data)
d={}
f = json.loads(shared_data)['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
for node in f[::]:
if node['node']['is_video'] == False:
d[node['node']['id']] = node['node']['display_url']
else:
continue
print (d)
After running it I'm getting all the URLs and IDs I need:
{
'2073876006313498489': 'https://scontent-lax3-2.cdninstagram.com/vp/6e5c8c22e54aa0c853ee88db05dc79bf/5E1BBCA4/t51.2885-15/e35/65217639_634723610367271_4450988163128206846_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=107',
'2024498169693824735': 'https://scontent-lax3-2.cdninstagram.com/vp/39188272c2305ed250ad466c7a715b91/5E2F4B15/t51.2885-15/e35/56352792_132736304460754_8293153588685230511_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=110',
'2023266828574689831': 'https://scontent-lax3-2.cdninstagram.com/vp/f313d44c5bd398a8e6b3f04fb7dbb739/5E2BBB71/t51.2885-15/e35/56578225_1055286461334820_1507399846418163801_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=104',
'2016110942668250132': 'https://scontent-lax3-2.cdninstagram.com/vp/349bbf6a920e440a4e71d5b2d149a61b/5E2BB7FE/t51.2885-15/e35/53745148_280247652888437_7055433742029015170_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=105',
'2012783478764415885': 'https://scontent-lax3-2.cdninstagram.com/vp/72dfe2f67b6dc1ea75e2ddd832384475/5E1936CE/t51.2885-15/e35/54512001_2155869857812437_3429908829670998264_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=109',
'2012464856204377926': 'https://scontent-lax3-2.cdninstagram.com/vp/5aefc3a4e047b08dc94366b0723f170d/5E32A5D3/t51.2885-15/e35/54513720_424627718315641_3423874379564248817_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=101',
'2008135031155279090': 'https://scontent-lax3-2.cdninstagram.com/vp/09cc2e7631c115a0131bda6f597dde60/5E1B4C09/t51.2885-15/e35/53156526_1025783867629475_1693464480553968728_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111',
'2004990756607236359': 'https://scontent-lax3-2.cdninstagram.com/vp/5da04c640d70b52a3e3073667985f8e3/5E2A62EB/t51.2885-15/e35/54266355_225989821600275_560245954300705815_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=103',
'2002388991416431681': 'https://scontent-lax3-2.cdninstagram.com/vp/77bb0bf9878ca2d175dbd51350c1ef03/5E37974D/t51.2885-15/e35/53217305_581829868953428_1147405223061346025_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=108',
'2001312091952564411': 'https://scontent-lax3-2.cdninstagram.com/vp/64326e9675b389a7997ed86980cba7bc/5E30992A/t51.2885-15/e35/54513758_391705221628749_737855016941810571_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=109',
'1999425996532762294': 'https://scontent-lax3-2.cdninstagram.com/vp/4c4a5ee2b0ad46d6e3eeb1a30c1e9130/5E1BC2CA/t51.2885-15/e35/52639028_2494445767266095_4453054116414455580_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111',
'1993652807341169347': 'https://scontent-lax3-2.cdninstagram.com/vp/d6d8ffef7fd23d1f12b14282d3bc9aca/5E17386F/t51.2885-15/e35/52024250_786523341734970_6491735451376989098_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=106'
}
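One thing that may help narrow this down (a sketch around the line suspected above; it assumes the surrounding loop in get_stuf_from_url): guard the 'ProfilePage' lookup instead of indexing it directly, so responses that do not contain profile data (for example a login or rate-limit page) are skipped rather than raising a KeyError.
entry_data = json.loads(shared_data).get("entry_data", {})
profile_pages = entry_data.get("ProfilePage")
if not profile_pages:
    # No profile data in this response (possibly a login or rate-limit page): skip it
    continue
if profile_pages[0]["graphql"]["user"]["is_private"] == False:
    for node in parsed_data(shared_data)[::]:
        if node['node']['is_video'] == False:
            d[node['node']['id']] = node['node']['display_url']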
