Alphabetically sorting URLs to download images - Python

I'm having an issue with sorting URLs. The .jpg files end in "xxxx-xxxx.jpg", and the URLs need to be sorted alphabetically by the second set of characters. So far I've only been able to sort on the first set (which is not what I need).
For instance:
http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babf-bbac.jpg
precedes
http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babh-bajc.jpg
when it should come after it, since bajc sorts before bbac. Here is my script:
#!/usr/bin/python
# Copyright 2010 Google Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
# Google's Python Class
# http://code.google.com/edu/languages/google-python-class/
import os
import re
import sys
import requests
"""Logpuzzle exercise
Given an apache logfile, find the puzzle urls and download the images.
Here's what a puzzle url looks like:
10.254.254.28 - - [06/Aug/2007:00:13:48 -0700] "GET /~foo/puzzle-bar-aaab.jpg HTTP/1.0" 302 528 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"
"""
def url_sort_key(url):
    print url[-8:]


# Extract the puzzle urls from inside a logfile
def read_urls(filename):
    """Returns a list of the puzzle urls from the given log file,
    extracting the hostname from the filename itself.
    Screens out duplicate urls and returns the urls sorted into
    increasing order."""
    # +++your code here+++
    # Use open function to search for the urls containing "puzzle/p"
    # Use a line split to pick out the 6th section of the filename
    # Sort out all repeated urls, and return sorted list
    with open(filename) as f:
        out = set()
        for line in f:
            if re.search("puzzle/p", line):
                url = "http://code.google.com" + line.split(" ")[6]
                print line.split(" ")
                out.add(url)
        return sorted(list(out))
# Complete the download_images function, which takes a sorted
# list of urls and a directory
def download_images(img_urls, dest_dir):
    """Given the urls already in the correct order, downloads
    each image into the given directory.
    Gives the images local filenames img0, img1, and so on.
    Creates an index.html in the directory
    with an img tag to show each local image file.
    Creates the directory if necessary.
    """
    # ++your code here++
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    # Create an index
    index = file(os.path.join(dest_dir, 'index.html'), 'w')
    index.write('<html><body>\n')
    i = 0
    for img_url in img_urls:
        i += 1
        local_name = 'img%d' % i
        print "Retrieving...", local_name
        print local_name
        print dest_dir
        print img_url
        response = requests.get(img_url)
        if response.status_code == 200:
            f = open(os.path.join(dest_dir, local_name + ".jpg"), 'wb')
            f.write(response.content)
            f.close()
        index.write('<img src="%s">' % (local_name + ".jpg"))
    index.write('\n</body></html>\n')
    index.close()
def main():
    args = sys.argv[1:]
    print args
    if not args:
        print('usage: [--todir dir] logfile ')
        sys.exit(1)
    todir = None
    if args[0] == '--todir':
        todir = args[1]
        del args[0:2]
    img_urls = read_urls(args[0])
    if todir:
        download_images(img_urls, todir)
    else:
        print('\n'.join(img_urls))


if __name__ == '__main__':
    main()
I think the error lies in the return for the read_urls function, but am not positive.

Given the urls end in the format
xxxx-yyyy.jpg
and you want to sort the urls based on the second key, i.e. yyyy
def read_urls(filename):
    with open(filename) as f:
        s = {el.rstrip() for el in f if 'puzzle' in el}
        return sorted(s, key=lambda u: u[-8:-4])  # u[-13:-9] if need to sort on the first key
For example, with an input file containing
http://localhost/p-xxxx-yyyy.jpg
http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babf-bbac.jpg
http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babh-bajc.jpg
http://localhost/p-xxxx-yyyy.jpg
it produces the list
['http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babh-bajc.jpg',
'http://code.google.com/edu/languages/google-python-class/images/puzzle/p-babf-bbac.jpg']
i.e. bajc comes before bbac.
See the comment in the code if you want to sort by the first key (xxxx) instead.
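If the two groups are not guaranteed to be exactly four characters long, slicing by fixed offsets becomes fragile. Here is a minimal sketch of a regex-based key function (my own addition, assuming the p-xxxx-yyyy.jpg naming shown above):

import re

def url_sort_key(url):
    # sort on the second group of a name like p-xxxx-yyyy.jpg;
    # fall back to the whole url if the pattern does not match
    match = re.search(r'-(\w+)-(\w+)\.jpg$', url)
    return match.group(2) if match else url

# usage: sorted(set_of_urls, key=url_sort_key)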

How to add strings in the file at the end of all lines

I am trying to download files using Python and then append a block of text at the end of each downloaded file, but I get an error on this line:
f.write(data + """<auth-user-pass>
TypeError: can't concat str to bytes
Edit: Thanks, it works now when I write the block as bytes, b"""<auth-user-pass>""", but I only want to add the string once at the end of the file. When I run the code, it adds the string after every line.
I also tried something like f.write(str(data) + "<auth-user-pass>"), but that did not work either.
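(For context, a minimal illustration of the type mismatch, not part of the original question: in binary mode requests yields bytes chunks, so a plain str has to be encoded before it can be concatenated.)

chunk = b"remote 1.2.3.4 1194\n"         # bytes, as yielded when streaming in binary mode
footer = "<auth-user-pass>\n"             # str

# chunk + footer                          # TypeError: can't concat str to bytes
combined = chunk + footer.encode()        # works: bytes + bytes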
here is my full code:
import os
import requests
from multiprocessing.pool import ThreadPool


def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    save_path = 'ovpns/'
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
    return url


servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]

urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")

# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
Try this:
import os
import requests
from multiprocessing.pool import ThreadPool


def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    save_path = 'ovpns/'
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data)
        # append the auth block once, after the whole file has been written
        with open(complete_path, 'ab') as f:
            f.write(b"""<auth-user-pass>
username
password
</auth-user-pass>""")
    return url


servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]

urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")

# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
You are writing in binary mode, so encode your string before concatenating it. That is, replace

for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")

with

for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""".encode())
You opened the file for writing in binary mode. Because of that you can't write normal strings, as the comment from @user56700 said. You either need to encode the string or open the file in another mode (e.g. 'a' for appending text).
Also note that opening a file in write mode ('w'/'wb') truncates any existing data, so if you want to keep what is already in the file, it's quite possible you need an append or read/write mode such as 'ab' or 'r+b' instead.
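To illustrate the append-mode point, here is a minimal sketch (my own, not from the answers above) that adds the block once at the end of an already-downloaded file; the helper name and its arguments are hypothetical:

def append_auth_block(path, username, password):
    # hypothetical helper: 'a' opens the file in text append mode, so a plain
    # str can be written and the existing contents are preserved
    block = "<auth-user-pass>\n{}\n{}\n</auth-user-pass>\n".format(username, password)
    with open(path, 'a') as f:
        f.write(block)

# e.g. append_auth_block('ovpns/us-ca72.nordvpn.com.udp1194.ovpn', 'username', 'password')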

Python: How to loop through several ini files with ConfigParser?

I somewhat understand how to do looping in Python; it seems easy enough to say "for each file in this directory... do something". I'm now having a hard time figuring out how to loop through a series of .ini files in a directory, read lines from them, and use the text in the ini files as variables in the same Python script. For example, in this script a single .ini file provides the values for 12 variables. Currently, to run the script multiple times, one has to replace that single ini file with another one containing a different 12 values. The script performs routine maintenance for an on-line mapping service provider... thing is, I have dozens of services I'd like to manage with the script. From the script, it appears that the name of the .ini file is fixed, and I'm not sure it's even possible to loop through multiple ini files. The good news is that the script is using ConfigParser. I hope this makes sense!
[FS_INFO]
SERVICENAME = MyMapService
FOLDERNAME = None
MXD = D:\nightly_updates\maps\MyMap.mxd
TAGS = points, dots, places
DESCRIPTION = This is the description text
MAXRECORDS = 1000
[FS_SHARE]
SHARE = True
EVERYONE = true
ORG = true
GROUPS = None
[AGOL]
USER = user_name
PASS = pass_word1
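(For reference, a minimal sketch of how ConfigParser reads a file like this; the section and option names are taken from the sample above, and the filename is just an example.)

import ConfigParser  # configparser in Python 3

config = ConfigParser.ConfigParser()
config.read('settings.ini')
serviceName = config.get('FS_INFO', 'SERVICENAME')   # 'MyMapService'
username = config.get('AGOL', 'USER')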
The script below is reading from the ini file above.
# Import system modules
import urllib, urllib2, json
import sys, os
import requests
import arcpy
import ConfigParser
from xml.etree import ElementTree as ET
class AGOLHandler(object):
def __init__(self, username, password, serviceName, folderName):
self.username = username
self.password = password
self.serviceName = serviceName
self.token, self.http = self.getToken(username, password)
self.itemID = self.findItem("Feature Service")
self.SDitemID = self.findItem("Service Definition")
self.folderName = folderName
self.folderID = self.findFolder()
def getToken(self, username, password, exp=60):
referer = "http://www.arcgis.com/"
query_dict = {'username': username,
'password': password,
'expiration': str(exp),
'client': 'referer',
'referer': referer,
'f': 'json'}
query_string = urllib.urlencode(query_dict)
url = "https://www.arcgis.com/sharing/rest/generateToken"
token = json.loads(urllib.urlopen(url + "?f=json", query_string).read())
if "token" not in token:
print token['error']
sys.exit()
else:
httpPrefix = "http://www.arcgis.com/sharing/rest"
if token['ssl'] == True:
httpPrefix = "https://www.arcgis.com/sharing/rest"
return token['token'], httpPrefix
def findItem(self, findType):
#
# Find the itemID of whats being updated
#
searchURL = self.http + "/search"
query_dict = {'f': 'json',
'token': self.token,
'q': "title:\""+ self.serviceName + "\"AND owner:\"" + self.username + "\" AND type:\"" + findType + "\""}
jsonResponse = sendAGOLReq(searchURL, query_dict)
if jsonResponse['total'] == 0:
print "\nCould not find a service to update. Check the service name in the settings.ini"
sys.exit()
else:
print("found {} : {}").format(findType, jsonResponse['results'][0]["id"])
return jsonResponse['results'][0]["id"]
def findFolder(self):
#
# Find the ID of the folder containing the service
#
if self.folderName == "None":
return ""
findURL = self.http + "/content/users/{}".format(self.username)
query_dict = {'f': 'json',
'num': 1,
'token': self.token}
jsonResponse = sendAGOLReq(findURL, query_dict)
for folder in jsonResponse['folders']:
if folder['title'] == self.folderName:
return folder['id']
print "\nCould not find the specified folder name provided in the settings.ini"
print "-- If your content is in the root folder, change the folder name to 'None'"
sys.exit()
def urlopen(url, data=None):
# monkey-patch URLOPEN
referer = "http://www.arcgis.com/"
req = urllib2.Request(url)
req.add_header('Referer', referer)
if data:
response = urllib2.urlopen(req, data)
else:
response = urllib2.urlopen(req)
return response
def makeSD(MXD, serviceName, tempDir, outputSD, maxRecords):
#
# create a draft SD and modify the properties to overwrite an existing FS
#
arcpy.env.overwriteOutput = True
# All paths are built by joining names to the tempPath
SDdraft = os.path.join(tempDir, "tempdraft.sddraft")
newSDdraft = os.path.join(tempDir, "updatedDraft.sddraft")
arcpy.mapping.CreateMapSDDraft(MXD, SDdraft, serviceName, "MY_HOSTED_SERVICES")
# Read the contents of the original SDDraft into an xml parser
doc = ET.parse(SDdraft)
root_elem = doc.getroot()
if root_elem.tag != "SVCManifest":
raise ValueError("Root tag is incorrect. Is {} a .sddraft file?".format(SDDraft))
# The following 6 code pieces modify the SDDraft from a new MapService
# with caching capabilities to a FeatureService with Query,Create,
# Update,Delete,Uploads,Editing capabilities as well as the ability to set the max
# records on the service.
# The first two lines (commented out) are no longer necessary as the FS
# is now being deleted and re-published, not truly overwritten as is the
# case when publishing from Desktop.
# The last three pieces change Map to Feature Service, disable caching
# and set appropriate capabilities. You can customize the capabilities by
# removing items.
# Note you cannot disable Query from a Feature Service.
#doc.find("./Type").text = "esriServiceDefinitionType_Replacement"
#doc.find("./State").text = "esriSDState_Published"
# Change service type from map service to feature service
for config in doc.findall("./Configurations/SVCConfiguration/TypeName"):
if config.text == "MapServer":
config.text = "FeatureServer"
#Turn off caching
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/" +
"ConfigurationProperties/PropertyArray/" +
"PropertySetProperty"):
if prop.find("Key").text == 'isCached':
prop.find("Value").text = "false"
if prop.find("Key").text == 'maxRecordCount':
prop.find("Value").text = maxRecords
# Turn on feature access capabilities
for prop in doc.findall("./Configurations/SVCConfiguration/Definition/Info/PropertyArray/PropertySetProperty"):
if prop.find("Key").text == 'WebCapabilities':
prop.find("Value").text = "Query,Create,Update,Delete,Uploads,Editing"
# Add the namespaces which get stripped, back into the .SD
root_elem.attrib["xmlns:typens"] = 'http://www.esri.com/schemas/ArcGIS/10.1'
root_elem.attrib["xmlns:xs"] ='http://www.w3.org/2001/XMLSchema'
# Write the new draft to disk
with open(newSDdraft, 'w') as f:
doc.write(f, 'utf-8')
# Analyze the service
analysis = arcpy.mapping.AnalyzeForSD(newSDdraft)
if analysis['errors'] == {}:
# Stage the service
arcpy.StageService_server(newSDdraft, outputSD)
print "Created {}".format(outputSD)
else:
# If the sddraft analysis contained errors, display them and quit.
print analysis['errors']
sys.exit()
def upload(fileName, tags, description):
#
# Overwrite the SD on AGOL with the new SD.
# This method uses 3rd party module: requests
#
updateURL = agol.http+'/content/users/{}/{}/items/{}/update'.format(agol.username, agol.folderID, agol.SDitemID)
filesUp = {"file": open(fileName, 'rb')}
url = updateURL + "?f=json&token="+agol.token+ \
"&filename="+fileName+ \
"&type=Service Definition"\
"&title="+agol.serviceName+ \
"&tags="+tags+\
"&description="+description
response = requests.post(url, files=filesUp);
itemPartJSON = json.loads(response.text)
if "success" in itemPartJSON:
itemPartID = itemPartJSON['id']
print("updated SD: {}").format(itemPartID)
return True
else:
print "\n.sd file not uploaded. Check the errors and try again.\n"
print itemPartJSON
sys.exit()
def publish():
#
# Publish the existing SD on AGOL (it will be turned into a Feature Service)
#
publishURL = agol.http+'/content/users/{}/publish'.format(agol.username)
query_dict = {'itemID': agol.SDitemID,
'filetype': 'serviceDefinition',
'overwrite': 'true',
'f': 'json',
'token': agol.token}
jsonResponse = sendAGOLReq(publishURL, query_dict)
print("successfully updated...{}...").format(jsonResponse['services'])
return jsonResponse['services'][0]['serviceItemId']
def enableSharing(newItemID, everyone, orgs, groups):
#
# Share an item with everyone, the organization and/or groups
#
shareURL = agol.http+'/content/users/{}/{}/items/{}/share'.format(agol.username, agol.folderID, newItemID)
if groups == None:
groups = ''
query_dict = {'f': 'json',
'everyone' : everyone,
'org' : orgs,
'groups' : groups,
'token': agol.token}
jsonResponse = sendAGOLReq(shareURL, query_dict)
print("successfully shared...{}...").format(jsonResponse['itemId'])
def sendAGOLReq(URL, query_dict):
#
# Helper function which takes a URL and a dictionary and sends the request
#
query_string = urllib.urlencode(query_dict)
jsonResponse = urllib.urlopen(URL, urllib.urlencode(query_dict))
jsonOuput = json.loads(jsonResponse.read())
wordTest = ["success", "results", "services", "notSharedWith", "folders"]
if any(word in jsonOuput for word in wordTest):
return jsonOuput
else:
print "\nfailed:"
print jsonOuput
sys.exit()
if __name__ == "__main__":
#
# start
#
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
settingsFile = os.path.join(localPath, "settings.ini")
if os.path.isfile(settingsFile):
config = ConfigParser.ConfigParser()
config.read(settingsFile)
else:
print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
sys.exit()
# AGOL Credentials
inputUsername = config.get( 'AGOL', 'USER')
inputPswd = config.get('AGOL', 'PASS')
# FS values
MXD = config.get('FS_INFO', 'MXD')
serviceName = config.get('FS_INFO', 'SERVICENAME')
folderName = config.get('FS_INFO', 'FOLDERNAME')
tags = config.get('FS_INFO', 'TAGS')
description = config.get('FS_INFO', 'DESCRIPTION')
maxRecords = config.get('FS_INFO', 'MAXRECORDS')
# Share FS to: everyone, org, groups
shared = config.get('FS_SHARE', 'SHARE')
everyone = config.get('FS_SHARE', 'EVERYONE')
orgs = config.get('FS_SHARE', 'ORG')
groups = config.get('FS_SHARE', 'GROUPS') #Groups are by ID. Multiple groups comma separated
# create a temp directory under the script
tempDir = os.path.join(localPath, "tempDir")
if not os.path.isdir(tempDir):
os.mkdir(tempDir)
finalSD = os.path.join(tempDir, serviceName + ".sd")
#initialize AGOLHandler class
agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
# Turn map document into .SD file for uploading
makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
# overwrite the existing .SD on arcgis.com
if upload(finalSD, tags, description):
# publish the sd which was just uploaded
newItemID = publish()
# share the item
if shared:
enableSharing(newItemID, everyone, orgs, groups)
print "\nfinished."
If I understand your question correctly, you would just want to add another loop in your main and place most of what you currently have in your main into a new function (in my example, the new function is called 'process_ini').
So, try replacing everything from your if __name__ == "__main__": line through the end with:
def process_ini(fileName):
    # upload(), publish() and enableSharing() look up "agol" at module level,
    # so keep it global when it is created below
    global agol
    settingsFile = os.path.join(localPath, fileName)
    if os.path.isfile(settingsFile):
        config = ConfigParser.ConfigParser()
        config.read(settingsFile)
    else:
        print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
        sys.exit()
    # AGOL Credentials
    inputUsername = config.get('AGOL', 'USER')
    inputPswd = config.get('AGOL', 'PASS')
    # FS values
    MXD = config.get('FS_INFO', 'MXD')
    serviceName = config.get('FS_INFO', 'SERVICENAME')
    folderName = config.get('FS_INFO', 'FOLDERNAME')
    tags = config.get('FS_INFO', 'TAGS')
    description = config.get('FS_INFO', 'DESCRIPTION')
    maxRecords = config.get('FS_INFO', 'MAXRECORDS')
    # Share FS to: everyone, org, groups
    shared = config.get('FS_SHARE', 'SHARE')
    everyone = config.get('FS_SHARE', 'EVERYONE')
    orgs = config.get('FS_SHARE', 'ORG')
    groups = config.get('FS_SHARE', 'GROUPS')  # Groups are by ID. Multiple groups comma separated
    # create a temp directory under the script
    tempDir = os.path.join(localPath, "tempDir")
    if not os.path.isdir(tempDir):
        os.mkdir(tempDir)
    finalSD = os.path.join(tempDir, serviceName + ".sd")
    # initialize AGOLHandler class
    agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)
    # Turn map document into .SD file for uploading
    makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)
    # overwrite the existing .SD on arcgis.com
    if upload(finalSD, tags, description):
        # publish the sd which was just uploaded
        newItemID = publish()
        # share the item
        if shared:
            enableSharing(newItemID, everyone, orgs, groups)
    print "\nfinished."


if __name__ == "__main__":
    print "Starting Feature Service publish process"
    # Find and gather settings from the ini file
    localPath = sys.path[0]
    for fileName in ['settings.ini', 'flurb.ini', 'durf.ini']:
        process_ini(fileName)
You'd have to write all the ini filenames in the list found in the penultimate line of my example.
Alternatively, you could identify all the .ini files in the directory via code:
if __name__ == "__main__":
print "Starting Feature Service publish process"
# Find and gather settings from the ini file
localPath = sys.path[0]
fileNames = [os.path.join(localPath, i) for i in os.listdir(localPath) if i.endswith('.ini')]
for fileName in fileNames:
process_ini(fileName)
It also might help to set the working directory (e.g., os.chdir(localPath)), but I'm going off of what you already had.
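A variant of the same idea, as a minimal sketch using glob instead of os.listdir (my addition, not part of the original answer):

import glob
import os

# pick up every .ini file sitting next to the script and process each one
for settings_path in sorted(glob.glob(os.path.join(localPath, '*.ini'))):
    process_ini(os.path.basename(settings_path))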

How to save an image with the correct file extension?

I have a script that parses HTML and saves the images to disk.
However, for some reason it writes the filenames wrongly: the files are not saved with the correct file extension in Windows. E.g., an image should be saved as <filename>.jpg or <filename>.gif, but instead the images are saved with no filename extension.
Could you help me see why this script is not putting the extension in the filename correctly?
I'm running Python 2.7.
""" Tumbrl downloader
This program will download all the images from a Tumblr blog """
from urllib import urlopen, urlretrieve
import os, sys, re
def download_images(images, path):
for im in images:
print(im)
filename = re.findall("([^/]*).(?:jpg|gif|png)",im)[0]
filename = os.path.join(path,filename)
try:
urlretrieve(im, filename.replace("500","1280"))
except:
try:
urlretrieve(im, filename)
except:
print("Failed to download "+im)
def main():
#Check input arguments
if len(sys.argv) < 2:
print("usage: ./tumblr_rip.py url [starting page]")
sys.exit(1)
url = sys.argv[1]
if len(sys.argv) == 3:
pagenum = int(sys.argv[2])
else:
pagenum = 1
if (check_url(url) == ""):
print("Error: Malformed url")
sys.exit(1)
if (url[-1] != "/"):
url.append("/")
blog_name = url.replace("http://", "")
blog_name = re.findall("(?:.[^\.]*)", blog_name)[0]
current_path = os.getcwd()
path = os.path.join(current_path, blog_name)
#Create blog directory
if not os.path.isdir(path):
os.mkdir(path)
html_code_old = ""
while(True):
#fetch html from url
print("\nFetching images from page "+str(pagenum)+"\n")
f = urlopen(url+"page/"+str(pagenum))
html_code = f.read()
html_code = str(html_code)
if(check_end(html_code, html_code_old, pagenum)):
break
images = get_images_page(html_code)
download_images(images, path)
html_code_old = html_code
pagenum += 1
print("Done downloading all images from " + url)
if __name__ == '__main__':
main()
The line

filename = re.findall("([^/]*).(?:jpg|gif|png)", im)[0]

does not do what you think it does. First off, the dot is unescaped, meaning it will match any character, not just a period.
But the bigger problem is that you mixed up the groups. You're accessing the value of the first group in the match, which is the first part inside parentheses, giving you only the base filename without the extension. The second group, containing the extension, is a separate, non-capturing group; the (?:...) syntax makes a group non-capturing.
The way I fixed it was by putting a group around the entire match and making the existing groups non-capturing:

re.findall("((?:[^/]*)\.(?:jpg|gif|png))", im)[0]

P.S. Another problem is that the pattern is greedy, so it can match multiple filenames at once. That isn't necessarily invalid, since spaces and periods are allowed in filenames, so if you want to handle multiple filenames here you'll have to decide what to do yourself. Something like "((?:\w+)\.(?:jpg|gif|png))" would be more intuitive, though.
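As a quick sanity check of the corrected pattern, here is my own example with a made-up URL (Python 2, as in the question):

import os
import re
from urlparse import urlparse  # urllib.parse in Python 3

im = "http://example.com/media/photo_500.jpg"
print(re.findall(r"((?:[^/]*)\.(?:jpg|gif|png))", im)[0])   # photo_500.jpg

# an alternative that avoids the regex entirely: take the last path segment,
# which already carries the extension
print(os.path.basename(urlparse(im).path))                  # photo_500.jpg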

I don't see why this code is not working! Can someone please tell me what I am doing wrong?

I keep getting an error, but I don't see it.
I am new to programming, so if you explain the code to me, please don't assume I know too much.
#!/usr/bin/env python
# Name:
# Student number:
'''
This script crawls the IMDB top 250 movies.
'''
# Python standard library imports
import os
import sys
import csv
import codecs
import cStringIO
import errno
# Third party library imports:
import pattern
from pattern.web import URL, DOM
# --------------------------------------------------------------------------
# Constants:
TOP_250_URL = 'http://www.imdb.com/chart/top'
OUTPUT_CSV = 'top250movies.csv'
SCRIPT_DIR = os.path.split(os.path.realpath(__file__))[0]
BACKUP_DIR = os.path.join(SCRIPT_DIR, 'HTML_BACKUPS')
# --------------------------------------------------------------------------
# Unicode reading/writing functionality for the Python CSV module, taken
# from the Python.org csv module documentation (very slightly adapted).
# Source: http://docs.python.org/2/library/csv.html (retrieved 2014-03-09).
class UTF8Recoder(object):
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader(object):
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter(object):
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
# --------------------------------------------------------------------------
# Utility functions (no need to edit):
def create_dir(directory):
'''
Create directory if needed.
Args:
directory: string, path of directory to be made
Note: the backup directory is used to save the HTML of the pages you
crawl.
'''
try:
os.makedirs(directory)
except OSError as e:
if e.errno == errno.EEXIST:
# Backup directory already exists, no problem for this script,
# just ignore the exception and carry on.
pass
else:
# All errors other than an already exising backup directory
# are not handled, so the exception is re-raised and the
# script will crash here.
raise
def save_csv(filename, rows):
'''
Save CSV file with the top 250 most popular movies on IMDB.
Args:
filename: string filename for the CSV file
rows: list of rows to be saved (250 movies in this exercise)
'''
with open(filename, 'wb') as f:
writer = UnicodeWriter(f) # implicitly UTF-8
writer.writerow([
'title', 'runtime', 'genre(s)', 'director(s)', 'writer(s)',
'actor(s)', 'rating(s)', 'number of rating(s)'
])
writer.writerows(rows)
def make_backup(filename, html):
'''
Save HTML to file.
Args:
filename: absolute path of file to save
html: (unicode) string of the html file
'''
with open(filename, 'wb') as f:
f.write(html)
def main():
'''
Crawl the IMDB top 250 movies, save CSV with their information.
Note:
This function also makes backups of the HTML files in a sub-directory
called HTML_BACKUPS (those will be used in grading).
'''
# Create a directory to store copies of all the relevant HTML files (those
# will be used in testing).
print 'Setting up backup dir if needed ...'
create_dir(BACKUP_DIR)
# Make backup of the IMDB top 250 movies page
print 'Access top 250 page, making backup ...'
top_250_url = URL(TOP_250_URL)
top_250_html = top_250_url.download(cached=True)
make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)
# extract the top 250 movies
print 'Scraping top 250 page ...'
url_strings = scrape_top_250(top_250_url)
# grab all relevant information from the 250 movie web pages
rows = []
for i, url in enumerate(url_strings): # Enumerate, a great Python trick!
print 'Scraping movie %d ...' % i
# Grab web page
movie_html = URL(url).download(cached=True)
# Extract relevant information for each movie
movie_dom = DOM(movie_html)
rows.append(scrape_movie_page(movie_dom))
# Save one of the IMDB's movie pages (for testing)
if i == 83:
html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
make_backup(html_file, movie_html)
# Save a CSV file with the relevant information for the top 250 movies.
print 'Saving CSV ...'
save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
The function below should return the webpage links of the top 250 movies:
# --------------------------------------------------------------------------
# Functions to adapt or provide implementations for:
def scrape_top_250(url):
'''
Scrape the IMDB top 250 movies index page.
Args:
url: pattern.web.URL instance pointing to the top 250 index page
Returns:
A list of strings, where each string is the URL to a movie's page on
IMDB, note that these URLS must be absolute (i.e. include the http
part, the domain part and the path part).
'''
movie_urls = []
table_rows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in table_rows[1:]:
a = tr.by_tag('a')[0]
movie_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))
# YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
# URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.
# return the list of URLs of each movie's page on IMDB
return movie_urls
#print scrape_top_250(url)
And finally, this function should return specific contents:
def scrape_movie_page(dom):
'''
Scrape the IMDB page for a single movie
Args:
dom: pattern.web.DOM instance representing the page of 1 single
movie.
Returns:
A list of strings representing the following (in order): title, year,
duration, genre(s) (semicolon separated if several), director(s)
(semicolon separated if several), writer(s) (semicolon separated if
several), actor(s) (semicolon separated if several), rating, number
of ratings.
'''
# YOUR SCRAPING CODE GOES HERE:
for p in movie_urls:
p_url = URL(p)
p_dom = DOM(p_url.download(cached=True))
title = clean_unicode(p_dom.by_class('header')[0].content)
title = plaintext(strip_between('<span', '</span>', title))
runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
duration = runtime
genres = []
for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
genres.append(clean_unicode(genre.content))
directors = []
writers = []
actors = []
text_blocks = p_dom.by_class('txt-block')[:3]
for t in text_blocks:
spans = t.by_tag('span')
for s in spans:
if s.attributes.get('itemprop') == 'director':
director = s.by_tag('span')[0].by_tag('a')[0].content
directors.append(clean_unicode(director))
if s.attributes.get('itemprop') == 'writer':
p_writer = s.by_tag('span')[0].by_tag('a')[0].content
writers.append(clean_unicode(p_writer))
if s.attributes.get('itemprop') == 'actors':
actor = s.by_tag('span')[0].by_tag('a')[0].content
actors.append(clean_unicode(actor))
rating = []
ratings_count = []
spans = p_dom.by_class('star-box-details')[0].by_tag('span')
for s in spans:
if s.attributes.get('itemprop') == 'ratingValue':
rating = clean_unicode(s.content)
if s.attributes.get('itemprop') == 'ratingCount':
ratings_count = clean_unicode(s.content)
# format the strings from lists
genres = concat_strings(genres)
directors = concat_strings(directors)
writers = concat_strings(writers)
actors = concat_strings(actors)
# Return everything of interest for this movie (all strings as specified
# in the docstring of this function).
return title, duration, genres, directors, writers, actors, rating, \
n_ratings
if __name__ == '__main__':
main() # call into the progam
# If you want to test the functions you wrote, you can do that here:
# ...
It's just that (in the original revision) you forgot to indent the body of the function scrape_movie_page. The for loop is in module scope.
The most common cause of this error is not indenting the body of the function properly, but sometimes the code looks correct from an indentation point of view and still throws the same error. In my experience this comes from mixed indentation: if, within the same block, some lines are indented with tabs and others with spaces, the code looks fine visually but still raises an indentation error.
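To make the first answer's fix concrete, here is a minimal self-contained sketch of the required structure; it is my own illustration, with the real scraping logic elided and a plain list standing in for the DOM object:

def scrape_movie_page(dom):
    # the whole body, including the loop and the return, is indented
    # one level under the def line; if the loop were left at module
    # scope (as in the original revision), Python reports an error
    rows = []
    for node in dom:
        rows.append(node)
    return rows

print(scrape_movie_page(["a", "b"]))   # ['a', 'b']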

How do I fix KeyError while parsing instagram?

Everyone,
I have a small script that parses names on Instagram.
Recently it started giving this error:
Traceback (most recent call last):
File "/home/jpegcoma/vk/posting_instagram.py", line 361, in <module>
main()
File "/home/jpegcoma/vk/posting_instagram.py", line 293, in main
table_of_content = get_stuf_from_url(urls)
File "/home/jpegcoma/vk/posting_instagram.py", line 64, in get_stuf_from_url
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
KeyError: 'ProfilePage'
Currently it is running on a server. However, I tried it on my laptop and the script worked there.
Here is the code that does the work:
import requests
import json
import os
import random
from time import sleep
import time
import re
from io import BytesIO
from PIL import Image
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
file_name = 'users_names.txt'
def create_folder_photos():
if os.path.isdir(os.path.join(os.getcwd(), "photos")) == False:
os.makedirs(os.path.join(os.getcwd(), "photos"))
else:
pass
def make_list_of_users_to_scrap(file_name):
'''Opens file with instagram user_names.
Every name should be on a new line.
Prepares full URL for parsing.
Returns list URLs'''
path = os.path.join(os.getcwd(), file_name)
base_url = 'https://www.instagram.com/'
users_url_dic = []
with open(path, 'r') as file:
for name in file:
users_url_dic.append(base_url + name.rstrip() + '/')
return users_url_dic
def parsed_data(shared_data):
'''Get to ["edges"] node in shared_data from instagram'''
return json.loads(shared_data)['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
def get_stuf_from_url(urls):
# Open a request session
with requests.session() as s:
# Add some headers in case
s.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
pool = ThreadPool(5)
d={}
# Go throught all the URLs on instagram
responce = pool.map(requests.get, urls)
pool.close()
pool.join()
for i in responce:
c = i.text
if 30000 < len(c) < 180000:
# Clean html, take only content of 'sharedData' part
shared_data = c.split('window._sharedData = ')[1].split(';</script>')[0]
# Check is accaunt is private
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
# Go throught all the nodes:
# If video - pass.
# If photo - take {"Id":"URL"}
for node in parsed_data(shared_data)[::]:
if node['node']['is_video'] == False:
d[node['node']['id']] = node['node']['display_url']
else:
continue
else:
continue
else:
continue
return d
def check_for_new(new_data_from_request):
'''Open 'before_log.txt with previous loggs {'id':'url'}
Check if any new data is presented.
Write 2 new files:
"added.txt" - new photos with url from the last time
"before_log.txt" - updated log with all the ids and urls
returns dic with added {'id':'url'} photos'''
# Open a before_log.txt or say that no such file is presented.
if os.path.isfile(os.path.join(os.getcwd(), 'before_log.txt')):
with open(os.path.join(os.getcwd(), 'before_log.txt'), mode='r', encoding='utf8') as f_file:
before_log = json.load(f_file)
else:
print('Need to make "before_log.txt" file to use the script!!!')
# Get new data from "def get_stuf_from_url(urls):"
after = new_data_from_request
# Check if any new photos is avaliable
added = {i:after[i] for i in after if not i in before_log}
# Add new {key:value} to before_log
for key, value in after.items():
if key not in before_log.keys():
before_log[key] = value
# Write added and before_log for future use
with open(os.path.join(os.getcwd(), 'added.txt'), mode='w', encoding='utf8') as add_file:
add_file.write(json.dumps(added) + '\n')
with open(os.path.join(os.getcwd(), 'before_log.txt'), mode='w', encoding='utf8') as out_file:
out_file.write(json.dumps(before_log) + '\n')
print('We got {} new photos.'.format(len(added)))
return added
def createFilename(url, name, folder):
result = re.split(r'.jpg', url)
slashSplit = result[0].split('/')
if name == None:
name = slashSplit[-1]
ext = "jpg"
file = '{}{}.{}'.format(folder, name, ext)
return file
def getImageFast(url, name=None, folder= os.path.join(os.getcwd(), "photos/")):
'''Download new photos from instagram
Creates a photos folder'''
print("Downloading photos.....")
file = createFilename(url, name, folder)
r = requests.get(url, stream=True)
i = Image.open(BytesIO(r.content))
i.save(file)
I guess the problem is somewhere in here
if json.loads(shared_data)["entry_data"]["ProfilePage"][0]["graphql"]["user"]["is_private"] == False:
Some examples of parsed names on instagram:
_nail_ann_
alena.nails.tallinn
alyne_nails
anna_nails_erbil
aquarelle_nailstudio
cantinhodalara_nails
In a shorter version it does work as intended:
urls = 'https://www.instagram.com/_linails_/'
responce = requests.get(urls)
response_text = responce.text
shared_data = response_text.split('window._sharedData = ')[1].split(';</script>')[0]
# print(shared_data)
d = {}
f = json.loads(shared_data)['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
for node in f[::]:
    if node['node']['is_video'] == False:
        d[node['node']['id']] = node['node']['display_url']
    else:
        continue
print(d)
After running it I get all the URLs and ids I need:
{
'2073876006313498489': 'https://scontent-lax3-2.cdninstagram.com/vp/6e5c8c22e54aa0c853ee88db05dc79bf/5E1BBCA4/t51.2885-15/e35/65217639_634723610367271_4450988163128206846_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=107',
'2024498169693824735': 'https://scontent-lax3-2.cdninstagram.com/vp/39188272c2305ed250ad466c7a715b91/5E2F4B15/t51.2885-15/e35/56352792_132736304460754_8293153588685230511_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=110',
'2023266828574689831': 'https://scontent-lax3-2.cdninstagram.com/vp/f313d44c5bd398a8e6b3f04fb7dbb739/5E2BBB71/t51.2885-15/e35/56578225_1055286461334820_1507399846418163801_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=104',
'2016110942668250132': 'https://scontent-lax3-2.cdninstagram.com/vp/349bbf6a920e440a4e71d5b2d149a61b/5E2BB7FE/t51.2885-15/e35/53745148_280247652888437_7055433742029015170_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=105',
'2012783478764415885': 'https://scontent-lax3-2.cdninstagram.com/vp/72dfe2f67b6dc1ea75e2ddd832384475/5E1936CE/t51.2885-15/e35/54512001_2155869857812437_3429908829670998264_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=109',
'2012464856204377926': 'https://scontent-lax3-2.cdninstagram.com/vp/5aefc3a4e047b08dc94366b0723f170d/5E32A5D3/t51.2885-15/e35/54513720_424627718315641_3423874379564248817_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=101',
'2008135031155279090': 'https://scontent-lax3-2.cdninstagram.com/vp/09cc2e7631c115a0131bda6f597dde60/5E1B4C09/t51.2885-15/e35/53156526_1025783867629475_1693464480553968728_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111',
'2004990756607236359': 'https://scontent-lax3-2.cdninstagram.com/vp/5da04c640d70b52a3e3073667985f8e3/5E2A62EB/t51.2885-15/e35/54266355_225989821600275_560245954300705815_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=103',
'2002388991416431681': 'https://scontent-lax3-2.cdninstagram.com/vp/77bb0bf9878ca2d175dbd51350c1ef03/5E37974D/t51.2885-15/e35/53217305_581829868953428_1147405223061346025_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=108',
'2001312091952564411': 'https://scontent-lax3-2.cdninstagram.com/vp/64326e9675b389a7997ed86980cba7bc/5E30992A/t51.2885-15/e35/54513758_391705221628749_737855016941810571_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=109',
'1999425996532762294': 'https://scontent-lax3-2.cdninstagram.com/vp/4c4a5ee2b0ad46d6e3eeb1a30c1e9130/5E1BC2CA/t51.2885-15/e35/52639028_2494445767266095_4453054116414455580_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=111',
'1993652807341169347': 'https://scontent-lax3-2.cdninstagram.com/vp/d6d8ffef7fd23d1f12b14282d3bc9aca/5E17386F/t51.2885-15/e35/52024250_786523341734970_6491735451376989098_n.jpg?_nc_ht=scontent-lax3-2.cdninstagram.com&_nc_cat=106'
}
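The KeyError means that for some responses the decoded sharedData has no 'ProfilePage' entry at all, which can happen when Instagram returns a login or challenge page instead of the profile (for example when requests come from a flagged server IP). Below is a minimal defensive sketch, my own and not part of the question, that skips such responses instead of crashing; the helper name is hypothetical:

import json

def profile_from_shared_data(shared_data):
    # return the user dict, or None if this response carries no ProfilePage
    entry_data = json.loads(shared_data).get("entry_data", {})
    pages = entry_data.get("ProfilePage")
    if not pages:
        return None
    return pages[0]["graphql"]["user"]

# inside the response loop of get_stuf_from_url:
# user = profile_from_shared_data(shared_data)
# if user is None or user["is_private"]:
#     continue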
