How to optimize the memory usage of my python crawler - python

I am learning python crawler these days, and I write a simple crawler to get the picture on the Pixiv by Pixiv ID.
It works quite well, but here comes a big problem: When it is running, it takes up nearly 1.2G memory on my computer.
However, sometimes it just takes up just 10M memory, I really don't know which code causes such big usage of memory.
I have uploaded the script to my VPS(Only 768M memory Vulter server) and tried to run. As a result, I get a MerroyError.
So I wonder how to optimize the memory usage(even if taking more time to run).
Here is my code:
(I have rewrote all the code to make it pass pep8, if still unclear, please tell me which code makes you confused.)
from lxml import etree
import re
import os
import requests
# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p0")
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Picture_Type = [".png", ".jpg", ".gif"]
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
+ Posttime+Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"\
+ str(Pixiv_ID) + "_p0"\
+ Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
os.mkdir("./pic/" + str(Pixiv_ID))
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p")
Manga_URL = "http://www.pixiv.net/"+Tag_a_href
Manga_HTML = requests.get(Manga_URL, headers=Headers)
Manga_XML = etree.HTML(Manga_HTML.content)
Manga_Pages = Manga_XML.xpath('/html/body'
'/nav[#class="page-menu"]'
'/div[#class="page"]'
'/span[#class="total"]/text()')[0]
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Manga_HTML.close()
Picture_Type = [".png", ".jpg", ".gif"]
for Number in range(int(Manga_Pages)):
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + \
".pixiv.net/img-original/"\
+ Posttime + str(Number) + Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"+str(Pixiv_ID) + "/"\
+ str(Pixiv_ID) + "_p"\
+ str(Number) + Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Main function.
def get_pic(Pixiv_ID):
Index_URL = "http://www.pixiv.net/member_illust.php?"\
"mode=medium&illust_id="+str(Pixiv_ID)
Headers = {'referer': Index_URL}
Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
if Index_HTML.status_code != 200:
return Index_HTML.status_code
Index_XML = etree.HTML(Index_HTML.content)
Tag_a_href_List = Index_XML.xpath('/html/body'
'/div[#id="wrapper"]'
'/div[#class="newindex"]'
'/div[#class="newindex-inner"]'
'/div[#class="newindex-bg-container"]'
'/div[#class="cool-work"]'
'/div[#class="cool-work-main"]'
'/div[#class="img-container"]'
'/a/#href')
Tag_img_src_List = Index_XML.xpath('/html/body'
'/div[#id="wrapper"]'
'/div[#class="newindex"]'
'/div[#class="newindex-inner"]'
'/div[#class="newindex-bg-container"]'
'/div[#class="cool-work"]'
'/div[#class="cool-work-main"]'
'/div[#class="img-container"]'
'/a/img/#src')
if Tag_a_href_List == [] or Tag_img_src_List == []:
return 404
else:
Tag_a_href = Tag_a_href_List[0]
Tag_img_src = Tag_img_src_List[0]
Index_HTML.close()
if Tag_a_href.find("manga") != -1:
return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
else:
return get_single(Pixiv_ID, Tag_img_src, Headers)
# Check whether the picture already exists.
def check_exist(Pixiv_ID):
if not os.path.isdir("Pic"):
os.mkdir("Pic")
if os.path.isdir("./Pic/"+str(Pixiv_ID)):
return True
Picture_Type = [".png", ".jpg", ".gif"]
Picture_Exist = False
for i in range(len(Picture_Type)):
Path = "./Pic/" + str(Pixiv_ID)\
+ "_p0" + Picture_Type[i]
if os.path.isfile(Path):
return True
return Picture_Exist
# The script starts here.
for i in range(0, 38849402):
Pixiv_ID = 38849402-i
Picture_Exist = check_exist(Pixiv_ID)
if not Picture_Exist:
Return_Code = get_pic(Pixiv_ID)
if Return_Code == 200:
print str(Pixiv_ID), "finish!"
elif Return_Code == -1:
print str(Pixiv_ID), "got an unknown error."
elif Return_Code == 404:
print str(Pixiv_ID), "not found. Maybe deleted."
else:
print str(Pixiv_ID), "picture exists!"

OMG!
Finally, I know what goes wrong.
I use mem_top() to see what takes up the memory.
Guess what?
It is for i in range(0, 38849402):
In the memory, there is a list [0, 1, 2, 3 ... 38849401], which takes up my memory.
I change it to :
Pixiv_ID = 38849402
while Pixiv_ID > 0:
some code here
Pixiv_ID = Pixiv_ID-1
Now the memory usage is just no more than 20M.
Feeling excited!

Related

python - Downloading file from website using requests

I'm trying to download mp3 files from https://www.alain-pennec.bzh/editions/livre-mp3/mp3/ with the following code :
import requests
def main():
for i in range(0, 226) :
URL = "https://www.alain-pennec.bzh/editions/livre-mp3/mp3/AP"+numberOn3Char(i)+".mp3"
response = requests.get(URL)
open("AP"+numberOn3Char(i)+".mp3", "wb").write(response.content)
print(numberOn3Char(i)+" OK")
def numberOn3Char(i):
ret = str(i)
if len(ret) == 1 :
ret = '00' + ret
elif len(ret) == 2 :
ret = '0' + ret
return ret
main()
My issue is that nothing is being downloaded. I get no error and I don't understand why it's not working.
Try changing
open("AP"+numberOn3Char(i)+".mp3", "wb").write(response.content)
to
name = "AP"+numberOn3Char(i)+".mp3"
with open(name, 'wb') as f:
f.write(response.content)
and see if it works.

download files from internet using urllib2

I am trying to download files from the internet for research purposes, but when I tried to move from python2 to python3. I got an error TypeError: a bytes-like object is required, not 'str' because python3 treats string different,but I had to change from .content to .text, and it fixed, but it doesn'tenter code here download the files, but it's grabbing them, how can I force to download them?
def downloadFile(self, url):
fDir=self.outputDir
local_file = None
if not os.path.isdir(fDir):
os.makedirs(fDir)
try:
f = urllib.request.urlopen(url, timeout=10)
for x in range(len(self.signature)):
if ord(f.read(1))!=self.signature[x]:
f.close()
raise
local_file=open("%s/file%08d.%s" % (fDir, self.successCount, self.extension), "wb")
for x in range(len(self.signature)):
local_file.write(chr(self.signature[x]))
local_file.write(f.read())
local_file.close()
f.close()
except KeyboardInterrupt:
raise
except:
if local_file != None:
local_file.close()
for x in range(10):
try:
if os.path.isfile("%s/file%08d.%s" % (fDir, self.successCount, self.extension)):
os.remove("%s/file%08d.%s" % (fDir, self.successCount, self.extension))
break
except:
if x==9:
raise
time.sleep(1)
return
self.successCount += 1
def search(self):
if self.extension == None or self.extension == "":
print("ERROR: No extension specified!")
return
if len(self.signature) == 0:
print("WARNING: No signature specified - THERE WILL BE LOT OF FALSE RESULTS :(")
print("Starting with search")
print("---------------------")
print("Extension: " + self.extension)
print("Signature: " + self.signatureText())
print("Starting search base: " + self.searchCharsText())
print("Output dir: " + self.outputDir)
print("Max results per search: " + str(self.maxPerSearch))
self.searchReal("")
pos=r.text.find('<a href="')
while pos != -1:
pos2_a=r.text.find('"', pos+16)
pos2_b=r.text.find('&', pos+16)
if pos2_a == -1:
pos2 = pos2_b
elif pos2_b == -1:
pos2 = pos2_a
else:
pos2 = min (pos2_a, pos2_b)
if pos2 == -1:
break;
url = r.text[pos+16:pos2]
if url.find('.google.') == -1 and url.startswith('http'):
blocked = False
if url not in self.downloaded:
self.downloadFile(url)
self.downloaded.append(url)
f.write(url + "\n")
pos_a=r.text.find('<a href="', pos+1)
pos_b=r.text.find('a href="/url?q=', pos+1)
if pos_a == -1:
pos = pos_b
elif pos_b == -1:
pos = pos_a
else:
pos=min(pos_a, pos_b)
log
http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf
https://www.deanza.edu/articulation/documents/ge-aa-as-dac.pdf
https://aamexico.org.mx/media/Lista_de_precios_%2520vigentes.pdf
https://www.aflglobal.com/productlist/Product-Lines/Conductor-Accessories/230kV-Aluminum-Welded-Bus-Pipe-Supports/doc/230kv-aluminum-welded-bus-supports.aspx
it looks like you have some extra code in there for your own purposes..but if it helps, downloading a file from the internet can be as simple as:
import urllib.request
url = 'http://www.aamalaysia.org/pdf/p-1_thisisaa1.pdf'
out_file = 'file.pdf'
data = urllib.request.urlopen(url).read()
with open(out_file,'wb') as out:
out.write(data)

Use of files on hard drives instead of url with python

I would like to modify this script to use offline files, if I download the files from url works, but if the same file as I withdraw from hard drives, does not open, someone helps me to understand why and how to do, thank you.
def INDEX():
TVLIST('https://www.*********/playlist/*******/test.m3u')
def TVLIST(url):
try:
m3u = getHtml(url)
parsem3u(m3u)
except:
addDir('Nothing found', '', '', '', Folder=False)
xbmcplugin.endOfDirectory(int(sys.argv[1]))
urlopen = urllib2.urlopen
Request = urllib2.Request
def getHtml(url, referer=None, hdr=None, data=None):
if not hdr:
req = Request(url, data, headers)
else:
req = Request(url, data, hdr)
if referer:
req.add_header('Referer', referer)
if data:
req.add_header('Content-Length', len(data))
response = urlopen(req)
if response.info().get('Content-Encoding') == 'gzip':
buf = StringIO( response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read()
f.close()
else:
data = response.read()
response.close()
return data
def parsem3u(html, sitechk=True):
match = re.compile('#.+,(.+?)\n(.+?)\n').findall(html)
txtfilter = txtfilter = GETFILTER()
txtfilter = txtfilter.split(',') if txtfilter else []
txtfilter = [f.lower().strip() for f in txtfilter]
i = 0
count = 0
for name, url in match:
status = ""
url = url.replace('\r','')
if not txtfilter or any(f in name.lower() for f in txtfilter):
if sitechk:
if i < 5:
try:
siteup = urllib.urlopen(url).getcode()
status = " [COLOR red]offline[/COLOR]" if siteup != 200 else " [COLOR green]online[/COLOR]"
except: status = " [COLOR red]offline[/COLOR]"
i += 1
addPlayLink(name+status, url, 3, uiptvicon)
count += 1
return count
I thought, was enough to put the local path
def INDEX():
TVLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')
who explains why it does not work and how can I do? Thank you
As suggested by #languitar in the comments you would have file:// which of course it should work for windows, but moving to a platform like android, you have different file system there, you don't have C drive. So make sure you got an alternative location on the android.

Best way to make thousands of get requests in python

Right now I am working on a python script which takes in a list of url's as an argument, then performs a GET request on each url and then searches through the output with xpath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything after that causes the program to slow down to the point where it stop (usually around 150 sites). Scroll down to where you see main app logic and the relevant code it below. Right now I am just using 50 elements in the array and it works fine, but anything after makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
try:
page = requests.get(url, timeout=2.0)
if page.status_code == requests.codes.ok:
html_page = html.fromstring(page.content)
s =requests.session()
s.close()
return html_page
else:
s =requests.session()
s.close()
return False
except:
s =requests.session()
s.close()
return False
# Format URL
def format_url(url):
if url.find("http://") == -1:
url = "http://"+url
if url[-1] == "/":
url = url[:-1]
return url
# Check if WordPress Site
def check_wordpress(tree):
scripts = tree.xpath("//script[contains(#src,'wp-content')]")
if len(scripts) > 0:
return True
return False
# Check WordPress Version
def wordpress_version(tree):
type = tree.xpath("//meta[#name='generator']/#content")
version = 0
if len(type) > 0:
details = type[0].split()
if len(details)>1 and details[0] == "WordPress":
if len(details) > 1:
version = details[1]
else:
version = type[0]
return version
# Find Contact Page
def find_contact_page(tree):
contact = tree.xpath("//a[contains(text(),'Contact')]/#href")
try_xpath = 1
while len(contact) == 0:
if try_xpath == 1:
contact = tree.xpath("//span[contains(text(),'Contact')]/../#href")
elif try_xpath == 2:
contact = tree.xpath("//p[contains(text(),'Contact')]/../#href")
elif try_xpath == 3:
break
try_xpath+=1
if len(contact) > 0:
contact = contact[0]
if contact.find('#') == -1:
if contact[0] == '/':
contact = url + "" + contact
print contact
# Juicer method
def juice(url):
url = format_url(url)
string = url
tree = scrape(url)
if tree == False:
return string + " \t\t\t No XML tree"
elif check_wordpress(tree) == True:
version = wordpress_version(tree)
return string + " \t\t\t WordPress: " + str(version)
else:
return string + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1],'r').read().split('\n')
# Juice url
def juice_url():
while True:
url = q.get()
result = juice(url)
print result
q.task_done()
# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
t = Thread(target=juice_url)
t.daemon = True
t.start()
# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
q.put(url)
q.join()
# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)

Python appending to binary file works in Unix, not in Windows

The function below receives file chunks from a web requests and assembles them. It works perfectly in Unix (OSX) but on Windows, it doesn't. Specifically, the file does assemble, however it always ends up too small, just a few KB. I cannot figure out what is causing this. No exceptions are raised, it all appears to work, except that the final file is not all there. I've included the entire function for context but I've marked the section which appears not to be working correctly. (Python 2.7 and Windows Server 2008 R2)
#view_config(route_name='upload', renderer='json')
def upload(request):
r = request.response
final_dir = 'w:\\foobar'
filename = request.params.get('flowFilename')
chunk_number = request.params.get('flowChunkNumber')
total_chunks = request.params.get('flowTotalChunks')
try:
temp_dir = os.path.join(final_dir, request.params.get('flowIdentifier'))
file_part = os.path.join(temp_dir, '%s.part.%s' % (filename, chunk_number))
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
except TypeError:
pass
if request.method == 'GET':
if file_part:
if os.path.isfile(file_part):
r.status = 200
else:
r.status = 404
return r
if request.POST:
try:
fo = request.params.get('file').file
f = open(file_part, 'wb')
f.write(fo.read())
f.close()
if chunk_number == total_chunks:
final_filename = os.path.join(final_dir, filename)
temp_filename = filename + '_INCOMPLETE'
#####################################################################
# This is where is appears to be going wrong...
final_file = open(temp_filename, 'a+b')
try:
for i in range(1, int(total_chunks) + 1):
ff = open(os.path.join(temp_dir, '%s.part.%s' % (filename, i)))
final_file.write(ff.read())
ff.close()
final_file.close()
os.rename(temp_filename, final_filename) # rename to final filename
shutil.rmtree(temp_dir) # clean up temp part files
except:
raise
####################################################################
r.status = 200
except Exception, e:
print 'ERROR', e.message
r.status = 404
return r

Categories