How to request multiple url at one time using urllib in python - python

I'm programing a program for downloading images from internet and I would like to speed it up using multiple requests at once.
So I wrote a code you can see here at GitHub.
I can request for webpage only like this:
def myrequest(url):
worked = False
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
while not worked:
try:
webpage_read = urlopen(req).read()
worked = True
except:
print("failed to connect to \n{}".format(url))
return(webpage_read)
url = "http://www.mangahere.co/manga/mysterious_girlfriend_x"
webpage_read = myrequest(url).decode("utf-8")
The while is here because I definitely want to download every single picture, so I'm trying until it work (nothing can go wrong except urllib.error.HTTPError: HTTP Error 504: Gateway Time-out)
My question is, how to run that multiple times at once?
My idea is to have " a comander" which will run 5 (or 85) pythonic scripts, give each url and get webpage from them once they are finished, but this is definitely a silly solution :)
EDIT:
I used _thread but it doesn't seem to speed up the program. That should have been the solution am I doing it wrong? that is my new question.
You can use link do get to my code on GitHub
def thrue_thread_download_pics(path, url, ep, name):
lock.acquire()
global goal
goal += 1
lock.release()
webpage_read = myrequest("{}/{}.html".format(url, ep))
url_to_pic = webpage_read.decode("utf-8").split('" onerror="')[0].split('<img src="')[-1]
pic = myrequest(url_to_pic)
myfile = open("{}/pics/{}.jpg".format(path, name), "wb")
myfile.write(pic)
myfile.close()
global finished
finished += 1
and I'm using it here:
for url_ep in urls_eps:
url, maxep = url_ep.split()
maxep = int(maxep)
chap = url.split("/")[-1][2:]
if "." in chap:
chap = chap.replace(".", "")
else:
chap = "{}0".format(chap)
for ep in range(1, maxep + 1):
ted = time.time()
name = "{}{}".format(chap, "{}{}".format((2 - len(str(ep))) * "0", ep))
if name in downloaded:
continue
_thread.start_new_thread(thrue_thread_download_pics, (path, url, ep, name))
checker = -1
while finished != goal:
if finished != checker:
checker = finished
print("{} of {} downloaded".format(finished, goal))
time.sleep(0.1)

Requests Futures is built on top of the very popular requests library and uses non-blocking IO:
from requests_futures.sessions import FuturesSession
session = FuturesSession()
# These requests will run at the same time
future_one = session.get('http://httpbin.org/get')
future_two = session.get('http://httpbin.org/get?foo=bar')
# Get the first result
response_one = future_one.result()
print(response_one.status_code)
print(response_one.text)
# Get the second result
response_two = future_two.result()
print(response_two.status_code)
print(response_two.text)

Related

How to do this (check requests status codes) in Python concurrently or in parallel?

Here's what I'm doing:
Get words from a text file - every word is on a separate line.
Add http://www. and .com to words to create a url.
Get the URL with requests.
Find out if it's a free domain or not (based on status codes and
error in connection/other error).
Add free domains to a text file.
Time it all.
I've kind of made it work so far but it's very slow. The text file has 350 000 words. How would I go about doing this concurrently or in parallel? Also which would be a better choice for this task?
Here's my code:
import requests, time
start = time.time()
with open('words1.txt','r') as f:
words = []
for item in f:
words.append(item.strip())
for w in words:
url = 'http://www.'+w+'.com'
try:
header = {'User-Agent': 'Mozilla/5.0'}
r = requests.get(url, headers=header)
codes = [200,201,202,203,204,205,206,300,301,302,303,307,308,400,401,402,403,404,405,406,500,501,502,503]
if r.status_code in codes:
print(url,': Known Status Code > Unavailable')
else:
print(url,': Unknown Status Code > Probably Free')
with open('available.txt','a') as myfile:
myfile.write(url+'\n')
except requests.exceptions.ConnectionError:
print(url,' : Connection Error > Probably Free')
with open('available.txt','a') as myfile:
myfile.write(url+'\n')
except requests.exceptions.HTTPError:
print('http error')
except requests.exceptions.Timeout:
print('timeout error')
except requests.exceptions.TooManyRedirects:
print('too many redirects')
end = time.time()
print('\n')
print(end-start, 'seconds')
print((end-start)/60,'minutes')
print(((end-start)/60)/60,'hours')
Thanks!
EDIT: I got it to work. Thanks for the help Kendas and DeepSpace!
Here's a quick test:
100 words - 22 sec
1000 words - 285 sec
Not too fast but way faster than my first try.
Seems like gevent + socket is the way to go.
Please let me know if you have any tips on making this better/faster.
Here's the code:
import gevent,time
from gevent import socket
start = time.time()
words = []
with open('words1000.txt','r') as f:
for item in f:
words.append(item.strip())
urls = ['www.{}.com'.format(w) for w in words]
jobs = [gevent.spawn(socket.gethostbyname, url) for url in urls]
gevent.joinall(jobs)
values = {url:job.value for (url,job) in zip(urls,jobs)}
freeDomains = []
for (v,job,url) in zip(values,jobs,urls):
if job.value == None:
freeDomains.append(url)
with open('availableds.txt','a') as myFile:
myFile.write(url+'\n')
print(freeDomains)
end = time.time()
print(end-start,'seconds')
print((end-start)/60,'minutes')
print((end-start)/3600,'hours')
grequests (the concurrent version of requests) makes this pretty easy.
It will also help to use .format and not redefining header every iteration
import grequests
def exception_handler(request, exception):
print(exception)
with open('words1.txt','r') as f:
words = []
for item in f:
words.append(item.strip())
urls = ['http://www.{}.com'.format(w) for w in words]
header = {'User-Agent': 'Mozilla/5.0'}
requests = [grequests.get(url) for url in urls]
responses = grequests.map(requests, exception_handler=exception_handler)
for resp in responses:
if resp:
print(resp.status_code)

Sequential requests using python-requests

right now I'm using Flask, and I'm having trouble while trying to do more than one GET request using python requests module.
If I try to send a series of requests, the first one is completed successfully, but the other ones throw a timeout exception.
Here is part of the view's code:
import requests
sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
try:
req = sess.get(site_url + s, timeout=5))
except requests.exceptions.Timeout:
return jsonify({'result':False, 'error':'timeout'})
except requests.exceptions.ConnectionError:
return jsonify({'result':False, 'error':'connection_error'})
else:
step_responses[s] = True
If I extract this part into a standalone .py file, it completes successfully.
import requests
sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
try:
req = sess.get(site_url + s, timeout=5)
except requests.exceptions.Timeout:
step_responses[s] = 'timeout'
except requests.exceptions.ConnectionError:
step_responses[s] = 'conn_error'
else:
step_responses[s] = 'ok'
print step_responses
Works for me. You may want to check the second and third steps
import requests
sess = requests.Session()
def module():
site_url = 'http://stackoverflow.com/'
steps = ['users', 'questions', 'tags']
step_responses = dict()
for s in steps:
try:
req = sess.get(site_url + s, timeout=5)
except requests.exceptions.Timeout:
return jsonify({'result':False, 'error':'timeout'})
except requests.exceptions.ConnectionError:
return jsonify({'result':False, 'error':'connection_error'})
else:
step_responses[s] = True
You might want to make sure that you read all the values from the req object.
I think you might need req.text and req.status_code or req.content
Check half-way down the page here: http://docs.python-requests.org/en/latest/api/#request-sessions where they discuss session parameters
"class requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=0, pool_block=False)"
I'm not at all sure how to use connection pools and so forth but the docs do say (http://docs.python-requests.org/en/latest/user/advanced/) (Look for Keep Alive)
"Note that connections are only released back to the pool for reuse once all body data has been read; be sure to either set stream to False or read the content property of the Response object."

Python: Cannot pop from empty list? When list is clearly not empty?

I'm obviously missing something here. Same project I've been working on for a number of days. Stepping through it bit by bit, seemed to be working fine. I added in a portion of the main() function to actually create the comparison lists, and suddenly starts throwing out cannot pop from empty list error at me, even through a print function I've placed ahead of the pop() call clearly shows that the list is not empty? Any ideas what I'm doing wrong? and is this monstrosity gonna actually work the way I intend? First time working with threads and all. Here is the code in its entirety:
import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading
class Crawler(threading.Thread):
def __init__(self):
self.links = []
self.queue = []
self.mal_list = []
self.count = 0
self.mal_set = set(self.mal_list)
self.crawled = []
self.crawled_set = set(self.crawled)
self.links_set = set(self.links)
self.queue.append(sys.argv[1])
self.queue_set = set(self.queue)
def run(self, max_depth):
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
if tgt not in self.mal_set:
self.crawl(tgt)
else:
print("Malicious Link Found: {0}".format(tgt)
continue
sys.exit("Finished!")
def crawl(self, tgt):
url = urlparse(tgt)
self.crawled.append(tgt)
try:
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
self.count += 1
except:
return
doc = parse(data).getroot()
for tag in doc.xpath("//a[#href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
self.links.append(fixed)
self.queue_links(self.links_set, url)
def queue_links(self, links, url):
for link in links:
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
continue
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
if link.decode('utf-8') not in self.crawled_set:
self.queue.append(link)
def make_mal_list(self):
"""
Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
with open(hosts1) as first:
for line1 in first.readlines():
link = "http://" + line1.strip()
self.mal_list.append(link)
with open(hosts2) as second:
for line2 in second.readlines():
link = "http://" + line2.strip()
self.mal_list.append(link)
with open(hosts3) as third:
for line3 in third.readlines():
link = "http://" + line3.strip()
self.mal_list.append(link)
def main():
crawler = Crawler()
crawler.make_mal_list()
crawler.run(25)
if __name__ == "__main__":
main()
First of all , i did get lost while reading your code so maybe i can give you some remark if i may before:
to many instance variable you don't have to create a new instance var just to put on it a set() of another vars like this code : self.mal_set = set(self.mal_list)and you are repeating the same thing many times
if you want to use threading so use it, because in your code you are just creating one thread, for that you should create like (10) thread or so each thread will deal with a bunch of URL that he should fetch, and don't forget to put the threads in a Queue.Queue to synchronize between them.
EDIT : Ahh i forgot : indent your code :)
now about your problem :
where do you assign self.queue because i don't see it ? you are just calling the make_mal_list() method that will initialize only self.mal_listand after when you run you own thread i think it's obvious that self.queue is empty so you can't pop() right ?
EDIT 2:
i think your example is more complicate (using black list and all this stuff, ...) but you can start with something like this:
import threading
import Queue
import sys
import urllib2
import url
from urlparse import urlparse
THREAD_NUMBER = 10
class Crawler(threading.Thread):
def __init__(self, queue, mal_urls):
self.queue = queue
self.mal_list = mal_urls
threading.Thread.__init__(self) # i forgot , thanks seriyPS :)
def run(self):
while True:
# Grabs url to fetch from queue.
url = self.queue.get()
if url not in self.mal_list:
self.crawl(url)
else:
print "Malicious Link Found: {0}".format(url)
# Signals to queue job is done
self.queue.task_done()
def crawl(self, tgt):
try:
url = urlparse(tgt)
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
except: # TODO: write explicit exceptions the URLError, ValueERROR ...
return
doc = parse(data).getroot()
for tag in doc.xpath("//a[#href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
# I don't think you need this, but maybe i'm mistaken.
# self.links.append(fixed)
# Add more URL to the queue.
self.queue_links(fixed, url)
def queue_links(self, link, url):
"""I guess this method allow recursive download of urls that will
be fetched from the web pages ????
"""
#for link in links: # i changed the argument so now links it just one url.
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
continue
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
# Add urls extracted from the HTML text to the queue to fetche them
if link.decode('utf-8') not in self.crawled_set:
self.queue.put(link)
def get_make_mal_list():
"""Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
mal_list = []
with open(hosts1) as first:
for line1 in first:
link = "http://" + line1.strip()
mal_list.append(link)
with open(hosts2) as second:
for line2 in second:
link = "http://" + line2.strip()
mal_list.append(link)
with open(hosts3) as third:
for line3 in third:
link = "http://" + line3.strip()
mal_list.append(link)
return mal_list
def main():
queue = Queue.Queue()
# Get malicious URLs.
mal_urls = set(get_make_mal_list())
# Create a THREAD_NUMBER thread and start them.
for i in xrange(THREAD_NUMBER):
cr = Crawler(queue, mal_urls)
cr.start()
# Get all url that you want to fetch and put them in the queue.
for url in sys.argv[1:]:
queue.put(url)
# Wait on the queue until everything has been processed.
queue.join()
if __name__ == '__main__':
main()
Small offtopic:
class Crawler(threading.Thread):
def __init__(self):
#you code
threading.Thread.__init__(self)#!!!
don't forget run Thread.__init__(self) directly if you override __init__ function
And, ofcourse, you must use http://docs.python.org/library/queue.html class for implement you job's queue in thread-safe mode
My primary language is C#, but issue you are experiencing is because of threading. In thread #1 you check that list is not empty, while thread #2 clears that list and thus you receive exception.
list is not thread-safe. If you need a thread-safe data structure, use Queue.Queue (Python 2.x) or queue.Queue (Python 3.x).
Also, look on this fragment:
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
you do print(self.queue) only before in first while iteration, so, self.queue.pop() can make many iterations (and fetch many links) and raise "cannot pop from empty list" only when queue is really empty!
try this:
while self.count < max_depth:
print(self.queue)
tgt = self.queue.pop(0)
for detect moment when you take exception.

get many pages with pycurl?

I want to get many pages from a website, like
curl "http://farmsubsidy.org/DE/browse?page=[0000-3603]" -o "de.#1"
but get the pages' data in python, not disk files.
Can someone please post pycurl code to do this,
or fast urllib2 (not one-at-a-time) if that's possible,
or else say "forget it, curl is faster and more robust" ? Thanks
So you have 2 problem and let me show you in one example. Notice the pycurl already did the multithreading/not one-at-a-time w/o your hardwork.
#! /usr/bin/env python
import sys, select, time
import pycurl,StringIO
c1 = pycurl.Curl()
c2 = pycurl.Curl()
c3 = pycurl.Curl()
c1.setopt(c1.URL, "http://www.python.org")
c2.setopt(c2.URL, "http://curl.haxx.se")
c3.setopt(c3.URL, "http://slashdot.org")
s1 = StringIO.StringIO()
s2 = StringIO.StringIO()
s3 = StringIO.StringIO()
c1.setopt(c1.WRITEFUNCTION, s1.write)
c2.setopt(c2.WRITEFUNCTION, s2.write)
c3.setopt(c3.WRITEFUNCTION, s3.write)
m = pycurl.CurlMulti()
m.add_handle(c1)
m.add_handle(c2)
m.add_handle(c3)
# Number of seconds to wait for a timeout to happen
SELECT_TIMEOUT = 1.0
# Stir the state machine into action
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Keep going until all the connections have terminated
while num_handles:
# The select method uses fdset internally to determine which file descriptors
# to check.
m.select(SELECT_TIMEOUT)
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Cleanup
m.remove_handle(c3)
m.remove_handle(c2)
m.remove_handle(c1)
m.close()
c1.close()
c2.close()
c3.close()
print "http://www.python.org is ",s1.getvalue()
print "http://curl.haxx.se is ",s2.getvalue()
print "http://slashdot.org is ",s3.getvalue()
Finally, these code is mainly based on an example on the pycurl site =.=
may be you should really read doc. ppl spend huge time on it.
here is a solution based on urllib2 and threads.
import urllib2
from threading import Thread
BASE_URL = 'http://farmsubsidy.org/DE/browse?page='
NUM_RANGE = range(0000, 3603)
THREADS = 2
def main():
for nums in split_seq(NUM_RANGE, THREADS):
t = Spider(BASE_URL, nums)
t.start()
def split_seq(seq, num_pieces):
start = 0
for i in xrange(num_pieces):
stop = start + len(seq[i::num_pieces])
yield seq[start:stop]
start = stop
class Spider(Thread):
def __init__(self, base_url, nums):
Thread.__init__(self)
self.base_url = base_url
self.nums = nums
def run(self):
for num in self.nums:
url = '%s%s' % (self.base_url, num)
data = urllib2.urlopen(url).read()
print data
if __name__ == '__main__':
main()
You can just put that into a bash script inside a for loop.
However you may have better success at parsing each page using python.
http://www.securitytube.net/Crawling-the-Web-for-Fun-and-Profit-video.aspx
You will be able to get at the exact data and save it at the same time into a db.
http://www.securitytube.net/Storing-Mined-Data-from-the-Web-for-Fun-and-Profit-video.aspx
If you want to crawl a website using python, you should have a look to scrapy http://scrapy.org
Using BeautifulSoup4 and requests -
Grab head page:
page = Soup(requests.get(url='http://rootpage.htm').text)
Create an array of requests:
from requests import async
requests = [async.get(url.get('href')) for url in page('a')]
responses = async.map(requests)
[dosomething(response.text) for response in responses]
Requests requires gevent to do this btw.
I can recommend you to user async module of human_curl
Look example:
from urlparse import urljoin
from datetime import datetime
from human_curl.async import AsyncClient
from human_curl.utils import stdout_debug
def success_callback(response, **kwargs):
"""This function call when response successed
"""
print("success callback")
print(response, response.request)
print(response.headers)
print(response.content)
print(kwargs)
def fail_callback(request, opener, **kwargs):
"""Collect errors
"""
print("fail callback")
print(request, opener)
print(kwargs)
with AsyncClient(success_callback=success_callback,
fail_callback=fail_callback) as async_client:
for x in xrange(10000):
async_client.get('http://google.com/', params=(("x", str(x)),)
async_client.get('http://google.com/', params=(("x", str(x)),),
success_callback=success_callback, fail_callback=fail_callback)
Usage very simple. Then page success loaded of failed async_client call you callback. Also you can specify number on parallel connections.

Anyone know of a good Python based web crawler that I could use?

Locked. This question and its answers are locked because the question is off-topic but has historical significance. It is not currently accepting new answers or interactions.
I'm half-tempted to write my own, but I don't really have enough time right now. I've seen the Wikipedia list of open source crawlers but I'd prefer something written in Python. I realize that I could probably just use one of the tools on the Wikipedia page and wrap it in Python. I might end up doing that - if anyone has any advice about any of those tools, I'm open to hearing about them. I've used Heritrix via its web interface and I found it to be quite cumbersome. I definitely won't be using a browser API for my upcoming project.
Thanks in advance. Also, this is my first SO question!
Mechanize is my favorite; great high-level browsing capabilities (super-simple form filling and submission).
Twill is a simple scripting language built on top of Mechanize
BeautifulSoup + urllib2 also works quite nicely.
Scrapy looks like an extremely promising project; it's new.
Use Scrapy.
It is a twisted-based web crawler framework. Still under heavy development but it works already. Has many goodies:
Built-in support for parsing HTML, XML, CSV, and Javascript
A media pipeline for scraping items with images (or any other media) and download the image files as well
Support for extending Scrapy by plugging your own functionality using middlewares, extensions, and pipelines
Wide range of built-in middlewares and extensions for handling of compression, cache, cookies, authentication, user-agent spoofing, robots.txt handling, statistics, crawl depth restriction, etc
Interactive scraping shell console, very useful for developing and debugging
Web management console for monitoring and controlling your bot
Telnet console for low-level access to the Scrapy process
Example code to extract information about all torrent files added today in the mininova torrent site, by using a XPath selector on the HTML returned:
class Torrent(ScrapedItem):
pass
class MininovaSpider(CrawlSpider):
domain_name = 'mininova.org'
start_urls = ['http://www.mininova.org/today']
rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
def parse_torrent(self, response):
x = HtmlXPathSelector(response)
torrent = Torrent()
torrent.url = response.url
torrent.name = x.x("//h1/text()").extract()
torrent.description = x.x("//div[#id='description']").extract()
torrent.size = x.x("//div[#id='info-left']/p[2]/text()[2]").extract()
return [torrent]
Check the HarvestMan, a multi-threaded web-crawler written in Python, also give a look to the spider.py module.
And here you can find code samples to build a simple web-crawler.
I've used Ruya and found it pretty good.
I hacked the above script to include a login page as I needed it to access a drupal site. Not pretty but may help someone out there.
#!/usr/bin/python
import httplib2
import urllib
import urllib2
from cookielib import CookieJar
import sys
import re
from HTMLParser import HTMLParser
class miniHTMLParser( HTMLParser ):
viewedQueue = []
instQueue = []
headers = {}
opener = ""
def get_next_link( self ):
if self.instQueue == []:
return ''
else:
return self.instQueue.pop(0)
def gethtmlfile( self, site, page ):
try:
url = 'http://'+site+''+page
response = self.opener.open(url)
return response.read()
except Exception, err:
print " Error retrieving: "+page
sys.stderr.write('ERROR: %s\n' % str(err))
return ""
return resppage
def loginSite( self, site_url ):
try:
cj = CookieJar()
self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
url = 'http://'+site_url
params = {'name': 'customer_admin', 'pass': 'customer_admin123', 'opt': 'Log in', 'form_build_id': 'form-3560fb42948a06b01d063de48aa216ab', 'form_id':'user_login_block'}
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
self.headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(params)
response = self.opener.open(url, data)
print "Logged in"
return response.read()
except Exception, err:
print " Error logging in"
sys.stderr.write('ERROR: %s\n' % str(err))
return 1
def handle_starttag( self, tag, attrs ):
if tag == 'a':
newstr = str(attrs[0][1])
print newstr
if re.search('http', newstr) == None:
if re.search('mailto', newstr) == None:
if re.search('#', newstr) == None:
if (newstr in self.viewedQueue) == False:
print " adding", newstr
self.instQueue.append( newstr )
self.viewedQueue.append( newstr )
else:
print " ignoring", newstr
else:
print " ignoring", newstr
else:
print " ignoring", newstr
def main():
if len(sys.argv)!=3:
print "usage is ./minispider.py site link"
sys.exit(2)
mySpider = miniHTMLParser()
site = sys.argv[1]
link = sys.argv[2]
url_login_link = site+"/node?destination=node"
print "\nLogging in", url_login_link
x = mySpider.loginSite( url_login_link )
while link != '':
print "\nChecking link ", link
# Get the file from the site and link
retfile = mySpider.gethtmlfile( site, link )
# Feed the file into the HTML parser
mySpider.feed(retfile)
# Search the retfile here
# Get the next link in level traversal order
link = mySpider.get_next_link()
mySpider.close()
print "\ndone\n"
if __name__ == "__main__":
main()
Trust me nothing is better than curl.. . the following code can crawl 10,000 urls in parallel in less than 300 secs on Amazon EC2
CAUTION: Don't hit the same domain at such a high speed.. .
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
# concurrent connections>]
#
import sys
import pycurl
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
import signal
from signal import SIGPIPE, SIG_IGN
signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
pass
# Get args
num_conn = 10
try:
if sys.argv[1] == "-":
urls = sys.stdin.readlines()
else:
urls = open(sys.argv[1]).readlines()
if len(sys.argv) >= 3:
num_conn = int(sys.argv[2])
except:
print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
raise SystemExit
# Make a queue with (url, filename) tuples
queue = []
for url in urls:
url = url.strip()
if not url or url[0] == "#":
continue
filename = "doc_%03d.dat" % (len(queue) + 1)
queue.append((url, filename))
# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
c = pycurl.Curl()
c.fp = None
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.setopt(pycurl.CONNECTTIMEOUT, 30)
c.setopt(pycurl.TIMEOUT, 300)
c.setopt(pycurl.NOSIGNAL, 1)
m.handles.append(c)
# Main loop
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
# If there is an url to process and a free curl object, add to multi stack
while queue and freelist:
url, filename = queue.pop(0)
c = freelist.pop()
c.fp = open(filename, "wb")
c.setopt(pycurl.URL, url)
c.setopt(pycurl.WRITEDATA, c.fp)
m.add_handle(c)
# store some info
c.filename = filename
c.url = url
# Run the internal curl state machine for the multi stack
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Check for curl objects which have terminated, and add them to the freelist
while 1:
num_q, ok_list, err_list = m.info_read()
for c in ok_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
freelist.append(c)
for c, errno, errmsg in err_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Failed: ", c.filename, c.url, errno, errmsg
freelist.append(c)
num_processed = num_processed + len(ok_list) + len(err_list)
if num_q == 0:
break
# Currently no more I/O is pending, could do something in the meantime
# (display a progress bar, etc.).
# We just call select() to sleep until some more data is available.
m.select(1.0)
# Cleanup
for c in m.handles:
if c.fp is not None:
c.fp.close()
c.fp = None
c.close()
m.close()
Another simple spider
Uses BeautifulSoup and urllib2. Nothing too sophisticated, just reads all a href's builds a list and goes though it.
pyspider.py

Categories