So I'm doing a project that scrapes different websites using multiple spiders. I want the spiders to run again whenever the user answers "Yes" when asked to continue.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
    process = CrawlerProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
But I get an error saying reactor is not restartable.
Traceback (most recent call last):
File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
process.start()
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
ReactorBase.startRunning(cast(ReactorBase, self))
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
So I guess using a while loop is a no-go. I don't know where to even start...
Method 1:
Scrapy creates a Twisted Reactor which can't be reused after it stops, but if you run the crawler in a separate process, the new process has to create its own fresh Reactor.
import multiprocessing
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page_range):
    process = CrawlerProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
# --- main ---
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
    p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
p.start()
p.join()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
It will not work if you use threading instead of multiprocessing, because threads share the same memory, so a new thread would use the same Reactor as the previous thread.
Minimal working code (tested on Linux).
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
import multiprocessing
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page):
#from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(MySpider, keyword, page)
c.crawl(MySpider, keyword, int(page)+1)
c.crawl(MySpider, keyword, int(page)+2)
c.start()
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
running = True
while running:
p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
p.start()
p.join()
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
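A small variant of that __main__ block, as a sketch rather than part of the original answer (it reuses run_crawler, keyword and page from the code above): the traceback in the question shows macOS with Python 3.8, where multiprocessing defaults to the "spawn" start method, so making the start method explicit keeps behaviour predictable when the script moves between Linux and macOS.
import multiprocessing

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    # 'spawn' is the default on macOS/Windows since Python 3.8; 'fork' is the Linux default
    ctx = multiprocessing.get_context('spawn')

    running = True
    while running:
        p = ctx.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()
        if input('Repeat [Y/n]? ').strip().lower() == 'n':
            running = False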
Method 2:
Found via Google: Restarting a Twisted Reactor.
It is an old post which uses del to remove the twisted.internet.reactor module from memory and later imports it again.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
    process = CrawlerProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
import sys
del sys.modules['twisted.internet.reactor']
from twisted.internet import reactor
from twisted.internet import default
default.install()
Minimal working code (tested on Linux)
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page):
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(MySpider, keyword, page)
c.crawl(MySpider, keyword, int(page)+1)
c.crawl(MySpider, keyword, int(page)+2)
c.start()
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
running = True
while running:
run_crawler(keyword, page)
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
import sys
del sys.modules['twisted.internet.reactor']
from twisted.internet import reactor
from twisted.internet import default
default.install()
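As a quick sanity check of why the reimport trick works - a standalone sketch I have not run against every Twisted version: after deleting the sys.modules entry and calling default.install(), the name reactor is bound to a brand-new reactor object, which is what lets a later crawl start it again.
import sys
from twisted.internet import reactor

old_reactor_id = id(reactor)

# drop the installed reactor and install a fresh default one
del sys.modules['twisted.internet.reactor']
from twisted.internet import default
default.install()
from twisted.internet import reactor

print(id(reactor) != old_reactor_id)  # True: a new reactor object is installed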
Method 3:
It seems you could use CrawlerRunner instead of CrawlerProcess - but I haven't tested it yet.
Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while-loop inside the reactor (so the reactor never has to be stopped between runs). It first starts one Spider, then runs the second Spider, then asks about continuing, and then runs the first Spider again, followed by the second. It doesn't run both Spiders at the same time, but maybe that could be changed somehow.
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
@defer.inlineCallbacks
def run_crawler():
running = True
while running:
yield runner.crawl(MySpider, keyword, page)
yield runner.crawl(MySpider, keyword, int(page)+1)
yield runner.crawl(MySpider, keyword, int(page)+2)
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
reactor.stop()
#return
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
configure_logging()
runner = CrawlerRunner({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
run_crawler()
reactor.run()
EDIT:
The same, but now all crawlers run at the same time:
@defer.inlineCallbacks
def run_crawler():
running = True
while running:
runner.crawl(MySpider, keyword, page)
runner.crawl(MySpider, keyword, int(page)+1)
runner.crawl(MySpider, keyword, int(page)+2)
d = runner.join()
yield d
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
reactor.stop()
#return
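A possible refinement of the version above - an untested sketch reusing runner, keyword, page and MySpider from the earlier example: input() inside the inlineCallbacks generator blocks the reactor thread while it waits for the user, and twisted.internet.threads.deferToThread can run that blocking prompt in Twisted's thread pool instead, so the reactor stays responsive between runs.
from twisted.internet import reactor, defer, threads

@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        yield runner.join()

        # ask in a worker thread so the reactor is not blocked while waiting
        answer = yield threads.deferToThread(input, 'Repeat [Y/n]? ')
        if answer.strip().lower() == 'n':
            running = False
            reactor.stop()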
You can remove the while loop and use callbacks instead.
Edit: Example added:
def callback_f():
# stuff #
calling_f()
def calling_f():
answer = input("Continue? (y/n)")
if not answer == 'n':
callback_f()
callback_f()
import sys

from twisted.internet import reactor  # only this is supposed to be here; we delete the reactor after each run and re-import it at the top
from twisted.internet import default
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your spider class name')
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until all crawling jobs are finished
del sys.modules['twisted.internet.reactor'] #deleting the reactor, because we want to run a for loop, the reactor will be imported again at the top
default.install()
Related
I have a script that retrieves URLs from a file (1_1.txt).
I configured ThreadPoolExecutor with a maximum of 50 workers (at the end of the code), but even with just one worker the memory grows very fast (several GB). Each worker executes the get_url function, which issues the request with the requests module.
The other functions called (file_writer and file_writer_html) don't seem to be involved, because I already tried without them.
When I remove the code that executes the HTTP requests and the associated variables, I no longer have this problem.
Also, I installed the memory-profiler module (as you can see in the code), and it shows me that it is the variables of the get_url function that increase the RAM (result_request, soup, html, headers...).
import requests
from concurrent.futures import ThreadPoolExecutor
import fileinput
from bs4 import BeautifulSoup
import traceback
from threading import Thread
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import warnings
from random import random
from queue import Queue
from memory_profiler import profile
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
count_requests = 0
host_error = 0
ongoing = 0
#@profile
def get_url(url):
global queue
global queue_html
global count_requests
global ongoing
global host_error
try:
result_request = requests.get(url, verify=False, timeout=40)
soup = BeautifulSoup(result_request.text, 'html.parser')
title = soup.title.get_text().splitlines(False)
html = result_request.content.splitlines(False)
headers = str(result_request.headers)
headers = headers.splitlines(False)
title = str(title)
html = str(html)
headers = str(headers)
title = title[0:10000]
html = html[0:10000000]
headers = headers[0:10000]
count_requests = count_requests + 1
queue.put(f'{url} - {title} \n')
queue_html.put(f'{url} - {html} - HEADER: {headers} \n')
except:
queue.put(f'FAILED : {url} \n')
host_error = host_error + 1
# dedicated file writing task
def file_writer(filepath, queue):
global count_requests
# open the file
with open(filepath, 'a', encoding="utf-8") as file:
# run until the event is set
while True:
# get a line of text from the queue
line = queue.get()
# check if we are done
if line is None:
# exit the loop
break
# write it to file
file.write(line)
# flush the buffer
file.flush()
# mark the unit of work complete
queue.task_done()
# mark the exit signal as processed, after the file was closed
queue.task_done()
# dedicated file writing task
def file_writer_html(filepath_html, queue_html):
# open the file
with open(filepath_html, 'a', encoding="utf-8") as file:
# run until the event is set
while True:
# get a line of text from the queue
line = queue_html.get()
print(str("requests success : ") + str(count_requests) + str(" | requests error ") + str(host_error), end='\r')
# check if we are done
if line is None:
# exit the loop
break
# write it to file
file.write(line)
# flush the buffer
file.flush()
# mark the unit of work complete
queue_html.task_done()
# mark the exit signal as processed, after the file was closed
queue_html.task_done()
# create the shared queue
queue = Queue()
queue_html = Queue()
# define the shared file path
filepath = 'output.txt'
filepath_html = 'output_html.txt'
# create and start the file writer thread
writer_thread = Thread(target=file_writer, args=(filepath,queue), daemon=True)
writer_thread.start()
writer_html_thread = Thread(target=file_writer_html, args=(filepath_html,queue_html), daemon=True)
writer_html_thread.start()
# wait for all tasks in the queue to be processed
queue.join()
with open("1_1.txt") as stream:
urls = [line.strip() for line in stream]
with ThreadPoolExecutor(max_workers=50) as pool:
pool.map(get_url, urls)
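For what it's worth, one way to keep the per-request memory bounded - a sketch, not a diagnosis of the growth, and it simply keeps the same 10 MB cap that the original code applies after the fact - is to stream the response and stop reading once the cap is reached, so the full body and its str() copies never sit in memory at once:
import requests
from bs4 import BeautifulSoup

MAX_BODY = 10_000_000  # same cap the original code applies with html[0:10000000]

def get_url_bounded(url):
    # stream the body and stop once MAX_BODY bytes have been read
    with requests.get(url, verify=False, timeout=40, stream=True) as r:
        chunks, size = [], 0
        for chunk in r.iter_content(chunk_size=64 * 1024):
            chunks.append(chunk)
            size += len(chunk)
            if size >= MAX_BODY:
                break
        body = b''.join(chunks)[:MAX_BODY]
        headers = str(r.headers)[:10000]

    soup = BeautifulSoup(body, 'html.parser')
    title = (soup.title.get_text() if soup.title else '')[:10000]
    return title, headers, body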
Trying to launch Scrapy from a .py file with this command:
py myproject.py -f C:\Users\admin\Downloads\test.csv
Here is my file named "myproject.py":
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGSpider()
leSpider.myList = myData
leSpider.start_requests()
Here is my spider file:
import scrapy
import urllib
class GGSpider(scrapy.Spider):
name = "spiderman"
domain = "https://www.google.fr/?q={}"
myList = []
def __init__(self):
pass
def start_requests(self):
for leObject in self.myList:
tmpURL = self.domain.format(urllib.parse.urlencode({'text' : leObject[0]}))
yield scrapy.Request(url=self.domain+leObject[0],callback = self.parse)
def parse(self, response):
print('hello')
print(response)
My problem is: I do get into start_requests, because I put a print before the yield and saw it in the console.
But the callback seems to never happen (I don't get the 'hello' print).
I really don't know why (I'm new to Python, maybe I'm missing something obvious).
I guess that's because a generator doesn't actually run until you retrieve its values. You could try to consume the generator somehow:
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGSpider()
leSpider.myList = myData
for request in leSpider.start_requests():
do_something(request)
UPD: Here is a better example of running a Spider from a script:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider(scrapy.Spider):
# Your spider definition
...
process = CrawlerProcess(settings={
"FEEDS": {
"items.json": {"format": "json"},
},
})
process.crawl(MySpider)
process.start() # the script will block here until the crawling is finished
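Following up on the question itself - a sketch, with names taken from the question and myData assumed to come from CSVReader as before: rather than instantiating the spider by hand, the list can be handed over through process.crawl(), whose extra keyword arguments are forwarded to the spider's constructor.
import scrapy
import urllib.parse
from scrapy.crawler import CrawlerProcess

class GGSpider(scrapy.Spider):
    name = "spiderman"
    domain = "https://www.google.fr/?q={}"

    def __init__(self, myList=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.myList = myList or []

    def start_requests(self):
        for leObject in self.myList:
            url = self.domain.format(urllib.parse.quote(leObject[0]))
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print('hello', response)

process = CrawlerProcess()
process.crawl(GGSpider, myList=myData)  # myList is passed on to GGSpider.__init__
process.start()                         # Scrapy consumes the start_requests generator itself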
I am trying to implement a multithreaded crawler that takes an initial URL, searches for links within that page, displays each link, and at the same time looks for links within each of those links.
This is my code:
import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
class a3_6:
__url_q = Queue(100)
__html_q = Queue()
__data_q = Queue()
__visited_urls = []
def __init__(self, start_url, max_threads):
self.__url_q.put(start_url)
self.max_threads = max_threads
def gethtml(self,url):
try:
req=urllib.request.Request(url)
html=urllib.request.urlopen(req).read()
self.__html_q.put(html)
except urllib.error.URLError as e:
print(e.reason)
except:
print("invalid: " + url)
self.__visited_urls.append(url)
def mine_thread(self):
while True:
if not self.__html_q.empty():
soup = BeautifulSoup(self.__html_q.get(),"html.parser")
for a in soup.find_all('a', href=True):
if a not in self.__visited_urls:
link='https://en.wikipedia.org'+a.get('href')
self.__url_q.put(link)
self.__data_q.put(link)
else:
break
def store(self):
while True:
if not self.__data_q.empty():
print (self.__data_q.get())
def download_thread(self):
while True:
if not self.__url_q.empty():
self.gethtml(self.__url_q.get())
else:
break
def run(self):
self.download_thread()
self.mine_thread()
self.store()
def op(self):
for x in range(self.max_threads):
t = threading.Thread(target=self.run)
t.daemon = True
t.start()
self.store()
if __name__ == '__main__':
a=a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
a.op()
EDIT: I edited the code and now I am getting proper results, but it still doesn't terminate.
I arrived at the solution with James Harrison's help. I don't know why he deleted his original answer, but here it is:
import urllib.request, threading
from urllib.parse import urlparse
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db
class a3_5:
__url_q = Queue(100)
__html_q = Queue()
__data_q = Queue()
__visited_urls=[]
def gethtml(self,url):
try:
req=urllib.request.Request(url)
html=urllib.request.urlopen(req).read()
self.__html_q.put(html)
pars=urlparse(url)
except urllib.error.URLError as e:
print(e.reason+':'+url)
except:
print("invalid: " + url)
def mine_thread(self):
while True:
if not self.__html_q.empty():
soup = BeautifulSoup(self.__html_q.get(),"html.parser")
for a in soup.find_all('a', href=True):
link=a.get('href')
"""if not link.startswith('www'):
link=self.__prfx+link"""
if link not in self.__visited_urls:
self.__url_q.put(link)
self.__data_q.put(link)
else:
break
def store(self):
while True:
if not self.__data_q.empty():
cont=self.__data_q.get()
print (cont)
else:
break
def download_thread(self):
while True:
if not self.__url_q.empty():
self.gethtml(self.__url_q.get())
self.__url_q.task_done()
def op(self,*urls):
for x in range(25):
d = threading.Thread(target=self.download_thread)
d.setDaemon(True)
d.start()
for url in urls:
self.__url_q.put(url)
self.__url_q.join()
self.mine_thread()
self.store()
if __name__ == '__main__':
urls=['https://en.wikipedia.org/wiki/Bajirao']#,'https://en.wikipedia.org/wiki/Malharrao_Holkar','https://en.wikipedia.org/wiki/Ranoji_Scindia']
a=a3_5()
a.op(*urls)
Essentially I had to arrange another queue and set up workers to activate the threads. Also, the mine_thread and store methods needed to start only after the download_thread work had completed, because otherwise the values wouldn't get stored.
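For readers new to the pattern, here is a bare-bones sketch of that queue/worker arrangement (names are illustrative, not from the code above): workers block on q.get(), the main thread feeds the queue, and q.join() only returns once every item has been marked done, which is why the later stages can safely start afterwards.
import threading
from queue import Queue

def worker(q, results):
    while True:
        item = q.get()
        results.append(item.upper())  # stand-in for the real download work
        q.task_done()

q = Queue()
results = []
for _ in range(4):
    threading.Thread(target=worker, args=(q, results), daemon=True).start()

for item in ['a', 'b', 'c']:
    q.put(item)

q.join()        # blocks until task_done() has been called for every queued item
print(results)  # safe to consume here: all the "downloads" have finished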
I've written a script that runs a scrapy spider that is located inside a different directory. The script takes in user input, parses it and adds it to a url to be scraped. The script seemed to be working earlier but now I'm getting the following error:
URLError: <urlopen error [Errno 101] Network is unreachable>
ERROR: Unable to read instance data, giving up
The code for the spider works properly when run with the scrapy crawl command, but isn't working when run from a script for some reason.
Here is the code for the function that runs the spider from the script (located within the spider file):
def spiderCrawl(bandname):
aSpider = MySpider3()
aSpider.create_link(bandname)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(aSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
function that creates the url:
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
start_urls = [tc_url]
Also, below is an image of the terminal with the error message. The fact that a random bandname was entered suggests that the URL wasn't even read in the first place. What could be the problem here? Any help would be appreciated, thanks.
Update:
So it seems the problem was that my create_link method inside the spider class wasn't properly adding the link to the start_urls list; the script does run the spider when I put the raw_input statement inside the spider file instead of the script. What would be the proper way to pass the user's input to the spider file so it can be added as a link? I have the code for the spider and for the script that runs it below, to make the post more complete:
script code
from ticket_city_scraper.ticket_city_scraper import *
from ticket_city_scraper.ticket_city_scraper.spiders import tc_spider
bandname = raw_input("Enter bandname\n") # I took out this line and added it to the spider file to make the script work
tc_spider.spiderCrawl(bandname)
spider file
class MySpider3(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.ticketcity.com"]
start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
self.start_urls = [tc_url]
#return tc_url
    tickets_list_xpath = './/div[@class = "vevent"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('B')
price_list = [i.get('P') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
print "parse price function entered \n"
loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
eventCity = ''.join(event_City)
loader.add_value('eventCity' , eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
eventState = ''.join(event_State)
loader.add_value('eventState' , eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
eventDate = ''.join(event_Date)
loader.add_value('eventDate' , eventDate)
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
            #loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            #sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def spiderCrawl(bandname):
# process = CrawlerProcess({
# 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })
# process.crawl(aSpider)
# process.start()
aSpider = MySpider3()
#aSpider.create_link(bandname)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(aSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
I can only guess since you didn't provide an MCVE. However, I'd say that in your function create_link, this line:
start_urls = [tc_url]
should really be:
self.start_urls = [tc_url]
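Beyond that fix, here is a sketch of one way to get the user's bandname into the spider (names follow the question; the detail being relied on is that Scrapy forwards extra arguments given to runner.crawl() to the spider's constructor, and that crawl() expects the spider class rather than an instance):
from scrapy.spiders import CrawlSpider
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

class MySpider3(CrawlSpider):
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]

    def __init__(self, bandname=None, *args, **kwargs):
        super(MySpider3, self).__init__(*args, **kwargs)
        self.start_urls = [
            "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
        ]

    # ... parse methods as in the question ...

def spiderCrawl(bandname):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider3, bandname=bandname)  # pass the class, not an instance
    d.addBoth(lambda _: reactor.stop())
    reactor.run()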
I'm obviously missing something here. It's the same project I've been working on for a number of days. Stepping through it bit by bit, it seemed to be working fine. Then I added in a portion of the main() function to actually create the comparison lists, and suddenly it starts throwing a "cannot pop from empty list" error at me, even though a print call I've placed ahead of the pop() clearly shows that the list is not empty. Any ideas what I'm doing wrong? And is this monstrosity actually going to work the way I intend? It's my first time working with threads. Here is the code in its entirety:
import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading
class Crawler(threading.Thread):
def __init__(self):
self.links = []
self.queue = []
self.mal_list = []
self.count = 0
self.mal_set = set(self.mal_list)
self.crawled = []
self.crawled_set = set(self.crawled)
self.links_set = set(self.links)
self.queue.append(sys.argv[1])
self.queue_set = set(self.queue)
def run(self, max_depth):
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
if tgt not in self.mal_set:
self.crawl(tgt)
else:
print("Malicious Link Found: {0}".format(tgt)
continue
sys.exit("Finished!")
def crawl(self, tgt):
url = urlparse(tgt)
self.crawled.append(tgt)
try:
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
self.count += 1
except:
return
doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
self.links.append(fixed)
self.queue_links(self.links_set, url)
def queue_links(self, links, url):
for link in links:
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
continue
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
if link.decode('utf-8') not in self.crawled_set:
self.queue.append(link)
def make_mal_list(self):
"""
Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
with open(hosts1) as first:
for line1 in first.readlines():
link = "http://" + line1.strip()
self.mal_list.append(link)
with open(hosts2) as second:
for line2 in second.readlines():
link = "http://" + line2.strip()
self.mal_list.append(link)
with open(hosts3) as third:
for line3 in third.readlines():
link = "http://" + line3.strip()
self.mal_list.append(link)
def main():
crawler = Crawler()
crawler.make_mal_list()
crawler.run(25)
if __name__ == "__main__":
main()
First of all, I got a bit lost while reading your code, so maybe I can give you some remarks first, if I may:
Too many instance variables: you don't have to create a new instance variable just to hold a set() of another variable, like this code: self.mal_set = set(self.mal_list) - and you are repeating the same thing many times.
If you want to use threading, then really use it: in your code you are just creating one thread. You should create something like 10 threads, each dealing with a bunch of URLs that it should fetch, and don't forget to use a Queue.Queue to synchronize between them.
EDIT: Ah, I forgot: indent your code :)
Now about your problem:
Where do you fill self.queue? I don't see it: you are just calling the make_mal_list() method, which only initializes self.mal_list, and then when you run your own thread I think it's obvious that self.queue is empty, so you can't pop(), right?
EDIT 2:
I think your example is more complicated (using blacklists and all this stuff...), but you can start with something like this:
import threading
import Queue
import sys
import urllib2
import urllib
from lxml.html import parse
from urlparse import urlparse
THREAD_NUMBER = 10
class Crawler(threading.Thread):
    def __init__(self, queue, mal_urls):
        self.queue = queue
        self.mal_list = mal_urls
        self.crawled_set = set()  # needed by queue_links() below to skip already-crawled links
        threading.Thread.__init__(self)  # i forgot, thanks seriyPS :)
def run(self):
while True:
# Grabs url to fetch from queue.
url = self.queue.get()
if url not in self.mal_list:
self.crawl(url)
else:
print "Malicious Link Found: {0}".format(url)
# Signals to queue job is done
self.queue.task_done()
def crawl(self, tgt):
try:
url = urlparse(tgt)
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
except: # TODO: write explicit exceptions the URLError, ValueERROR ...
return
doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
# I don't think you need this, but maybe i'm mistaken.
# self.links.append(fixed)
# Add more URL to the queue.
self.queue_links(fixed, url)
def queue_links(self, link, url):
"""I guess this method allow recursive download of urls that will
be fetched from the web pages ????
"""
#for link in links: # i changed the argument so now links it just one url.
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
            return
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
# Add urls extracted from the HTML text to the queue to fetche them
if link.decode('utf-8') not in self.crawled_set:
self.queue.put(link)
def get_make_mal_list():
"""Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
mal_list = []
with open(hosts1) as first:
for line1 in first:
link = "http://" + line1.strip()
mal_list.append(link)
with open(hosts2) as second:
for line2 in second:
link = "http://" + line2.strip()
mal_list.append(link)
with open(hosts3) as third:
for line3 in third:
link = "http://" + line3.strip()
mal_list.append(link)
return mal_list
def main():
queue = Queue.Queue()
# Get malicious URLs.
mal_urls = set(get_make_mal_list())
# Create a THREAD_NUMBER thread and start them.
for i in xrange(THREAD_NUMBER):
cr = Crawler(queue, mal_urls)
cr.start()
# Get all url that you want to fetch and put them in the queue.
for url in sys.argv[1:]:
queue.put(url)
# Wait on the queue until everything has been processed.
queue.join()
if __name__ == '__main__':
main()
A small off-topic note:
class Crawler(threading.Thread):
def __init__(self):
        # your code
threading.Thread.__init__(self)#!!!
Don't forget to call Thread.__init__(self) directly if you override the __init__ function.
And, of course, you should use the Queue class (http://docs.python.org/library/queue.html) to implement your job queue in a thread-safe way.
My primary language is C#, but the issue you are experiencing is because of threading. In thread #1 you check that the list is not empty, while thread #2 clears that list, and thus you receive the exception.
list is not thread-safe. If you need a thread-safe data structure, use Queue.Queue (Python 2.x) or queue.Queue (Python 3.x).
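A tiny sketch of the thread-safe replacement (Python 2 spelling, to match the code in the question): Queue.get() either blocks until an item is available or, with block=False, raises Queue.Empty, so there is no separate "check empty, then pop" step for another thread to race against.
import Queue  # `import queue` on Python 3

q = Queue.Queue()
q.put("http://example.com")

try:
    tgt = q.get(block=False)  # atomic: no emptiness check needed beforehand
except Queue.Empty:
    tgt = None

print(tgt)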
Also, look at this fragment:
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
You call print(self.queue) only once, before the first while iteration, so self.queue.pop() can run for many iterations (and fetch many links) and only raise "cannot pop from empty list" once the queue really is empty!
try this:
while self.count < max_depth:
print(self.queue)
tgt = self.queue.pop(0)
to detect the moment when you hit the exception.