Use scrapy from a py file (not in command line)

Use scrapy from a py file (not in command line) - python

Trying to launch scrapy from a py file with this command :
py myproject.py -f C:\Users\admin\Downloads\test.csv
Here my file named "myproject.py"
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGCSpider()
leSpider.myList = myData
leSpider.start_requests()
Here my spider file :
import scrapy
import urllib
class GGSpider(scrapy.Spider):
name = "spiderman"
domain = "https://www.google.fr/?q={}"
myList = []
def __init__(self):
pass
def start_requests(self):
for leObject in self.myList:
tmpURL = self.domain.format(urllib.parse.urlencode({'text' : leObject[0]}))
yield scrapy.Request(url=self.domain+leObject[0],callback = self.parse)
def parse(self, response):
print('hello')
print(response)
My problem is : I go into start_requests, because I put a print before the yield and got the print in console
But the callback seems to not append (I don't get the 'Hello' print).
I really don't know why (I'm new to Python, maybe I'm missing something obvious)

I guess that's because generator doesn't actually runs before you'll retrieve its values. You could try to consume generator somehow:
import spiders.ggspider as MySpiders
# Return array
dataFile = args.file
myData = CSVReader.getAnimalList(dataFile)
leSpider = MySpiders.GGCSpider()
leSpider.myList = myData
for request in leSpider.start_requests():
do_something(request)
UPD: Here is a better example of running Spider from a script:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider(scrapy.Spider):
# Your spider definition
...
process = CrawlerProcess(settings={
"FEEDS": {
"items.json": {"format": "json"},
},
})
process.crawl(MySpider)
process.start() # the script will block here until the crawling is finished

Related

How to run Scrapy in a while loop

So Im doing a project scraping different websites using multiple spiders. I want to make it so that the spiders run again when the user says "Yes" when asked to continue.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
process = CrawlProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
But I get an error saying reactor is not restartable.
Traceback (most recent call last):
File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
process.start()
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
reactor.run(installSignalHandlers=False) # blocking call
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
self.startRunning(installSignalHandlers=installSignalHandlers)
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
ReactorBase.startRunning(cast(ReactorBase, self))
File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
So I guess using while loop is no-go. I don't know where to even start...

Method 1:
scrapy creates Reactor which can't be reused after stop but if you will run Crawler in separated process then new process will have to create new Reactor.
import multiprocessing
def run_crawler(keyword, page_range):
process = CrawlProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
# --- main ---
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
p = multiprocessing(target=run_crawler, args=(keyword, page_range))
p.start()
p.join()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
It will not work if you use threading instead of multiprocessing because threads share variables so new thread will use the same Reactor as previous thread.
Minimal working code (tested on Linux).
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
import multiprocessing
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page_range):
#from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(MySpider, keyword, page)
c.crawl(MySpider, keyword, int(page)+1)
c.crawl(MySpider, keyword, int(page)+2)
c.start()
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
running = True
while running:
p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
p.start()
p.join()
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
Method 2:
Found in Google: Restarting a Twisted Reactor.
It is old post which uses del to remove module twisted from memory and later it imports it again.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")
flag = True
while flag:
process = CrawlProcess()
process.crawl(crawler1, keyword, page_range)
process.crawl(crawler2, keyword, page_range)
process.crawl(crawler3, keyword, page_range)
process.start()
isContinue = input("Do you want to continue? (y/n): ")
if isContinue == 'n':
flag = False
import sys
del sys.modules['twisted.internet.reactor']
from twisted.internet import reactor
from twisted.internet import default
default.install()
Minimal working code (tested on Linux)
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
def run_crawler(keyword, page):
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(MySpider, keyword, page)
c.crawl(MySpider, keyword, int(page)+1)
c.crawl(MySpider, keyword, int(page)+2)
c.start()
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
running = True
while running:
run_crawler(keyword, page)
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
import sys
del sys.modules['twisted.internet.reactor']
from twisted.internet import reactor
from twisted.internet import default
default.install()
Method 3:
It seems you could use use CrawlRunner instead of CrawlProcess - but I didn't test it yet.
Base on last example in doc for Running multiple spiders in the same process I created code which runs while-loop inside reactor (so it doesn't have to stop it) but it first starts one Spider, next runs second Spider, next it asks for contiuation and it runs again first Spider, next runs second Spider. It doesn't runs both Spiders at the same time but maybe it could be somehow changed.
import scrapy
class MySpider(scrapy.Spider):
name = 'myspider'
#start_urls = ['https://books.toscrape.com/']
def __init__(self, keyword, page, *args, **kwargs):
'''generate start_urls list'''
super().__init__(*args, **kwargs)
self.keyword = keyword
self.page = int(page)
self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']
def parse(self, response):
print('[parse] url:', response.url)
for book in response.css('article.product_pod'):
title = book.css('h3 a::text').get()
url = book.css('img::attr(src)').get()
url = response.urljoin(url)
yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
#defer.inlineCallbacks
def run_crawler():
running = True
while running:
yield runner.crawl(MySpider, keyword, page)
yield runner.crawl(MySpider, keyword, int(page)+1)
yield runner.crawl(MySpider, keyword, int(page)+2)
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
reactor.stop()
#return
# --- main ---
if __name__ == '__main__':
keyword = input("enter keyword: ")
page = input("enter page: ")
configure_logging()
runner = CrawlerRunner({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
run_crawler()
reactor.run()
EDIT:
The same but now all crawlers run at the same time
#defer.inlineCallbacks
def run_crawler():
running = True
while running:
runner.crawl(MySpider, keyword, page)
runner.crawl(MySpider, keyword, int(page)+1)
runner.crawl(MySpider, keyword, int(page)+2)
d = runner.join()
yield d
answer = input('Repeat [Y/n]? ').strip().lower()
if answer == 'n':
running = False
reactor.stop()
#return

You can remove the while loop and use callbacks instead.
Edit: Example added:
def callback_f():
# stuff #
calling_f()
def calling_f():
answer = input("Continue? (y/n)")
if not answer == 'n':
callback_f()
callback_f()

from twisted.internet import reactor #only this is supposed to be here, we will be deleting the reactor after each run, using the main
configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your spider class name')
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until all crawling jobs are finished
del sys.modules['twisted.internet.reactor'] #deleting the reactor, because we want to run a for loop, the reactor will be imported again at the top
default.install()

Using WGET or Python to download and rename attachments from CSV requiring basic authentication

I scraped a ticketing website that we were using and I now have a CSV file which looks like this: ID, Attachment_URL, Ticket_URL. What I now need to do is download every attachment and rename the file with the Ticket_URL. The main issue I have is that when navigating to the Attachment_URL you must use basic authentication and then you are redirected to an aws s3 link. I have been able to download individual files using wget, but I have not been able to iterate through the entire list (35k rows or so), and I am not sure how I would be able to name the file as the ticket_id. Any advice would be appreciated.

Got it.
To open the authenticated session:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time
s = requests.session()
payload = {
'user': '',
'pw': ''
}
s.post('login.url.here', data=payload)
for i in range(1, 6000):
testURL = s.get(
'https://urlhere.com/efw/stuff&page={}'.format(i))
soup = BeautifulSoup(testURL.content)
table = soup.find("table", {"class": "table-striped"})
table_body = table.find('tbody')
rows = table_body.find_all('tr')[1:]
print "The current page is: " + str(i)
for row in rows:
cols = row.find_all('a', attrs={'href': re.compile("^/helpdesk/")})
# time.sleep(1)
with open('fd.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(cols)
print cols
print cols
Then I cleaned the links a bit in R and to download the files.
#! /usr/bin/env python
import threading
import os
from time import gmtime, strftime
from Queue import Queue
import requests
s = requests.session()
payload = {
'user': '',
'pw': ''
}
s.post('login', data=payload)
class log:
def info(self, message):
self.__message("info", message)
def error(self, message):
self.__message("error", message)
def debug(self, message):
self.__message("debug", message)
def __message(self, log_level, message):
date = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print "%s [%s] %s" % (date, log_level, message)
class fetch:
def __init__(self):
self.temp_dir = "/tmp"
def run_fetcher(self, queue):
while not queue.empty():
url, ticketid = queue.get()
if ticketid.endswith("NA"):
fileName = url.split("/")[-1] + 'NoTicket'
else:
fileName = ticketid.split("/")[-1]
response = s.get(url)
with open(os.path.join('/Users/Desktop/FolderHere', fileName + '.mp3'), 'wb') as f:
f.write(response.content)
print fileName
queue.task_done()
if __name__ == '__main__':
# load in classes
q = Queue()
log = log()
fe = fetch()
# get bucket name
#Read in input file
with open('/Users/name/csvfilehere.csv', 'r') as csvfile:
for line in csvfile:
id,url,ticket = line.split(",")
q.put([url.strip(),ticket.strip()])
# spin up fetcher workers
threads = []
for i in range(8):
t = threading.Thread(target=fe.run_fetcher, args=(q,))
t.daemon = True
threads.append(t)
t.start()
# close threads
[x.join() for x in threads]
# close queue
q.join()
log.info("End")

urlopen error when scrapy spider is run from a script

I've written a script that runs a scrapy spider that is located inside a different directory. The script takes in user input, parses it and adds it to a url to be scraped. The script seemed to be working earlier but now I'm getting the following error:
URLError: <urlopen error [Errno 101] Network is unreachable>
ERROR: Unable to read instance data, giving up
The code for the spider works properly when run with the scrapy crawl command, but isn't working when run from a script for some reason.
Here is the code for the function that runs the spider from the script (located within the spider file):
def spiderCrawl(bandname):
aSpider = MySpider3()
aSpider.create_link(bandname)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(aSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
function that creates the url:
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
start_urls = [tc_url]
Also, below is an image of the terminal with the error message. The fact that a random bandname was entered suggests that the url wasn't even read in the first place. What could be the problem, here? Any help would be appreciated, thanks.
Update:
So it seems that the problem was that my create_link method inside of the spider class wasn't properly adding the link to the start_urls list, but the script does seem to be running the spider when I use the raw_input statement inside of the spider file as opposed to the script. What would be the proper way to pass the argument of the user's input to the spider file to be added as a link? I have the code for the spider and the script running the spider below to make the post more complete:
script code
from ticket_city_scraper.ticket_city_scraper import *
from ticket_city_scraper.ticket_city_scraper.spiders import tc_spider
bandname = raw_input("Enter bandname\n") # I took out this line and added it to the spider file to make the script work
tc_spider.spiderCrawl(bandname)
spider file
class MySpider3(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.ticketcity.com"]
start_urls = [tc_url]
tickets_list_xpath = './/div[#class = "vevent"]'
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
self.start_urls = [tc_url]
#return tc_url
tickets_list_xpath = './/div[#class = "vevent"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('B')
price_list = [i.get('P') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
print "parse price function entered \n"
loader = response.meta['loader']
event_City = response.xpath('.//span[#itemprop="addressLocality"]/text()').extract()
eventCity = ''.join(event_City)
loader.add_value('eventCity' , eventCity)
event_State = response.xpath('.//span[#itemprop="addressRegion"]/text()').extract()
eventState = ''.join(event_State)
loader.add_value('eventState' , eventState)
event_Date = response.xpath('.//span[#class="event_datetime"]/text()').extract()
eventDate = ''.join(event_Date)
loader.add_value('eventDate' , eventDate)
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/span[#class="summary listingEventName"]/text()')
loader.add_xpath('eventLocation' , './/div[#class="divVenue location"]/text()')
loader.add_xpath('ticketsLink' , './/a[#class="divEventDetails url"]/#href')
#loader.add_xpath('eventDateTime' , '//div[#id="divEventDate"]/#title') #datetime type
#loader.add_xpath('eventTime' , './/*[#class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[#id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def spiderCrawl(bandname):
# process = CrawlerProcess({
# 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# })
# process.crawl(aSpider)
# process.start()
aSpider = MySpider3()
#aSpider.create_link(bandname)
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
d = runner.crawl(aSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()

I could only guess since you didn't provide a MCVE. However I'd say in your function create_link, this line:
start_urls = [tc_url]
should really be:
self.start_urls = [tc_url]

Python: Cannot pop from empty list? When list is clearly not empty?

I'm obviously missing something here. Same project I've been working on for a number of days. Stepping through it bit by bit, seemed to be working fine. I added in a portion of the main() function to actually create the comparison lists, and suddenly starts throwing out cannot pop from empty list error at me, even through a print function I've placed ahead of the pop() call clearly shows that the list is not empty? Any ideas what I'm doing wrong? and is this monstrosity gonna actually work the way I intend? First time working with threads and all. Here is the code in its entirety:
import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading
class Crawler(threading.Thread):
def __init__(self):
self.links = []
self.queue = []
self.mal_list = []
self.count = 0
self.mal_set = set(self.mal_list)
self.crawled = []
self.crawled_set = set(self.crawled)
self.links_set = set(self.links)
self.queue.append(sys.argv[1])
self.queue_set = set(self.queue)
def run(self, max_depth):
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
if tgt not in self.mal_set:
self.crawl(tgt)
else:
print("Malicious Link Found: {0}".format(tgt)
continue
sys.exit("Finished!")
def crawl(self, tgt):
url = urlparse(tgt)
self.crawled.append(tgt)
try:
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
self.count += 1
except:
return
doc = parse(data).getroot()
for tag in doc.xpath("//a[#href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
self.links.append(fixed)
self.queue_links(self.links_set, url)
def queue_links(self, links, url):
for link in links:
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
continue
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
if link.decode('utf-8') not in self.crawled_set:
self.queue.append(link)
def make_mal_list(self):
"""
Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
with open(hosts1) as first:
for line1 in first.readlines():
link = "http://" + line1.strip()
self.mal_list.append(link)
with open(hosts2) as second:
for line2 in second.readlines():
link = "http://" + line2.strip()
self.mal_list.append(link)
with open(hosts3) as third:
for line3 in third.readlines():
link = "http://" + line3.strip()
self.mal_list.append(link)
def main():
crawler = Crawler()
crawler.make_mal_list()
crawler.run(25)
if __name__ == "__main__":
main()

First of all , i did get lost while reading your code so maybe i can give you some remark if i may before:
to many instance variable you don't have to create a new instance var just to put on it a set() of another vars like this code : self.mal_set = set(self.mal_list)and you are repeating the same thing many times
if you want to use threading so use it, because in your code you are just creating one thread, for that you should create like (10) thread or so each thread will deal with a bunch of URL that he should fetch, and don't forget to put the threads in a Queue.Queue to synchronize between them.
EDIT : Ahh i forgot : indent your code :)
now about your problem :
where do you assign self.queue because i don't see it ? you are just calling the make_mal_list() method that will initialize only self.mal_listand after when you run you own thread i think it's obvious that self.queue is empty so you can't pop() right ?
EDIT 2:
i think your example is more complicate (using black list and all this stuff, ...) but you can start with something like this:
import threading
import Queue
import sys
import urllib2
import url
from urlparse import urlparse
THREAD_NUMBER = 10
class Crawler(threading.Thread):
def __init__(self, queue, mal_urls):
self.queue = queue
self.mal_list = mal_urls
threading.Thread.__init__(self) # i forgot , thanks seriyPS :)
def run(self):
while True:
# Grabs url to fetch from queue.
url = self.queue.get()
if url not in self.mal_list:
self.crawl(url)
else:
print "Malicious Link Found: {0}".format(url)
# Signals to queue job is done
self.queue.task_done()
def crawl(self, tgt):
try:
url = urlparse(tgt)
print("Crawling {0}".format(tgt))
request = urllib2.Request(tgt)
request.add_header("User-Agent", "Mozilla/5,0")
opener = urllib2.build_opener()
data = opener.open(request)
except: # TODO: write explicit exceptions the URLError, ValueERROR ...
return
doc = parse(data).getroot()
for tag in doc.xpath("//a[#href]"):
old = tag.get('href')
fixed = urllib.unquote(old)
# I don't think you need this, but maybe i'm mistaken.
# self.links.append(fixed)
# Add more URL to the queue.
self.queue_links(fixed, url)
def queue_links(self, link, url):
"""I guess this method allow recursive download of urls that will
be fetched from the web pages ????
"""
#for link in links: # i changed the argument so now links it just one url.
if link.startswith('/'):
link = "http://" + url.netloc + "/" + link
elif link.startswith('#'):
continue
elif link.startswith('http'):
link = 'http://' + url.netloc + '/' + link
# Add urls extracted from the HTML text to the queue to fetche them
if link.decode('utf-8') not in self.crawled_set:
self.queue.put(link)
def get_make_mal_list():
"""Open various malware and phishing related blacklists and create a list
of URLS from which to compare to the crawled links
"""
hosts1 = "hosts.txt"
hosts2 = "MH-sitelist.txt"
hosts3 = "urls.txt"
mal_list = []
with open(hosts1) as first:
for line1 in first:
link = "http://" + line1.strip()
mal_list.append(link)
with open(hosts2) as second:
for line2 in second:
link = "http://" + line2.strip()
mal_list.append(link)
with open(hosts3) as third:
for line3 in third:
link = "http://" + line3.strip()
mal_list.append(link)
return mal_list
def main():
queue = Queue.Queue()
# Get malicious URLs.
mal_urls = set(get_make_mal_list())
# Create a THREAD_NUMBER thread and start them.
for i in xrange(THREAD_NUMBER):
cr = Crawler(queue, mal_urls)
cr.start()
# Get all url that you want to fetch and put them in the queue.
for url in sys.argv[1:]:
queue.put(url)
# Wait on the queue until everything has been processed.
queue.join()
if __name__ == '__main__':
main()

Small offtopic:
class Crawler(threading.Thread):
def __init__(self):
#you code
threading.Thread.__init__(self)#!!!
don't forget run Thread.__init__(self) directly if you override __init__ function
And, ofcourse, you must use http://docs.python.org/library/queue.html class for implement you job's queue in thread-safe mode

My primary language is C#, but issue you are experiencing is because of threading. In thread #1 you check that list is not empty, while thread #2 clears that list and thus you receive exception.

list is not thread-safe. If you need a thread-safe data structure, use Queue.Queue (Python 2.x) or queue.Queue (Python 3.x).

Also, look on this fragment:
print(self.queue)
while self.count < max_depth:
tgt = self.queue.pop(0)
you do print(self.queue) only before in first while iteration, so, self.queue.pop() can make many iterations (and fetch many links) and raise "cannot pop from empty list" only when queue is really empty!
try this:
while self.count < max_depth:
print(self.queue)
tgt = self.queue.pop(0)
for detect moment when you take exception.

get many pages with pycurl?

I want to get many pages from a website, like
curl "http://farmsubsidy.org/DE/browse?page=[0000-3603]" -o "de.#1"
but get the pages' data in python, not disk files.
Can someone please post pycurl code to do this,
or fast urllib2 (not one-at-a-time) if that's possible,
or else say "forget it, curl is faster and more robust" ? Thanks

So you have 2 problem and let me show you in one example. Notice the pycurl already did the multithreading/not one-at-a-time w/o your hardwork.
#! /usr/bin/env python
import sys, select, time
import pycurl,StringIO
c1 = pycurl.Curl()
c2 = pycurl.Curl()
c3 = pycurl.Curl()
c1.setopt(c1.URL, "http://www.python.org")
c2.setopt(c2.URL, "http://curl.haxx.se")
c3.setopt(c3.URL, "http://slashdot.org")
s1 = StringIO.StringIO()
s2 = StringIO.StringIO()
s3 = StringIO.StringIO()
c1.setopt(c1.WRITEFUNCTION, s1.write)
c2.setopt(c2.WRITEFUNCTION, s2.write)
c3.setopt(c3.WRITEFUNCTION, s3.write)
m = pycurl.CurlMulti()
m.add_handle(c1)
m.add_handle(c2)
m.add_handle(c3)
# Number of seconds to wait for a timeout to happen
SELECT_TIMEOUT = 1.0
# Stir the state machine into action
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Keep going until all the connections have terminated
while num_handles:
# The select method uses fdset internally to determine which file descriptors
# to check.
m.select(SELECT_TIMEOUT)
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Cleanup
m.remove_handle(c3)
m.remove_handle(c2)
m.remove_handle(c1)
m.close()
c1.close()
c2.close()
c3.close()
print "http://www.python.org is ",s1.getvalue()
print "http://curl.haxx.se is ",s2.getvalue()
print "http://slashdot.org is ",s3.getvalue()
Finally, these code is mainly based on an example on the pycurl site =.=
may be you should really read doc. ppl spend huge time on it.

here is a solution based on urllib2 and threads.
import urllib2
from threading import Thread
BASE_URL = 'http://farmsubsidy.org/DE/browse?page='
NUM_RANGE = range(0000, 3603)
THREADS = 2
def main():
for nums in split_seq(NUM_RANGE, THREADS):
t = Spider(BASE_URL, nums)
t.start()
def split_seq(seq, num_pieces):
start = 0
for i in xrange(num_pieces):
stop = start + len(seq[i::num_pieces])
yield seq[start:stop]
start = stop
class Spider(Thread):
def __init__(self, base_url, nums):
Thread.__init__(self)
self.base_url = base_url
self.nums = nums
def run(self):
for num in self.nums:
url = '%s%s' % (self.base_url, num)
data = urllib2.urlopen(url).read()
print data
if __name__ == '__main__':
main()

You can just put that into a bash script inside a for loop.
However you may have better success at parsing each page using python.
http://www.securitytube.net/Crawling-the-Web-for-Fun-and-Profit-video.aspx
You will be able to get at the exact data and save it at the same time into a db.
http://www.securitytube.net/Storing-Mined-Data-from-the-Web-for-Fun-and-Profit-video.aspx

If you want to crawl a website using python, you should have a look to scrapy http://scrapy.org

Using BeautifulSoup4 and requests -
Grab head page:
page = Soup(requests.get(url='http://rootpage.htm').text)
Create an array of requests:
from requests import async
requests = [async.get(url.get('href')) for url in page('a')]
responses = async.map(requests)
[dosomething(response.text) for response in responses]
Requests requires gevent to do this btw.

I can recommend you to user async module of human_curl
Look example:
from urlparse import urljoin
from datetime import datetime
from human_curl.async import AsyncClient
from human_curl.utils import stdout_debug
def success_callback(response, **kwargs):
"""This function call when response successed
"""
print("success callback")
print(response, response.request)
print(response.headers)
print(response.content)
print(kwargs)
def fail_callback(request, opener, **kwargs):
"""Collect errors
"""
print("fail callback")
print(request, opener)
print(kwargs)
with AsyncClient(success_callback=success_callback,
fail_callback=fail_callback) as async_client:
for x in xrange(10000):
async_client.get('http://google.com/', params=(("x", str(x)),)
async_client.get('http://google.com/', params=(("x", str(x)),),
success_callback=success_callback, fail_callback=fail_callback)
Usage very simple. Then page success loaded of failed async_client call you callback. Also you can specify number on parallel connections.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Use scrapy from a py file (not in command line) - python

Related

How to run Scrapy in a while loop

Using WGET or Python to download and rename attachments from CSV requiring basic authentication

urlopen error when scrapy spider is run from a script

Python: Cannot pop from empty list? When list is clearly not empty?

get many pages with pycurl?

Categories

Resources