How can I write a Scrapy spider that runs "forever", so that it starts again whenever it reaches def closed(self, spider)?
closed() is the method that gets called at the end of a run. I tested it by printing a text, and every time the spider finishes I see that text.
But how can I start the spider again after that?
class Spider(scrapy.Spider):
    def start_requests(self):
        # spider code ...

    def closed(self, spider):
        print('END')
The spider starts every round with start_requests() and ends with closed().
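One way to do this, shown in the code below, is to keep a single CrawlerProcess running and chain the crawls on its Deferred, so a new run is scheduled each time the previous one finishes: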
import scrapy
from scrapy.crawler import CrawlerProcess
from twisted.internet import reactor
from twisted.internet.task import deferLater

...

runner = CrawlerProcess(settings={})

def sleep(result, *args, seconds):
    """Non-blocking sleep callback (the previous callback's result arrives as the first argument)."""
    return deferLater(reactor, seconds, lambda: None)

def crash(failure):
    """Errback: log the failure so the restart loop does not die silently."""
    print(failure)

def crawl(result):
    d = runner.crawl(MySpider)
    d.addCallback(lambda results: print('waiting 0 seconds before restart...'))
    d.addErrback(crash)              # handle errors so the loop keeps going
    d.addCallback(sleep, seconds=0)  # non-blocking wait before the next run
    d.addCallback(crawl)             # schedule the next crawl
    return d

crawl(None)
runner.start()
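The important point in this pattern is that runner.start() is only ever called once; each finished crawl's Deferred chains the sleep and then the next crawl(), so the spider keeps restarting without the reactor ever having to restart.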
Related
I am trying to run scrapy as a Python script and want to process the scraped data instead of storing it in a file/database. The code looks like:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor

# spider
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        yield {"html_data": response.text}

# the wrapper to make it run more times
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

configure_logging()
x = run_spider(QuotesSpider)
I want to run the spider when it is called. How can this be done?
As I understand it, you want to crawl the links using scrapy and, instead of storing them in a file, return them to the script.
Using your method:
import scrapy
import scrapy.crawler as crawler
from scrapy.utils.log import configure_logging
from multiprocessing import Process, Queue
from twisted.internet import reactor

all_html_data = []

# spider
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']
    html_data = []

    def parse(self, response):
        self.html_data.append(response.text)

    def close(self, reason):
        global all_html_data
        all_html_data = self.html_data

# the wrapper to make it run more times
def run_spider(spider):
    def f(q):
        try:
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if result:
        raise result

configure_logging()
run_spider(QuotesSpider)
data = all_html_data  # this is the data
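Note that run_spider executes the crawl in a separate Process, so a module-level global assigned in the child process is not visible in the parent. Below is a minimal sketch of a variant that sends the scraped items back through the Queue instead; the same QuotesSpider is assumed and the helper names are illustrative:

import scrapy
import scrapy.crawler as crawler
from scrapy import signals
from multiprocessing import Process, Queue
from twisted.internet import reactor

def run_spider_collect(spider):
    def f(q):
        try:
            items = []

            def collect(item, response, spider):
                items.append(item)

            runner = crawler.CrawlerRunner()
            c = runner.create_crawler(spider)
            c.signals.connect(collect, signal=signals.item_scraped)
            deferred = runner.crawl(c)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(items)  # send the collected items back to the parent process
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if isinstance(result, Exception):
        raise result
    return result

# data = run_spider_collect(QuotesSpider)  # list of {"html_data": ...} dicts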
But if you want to crawl the links and use the HTML response, I think it will be better to use another library like Requests, or to wrap the downloaded body in scrapy's HtmlResponse, like:
import requests
from scrapy.http import HtmlResponse

def parse(url):
    # HtmlResponse does not download anything by itself, so fetch the body first
    body = requests.get(url).text
    response = HtmlResponse(url=url, body=body, encoding='utf-8')
    return response.text

data = parse('http://example.com')
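Wrapping the body in HtmlResponse also gives you the same .css() and .xpath() selectors you would have inside a spider callback, if you need more than the raw text.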
Why do I have more logs with stats than logs with successful celery tasks? Does this mean that some of my old tasks are not killed?
I have the following 8 log lines in the celery worker log file:
Task track_new_items[<id>] succeeded in <time>s: None
But I have the following 19 lines:
[scrapy.statscollectors] INFO: Dumping Scrapy stats:
<stats_dict>
These logs are emitted every time a spider is closed.
I run my task with the following code:
from billiard.context import Process
from celery import shared_task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class CrawlerScript:
    def __init__(self, settings=None):
        if settings is None:
            settings = get_project_settings()
        self.crawler = CrawlerProcess(settings)

    def _crawl(self):
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, spider_cls, *args, **kwargs):
        self.crawler.crawl(spider_cls, *args, **kwargs)

    def run(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()

@shared_task()
def track_new_items():
    crawler = CrawlerScript()
    crawler.crawl(<spider>)
    crawler.run()
I fixed it by passing --max-tasks-per-child 1.
Can anyone explain why, without this argument, celery runs several of the same tasks?
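For reference, a sketch of the worker invocation with that option (the app name here is just a placeholder):

celery -A myproject worker --max-tasks-per-child 1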
I get a twisted.internet.error.ReactorNotRestartable error when I execute the following code:
from time import sleep

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

result = None

def set_result(item):
    result = item

while True:
    process = CrawlerProcess(get_project_settings())
    dispatcher.connect(set_result, signals.item_scraped)

    process.crawl('my_spider')
    process.start()

    if result:
        break
    sleep(3)
It works the first time, then I get the error. I create the process variable each time, so what's the problem?
By default, CrawlerProcess's .start() will stop the Twisted reactor it creates when all crawlers have finished.
You should call process.start(stop_after_crawl=False) if you create process in each iteration.
Another option is to handle the Twisted reactor yourself and use CrawlerRunner. The docs have an example on doing that.
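A minimal sketch of that CrawlerRunner approach, along the lines of the example in the Scrapy docs, reusing the 'my_spider' name from the question:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())
d = runner.crawl('my_spider')        # schedule the crawl; you manage the reactor yourself
d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl is done
reactor.run()                        # blocks here until the crawl finishes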
I was able to solve this problem like this. process.start() should be called only once.
from time import sleep

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

result = None

def set_result(item):
    result = item

process = CrawlerProcess(get_project_settings())
dispatcher.connect(set_result, signals.item_scraped)

process.crawl('my_spider')
process.start()  # the script blocks here; start() is called only once
For a particular process, once you call reactor.run() or process.start(), you cannot call them again, because the reactor cannot be restarted; it stops once the script finishes executing.
So the best option is to use a separate subprocess whenever you need to run the reactor again.
You can move the contents of the while loop into a function (say execute_crawling) and then run it in a separate subprocess each time, using Python's multiprocessing module.
The code is given below.
from multiprocessing import Process

# CrawlerProcess, get_project_settings, dispatcher, signals and set_result
# come from the original script above.

def execute_crawling():
    process = CrawlerProcess(get_project_settings())  # the same can be done with CrawlerRunner
    dispatcher.connect(set_result, signals.item_scraped)
    process.crawl('my_spider')
    process.start()

if __name__ == '__main__':
    for k in range(Number_of_times_you_want):
        p = Process(target=execute_crawling)
        p.start()
        p.join()  # this blocks until the process terminates
Ref http://crawl.blog/scrapy-loop/
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from twisted.internet.task import deferLater

def sleep(self, *args, seconds):
    """Non-blocking sleep callback (the previous callback's result arrives as the first argument)."""
    return deferLater(reactor, seconds, lambda: None)

process = CrawlerProcess(get_project_settings())

def _crawl(result, spider):
    deferred = process.crawl(spider)
    deferred.addCallback(lambda results: print('waiting 100 seconds before restart...'))
    deferred.addCallback(sleep, seconds=100)
    deferred.addCallback(_crawl, spider)
    return deferred

_crawl(None, MySpider)
process.start()
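The key point of this pattern is that process.start() is still called only once; each finished crawl's Deferred waits and then schedules the next _crawl, so the reactor never has to be restarted.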
I faced the ReactorNotRestartable error on AWS Lambda, and then I came to this solution.
By default, the asynchronous nature of scrapy is not going to work well with Cloud Functions, as we would need a way to block on the crawl to prevent the function from returning early and the instance being killed before the process terminates.
Instead, we can use scrapydo to run the existing spider in a blocking fashion:
import scrapy
import scrapy.crawler as crawler
from scrapy.spiders import CrawlSpider
import scrapydo

scrapydo.setup()

# your spider
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            print(quote.css('span.text::text').extract_first())

scrapydo.run_spider(QuotesSpider)
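Note that scrapydo.run_spider blocks until the crawl has finished, which is what lets the Lambda handler return only after the spider is done.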
I was able to mitigate this problem using the crochet package, via this simple code based on Christian Aichinger's answer to the duplicate of this question, Scrapy - Reactor not Restartable.
The initialization of the spiders is done in the main thread, whereas the actual crawling is done in a different thread. I'm using Anaconda (Windows).
import time
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup

class MySpider(scrapy.Spider):
    name = "MySpider"
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/ip']

    def parse(self, response):
        print(response.text)
        for i in range(1, 6):
            time.sleep(1)
            print("Spider " + str(self.name) + " waited " + str(i) + " seconds.")

def run_spider(number):
    crawler = CrawlerRunner()
    crawler.crawl(MySpider, name=str(number))

setup()

for i in range(1, 6):
    time.sleep(1)
    print("Initialization of Spider #" + str(i))
    run_spider(i)
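crochet's setup() starts the Twisted reactor in a background thread, so each run_spider call only schedules a crawl on that already-running reactor; that is why new spiders can keep being launched from the main script without hitting ReactorNotRestartable.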
I had a similar issue using Spyder. Running the file from the command line instead fixed it for me.
Spyder seems to work the first time but after that it doesn't. Maybe the reactor stays open and doesn't close?
I would advise you to run scrapers using the subprocess module:
from subprocess import Popen, PIPE
spider = Popen(["scrapy", "crawl", "spider_name", "-a", "argument=value"], stdout=PIPE)
spider.wait()
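Because stdout is piped, the parent script can also read whatever the spider process printed; using output, _ = spider.communicate() instead of spider.wait() is safer when the output may be large, since a full pipe can otherwise block the child.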
If you're trying to get a Flask, Django, or FastAPI service working and it keeps running into this, and you've tried all the things people suggest about forking a new process to run the reactor and none of it seems to work:
Stop what you're doing and go read this: https://github.com/notoriousno/scrapy-flask
Crochet is your best opportunity to get this working within gunicorn without writing your own crawler from scratch.
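A minimal sketch of the crochet pattern that repository describes; the spider, route and timeout here are illustrative, not taken from the original post:

import crochet
crochet.setup()  # start the Twisted reactor in a background thread, once, at import time

import scrapy
from flask import Flask
from scrapy.crawler import CrawlerRunner

class MySpider(scrapy.Spider):
    # illustrative spider; substitute your own
    name = 'my_spider'
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        self.logger.info('fetched %s', response.url)

app = Flask(__name__)
runner = CrawlerRunner()

@crochet.wait_for(timeout=60.0)
def run_crawl():
    # runner.crawl returns a Deferred; crochet blocks the calling Flask thread until it fires
    return runner.crawl(MySpider)

@app.route('/crawl')
def crawl_endpoint():
    run_crawl()
    return 'done'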
My way is multiprocessing, using Process.
# create spider
import scrapy

class PricesSpider(scrapy.Spider):
    name = 'prices'
    allowed_domains = ['index.minfin.com.ua']
    start_urls = ['https://index.minfin.com.ua/ua/markets/fuel/tm/']

    def parse(self, response):
        pass
Then I create a function which runs my spider:
# run spider
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

def parser():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(PricesSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Then I create a new Python file, import the parser function there, and create a schedule for my spider:
# create schedule for spider
import time

import schedule
from spider_runner import parser  # module name assumed: import parser from the file where it is defined
from multiprocessing import Process

def worker(pars):
    print('Worker starting')
    pr = Process(target=parser)
    pr.start()
    pr.join()

def main():
    schedule.every().day.at("15:00").do(worker, parser)
    # schedule.every().day.at("20:21").do(worker, parser)
    # schedule.every().day.at("20:23").do(worker, parser)
    # schedule.every(1).minutes.do(worker, parser)
    print('Spider working now')
    while True:
        schedule.run_pending()
        time.sleep(1)  # avoid busy-waiting between schedule checks

if __name__ == '__main__':
    main()
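Because every scheduled run crawls inside a fresh Process, each run gets its own Twisted reactor, which is what avoids the ReactorNotRestartable error.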
When I call my spider through a Python script such as the following:
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')

from twisted.internet import reactor
from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings
from scrapy.xlib.pydispatch import dispatcher
from spiders.image import aqaqspider

def stop_reactor():
    reactor.stop()

dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = aqaqspider(domain='aqaq.com')
crawler = Crawler(CrawlerSettings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
log.msg('Running reactor...')
reactor.run()  # the script will block here until the spider is closed
log.msg('Reactor stopped.')
my JSON file is not being created. My pipelines.py has the following code:
import json
import codecs

class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open('scraped_data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()
When I call my spider from the command line with scrapy crawl, it works fine, i.e. the JSON file is created.
Please help me, I am new to scrapy.
Thank you all!
I have found the solution....