Empty output file while crawling - python

I know I already asked a similar question, but this is a new spider and I have the same problem (it crawls the data successfully but cannot scrape it or write it into a CSV)... I put my other spider here with an example of the output I should have and all the info I usually need to get an output file... Is there anybody who could help me, please? I have to finish this spider by Friday, so I'm in a hurry!!
The strange thing is that my Fnac.csv is created but always empty... So I tried to run my spider directly on an example of a page I want to crawl, and I got all the info I need... So I don't understand... Maybe the problem comes just from my Rules or something?
My spider:
# -*- coding: utf-8 -*-
# Every import is done for a specific use
import scrapy # Once you have downloaded scrapy, you have to import it in your code to use it.
import re # To use the .re() function, which extracts just a part of the text you crawl. It uses regex (regular expressions).
import numbers # To use mathematical things, in this case: numbers.
from fnac.items import FnacItem # To return the items you want. Each item has a space allocated in memory, created in the items.py file, which is in the second cdiscount_test directory.
from urllib.request import urlopen # To use urlopen, which allows the spider to find the links in a page that is inside the actual page.
from scrapy.spiders import CrawlSpider, Rule # To use rules and LinkExtractor, which allow the spider to follow every url on the page you crawl.
from scrapy.linkextractors import LinkExtractor # Look above.
from bs4 import BeautifulSoup # To crawl an iframe, which is a page inside a page in web programming.

# Your spider
class Fnac(CrawlSpider):
    name = 'FnacCom' # Name of your spider. You call it in the anaconda prompt.
    allowed_domains = ['fnac.com'] # Web domains allowed by you; your spider cannot enter a page which is not in that domain.
    start_urls = ['https://www.fnac.com/Index-Vendeurs-MarketPlace/A/'] # The first link you crawl.

    # To allow your spider to follow the urls that are on the actual page.
    rules = (
        Rule(LinkExtractor(), callback='parse_start_url'),
    )

    # Your function that crawls the actual page you're on.
    def parse_start_url(self, response):
        item = FnacItem() # The spider now knows that the items you want have to be stored in the item variable.
        # First data you want, which are on the actual page.
        nb_sales = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').re(r'([\d]*) ventes')
        country = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').re(r'([A-Z].*)')
        # To store the data in their right places.
        item['nb_sales'] = ''.join(nb_sales).strip()
        item['country'] = ''.join(country).strip()
        # Find a specific link on the actual page and launch this function on it. It's the place where you will find your first two data.
        test_list = response.xpath('//a/@href')
        for test_list in response.xpath('.//div[@class="ProductPriceBox-item detail"]'):
            temporary = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
            for i in range(len(temporary)):
                scrapy.Request(temporary[i], callback=self.parse_start_url, meta={'dont_redirect': True, 'item': item})
        # To find the iframe on a page, launch the next function.
        yield scrapy.Request(response.url, callback=self.parse_iframe, meta={'dont_redirect': True, 'item': item})

    # Your function that crawls the iframe on a page
    def parse_iframe(self, response):
        f_item1 = response.meta['item'] # Just to use the same item location you used above.
        # Find all the iframes on a page.
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        # If there's at least one iframe, launch the next function on it.
        if (len(iframexx) != 0):
            for iframe in iframexx:
                yield scrapy.Request(iframe.attrs['src'], callback=self.extract_or_loop, meta={'dont_redirect': True, 'item': f_item1})
        # If there's no iframe, launch the next function on the link of the page where you looked for the potential iframe.
        else:
            yield scrapy.Request(response.url, callback=self.extract_or_loop, meta={'dont_redirect': True, 'item': f_item1})

    # Function to find the other data.
    def extract_or_loop(self, response):
        f_item2 = response.meta['item'] # Just to use the same item location you used above.
        # The rest of the data you want.
        address = response.xpath('//body//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//body//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//body//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*\s*)')
        phone = response.xpath('//body//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//body//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//body//div/text()').re(r'.*TVA \: (.*)')
        # If the name of the seller exists, then return the data.
        if (len(name) != 0):
            f_item2['name'] = ''.join(name).strip()
            f_item2['address'] = ''.join(address).strip()
            f_item2['phone'] = ''.join(phone).strip()
            f_item2['email'] = ''.join(email).strip()
            f_item2['vat'] = ''.join(vat).strip()
            f_item2['siret'] = ''.join(siret).strip()
            yield f_item2
        # If not, there was no data on the page and you have to find all the links on your page and launch the first function on them.
        else:
            for sel in response.xpath('//html/body'):
                list_urls = sel.xpath('//a/@href').extract()
                list_iframe = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract()
                if (len(list_iframe) != 0):
                    for list_iframe in list_urls:
                        yield scrapy.Request(list_iframe, callback=self.parse_start_url, meta={'dont_redirect': True})
                for url in list_urls:
                    yield scrapy.Request(response.urljoin(url), callback=self.parse_start_url, meta={'dont_redirect': True})
My settings:
BOT_NAME = 'fnac'
SPIDER_MODULES = ['fnac.spiders']
NEWSPIDER_MODULE = 'fnac.spiders'
DOWNLOAD_DELAY = 2
COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'fnac.pipelines.FnacPipeline': 300,
}
My pipeline:
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.exporters import CsvItemExporter

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Define your output file.
class FnacPipeline(CsvItemExporter):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        f = open('..\\..\\..\\..\\Fnac.csv', 'w').close()
        file = open('..\\..\\..\\..\\Fnac.csv', 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
My items:
# -*- coding: utf-8 -*-
import scrapy

# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

class FnacItem(scrapy.Item):
    # define the fields for your items:
    # name = scrapy.Field()
    name = scrapy.Field()
    nb_sales = scrapy.Field()
    country = scrapy.Field()
    address = scrapy.Field()
    siret = scrapy.Field()
    vat = scrapy.Field()
    phone = scrapy.Field()
    email = scrapy.Field()
The command I write in my prompt to run the spider is:
scrapy crawl FnacCom
An example of the output is:
2017-08-08 10:21:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Panasonic/TV-par-marque/nsh474980/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:21:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Philips/TV-par-marque/nsh474981/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:21:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Sony/TV-par-marque/nsh475001/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-LG/TV-par-marque/nsh474979/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Samsung/TV-par-marque/nsh474984/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-marque/shi474972/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-prix/shi474946/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-taille-d-ecran/shi474945/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-Technologie/shi474944/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Smart-TV-TV-connectee/TV-par-Technologie/nsh474953/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-QLED/TV-par-Technologie/nsh474948/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-4K-UHD/TV-par-Technologie/nsh474947/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Toutes-les-TV/TV-Television/nsh474940/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:26 [scrapy.extensions.logstats] INFO: Crawled 459 pages (at 24 pages/min), scraped 0 items (at 0 items/min)
2017-08-08 10:22:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/shi474914/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/partner/canalplus#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Meilleures-ventes-TV/TV-Television/nsh474942/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Toutes-nos-Offres/Offres-de-remboursement/shi159784/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Offres-Adherents/Toutes-nos-Offres/nsh81745/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/labofnac#bl=MMtvh#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Lecteur-et-Enregistreur-DVD-Blu-Ray/Lecteur-DVD-Blu-Ray/shi475063/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-OLED/TV-par-Technologie/nsh474949/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Lecteur-DVD-Portable/Lecteur-et-Enregistreur-DVD-Blu-Ray/nsh475064/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Home-Cinema/Home-Cinema-par-marque/shi475116/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Univers-TV/Univers-Ecran-plat/cl179/w-4#bl=MMtvh> (referer: https://www.fnac.com)
2017-08-08 10:22:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Casque-TV-HiFi/Casque-par-usage/nsh450507/w-4#bl=MMtvh> (referer: https://www.fnac.com)
Thank you so much for your help!!!

I wrote a small refactor to show how the spider could be written explicitly, without using CrawlSpider, and using common Scrapy idioms:
from scrapy import Spider, Request
from fnac.items import FnacItem

class Fnac(Spider):
    name = 'fnac.com'
    allowed_domains = ['fnac.com']
    start_urls = ['https://www.fnac.com/Index-Vendeurs-MarketPlace/0/']  # The first link you crawl.

    def parse(self, response):
        # parse sellers
        sellers = response.xpath("//h1[contains(text(),'MarketPlace')]/following-sibling::ul/li/a/@href").extract()
        for url in sellers:
            yield Request(url, callback=self.parse_seller)
        # parse other pages A-Z
        pages = response.css('.pagerletter a::attr(href)').extract()
        for url in pages:
            yield Request(url, callback=self.parse)

    def parse_seller(self, response):
        nb_sales = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').re(r'([\d]*) ventes')
        country = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').re(r'([A-Z].*)')
        item = FnacItem()
        # To store the data in their right places.
        item['nb_sales'] = ''.join(nb_sales).strip()
        item['country'] = ''.join(country).strip()
        # go to the details page now
        details_url = response.xpath("//iframe/@src[contains(.,'retour')]").extract_first()
        yield Request(details_url, self.parse_seller_details,
                      meta={'item': item})  # carry over our item to the next response

    def parse_seller_details(self, response):
        item = response.meta['item']  # get the item that got filled in `parse_seller`
        address = response.xpath('//body//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//body//div/ul/li[contains(text(),"@")]/text()').extract()
        # parse here
        yield item
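As a side note, to check that this refactor actually produces items you could bypass the custom pipeline entirely and let Scrapy's built-in CSV feed export write the file (the output file name here is just an example):
scrapy crawl fnac.com -o Fnac.csv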

Related

Why doesn't a callback get executed immediately upon calling yield in Scrapy?

I am building a web scraper to scrape remote jobs. The spider behaves in a way that I don't understand and I'd appreciate it if someone could explain why.
Here's the code for the spider:
import scrapy
import time

class JobsSpider(scrapy.Spider):
    name = "jobs"
    start_urls = [
        "https://stackoverflow.com/jobs/remote-developer-jobs"
    ]
    already_visited_links = []

    def parse(self, response):
        jobs = response.xpath("//div[contains(@class, 'job')]")
        links_to_next_pages = response.xpath("//a[contains(@class, 's-pagination--item')]").css("a::attr(href)").getall()
        # visit each job page (as I do in the browser) and scrape the relevant information (job title etc.)
        for job in jobs:
            job_id = int(job.xpath('@data-jobid').extract_first())  # there will always be one element
            # now visit the link with the job_id and get the info
            job_link_to_visit = "https://stackoverflow.com/jobs?id=" + str(job_id)
            request = scrapy.Request(job_link_to_visit,
                                     callback=self.parse_job)
            yield request
        # sleep for 10 seconds before requesting the next page
        print("Sleeping for 10 seconds...")
        time.sleep(10)
        # go to the next job listings page (if you haven't already been there)
        # not sure if this solution is the best since it has a loop which has a recursion in it
        for link_to_next_page in links_to_next_pages:
            if link_to_next_page not in self.already_visited_links:
                self.already_visited_links.append(link_to_next_page)
                yield response.follow(link_to_next_page, callback=self.parse)
        print("End of parse method")

    def parse_job(self, response):
        print(response.body)
        print("Sleeping for 10 seconds...")
        time.sleep(10)
        pass
Here's the output (the relevant parts):
Sleeping for 10 seconds...
End of parse method
2021-04-29 20:49:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=525754> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:49:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=525748> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:49:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=497114> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:49:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=523136> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:49:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=525730> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
In parse_job
2021-04-29 20:50:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs/remote-developer-jobs?so_source=JobSearch&so_medium=Internal> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=523319> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=522480> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:05 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=511761> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=522483> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=249610> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
2021-04-29 20:50:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stackoverflow.com/jobs?id=522481> (referer: https://stackoverflow.com/jobs/remote-developer-jobs)
In parse_job
In parse_job
In parse_job
In parse_job
...
I don't understand why the parse method gets executed fully before the parse_job method gets called. From my understanding, as soon as I yield a job from jobs, the parse_job method should get called. The spider should go over each page of job listings and visit the details of each individual job on that listings page. However, the description I just gave doesn't match the output. I also don't understand why there are multiple GET requests between each call to the parse_job method.
Can someone explain what is going on here?
Scrapy is event driven. First, requests are queued by the Scheduler. Queued requests are passed to the Downloader. The callback function is called when the response is downloaded and ready, and the response is passed as its first argument.
You are blocking the callbacks by using time.sleep(). In the logs shown, after the first callback call the procedure was blocked for 10 seconds in parse_job(), but at the same time the Downloader kept working and preparing responses for the callback, as the successive DEBUG: Crawled (200) lines after the first parse_job() call show. So, while the callback was blocked, the Downloader finished its job and the responses were queued up to be fed to the callback. As the last part of the logs shows, passing responses to the callback function became the bottleneck.
If you want to put a delay between requests, it's better to use the DOWNLOAD_DELAY setting instead of time.sleep().
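For example, a minimal sketch of how that setting could be applied to this spider only; the 10-second value just mirrors the time.sleep(10) from the question:
import scrapy

class JobsSpider(scrapy.Spider):
    name = "jobs"
    start_urls = ["https://stackoverflow.com/jobs/remote-developer-jobs"]
    # Scrapy inserts this delay between consecutive requests itself,
    # so callbacks are never blocked the way time.sleep() blocks them.
    custom_settings = {
        "DOWNLOAD_DELAY": 10,
    }

    def parse(self, response):
        # same parsing logic as above, just without the time.sleep() calls
        ...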
Take a look at this for more details about Scrapy architecture.

Can't make my first spider run, any advice?

This is my first time using Scrapy and maybe my third time using Python, so I'm a noob.
The problem with this code is that it doesn't even enter the page.
I have tried to use:
scrapy shell 'https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico'
This works and then using...
response.xpath('//*[@class="product__varianttitle ui-text--small"]')
... I can retrieve information.
My code:
import scrapy

class ZooplusSpider(scrapy.Spider):
    name = 'Zooplus'
    allowed_domains = ['zooplus.es']
    start_urls = ['https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico']

    def parse(self, response):
        item = scrapy.Item()
        item['nombre'] = response.xpath('//*[@class="product__varianttitle ui-text--small"]')
        item['preciooriginal'] = response.xpath('//*[@class="product__prices_col prices"]')
        item['preciorebaja'] = response.xpath('//*[@class="product__specialprice__text"]')
        return item
The error message says:
2019-08-30 21:16:57 [scrapy.core.engine] INFO: Spider opened
2019-08-30 21:16:57 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-08-30 21:16:57 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-08-30 21:16:57 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.zooplus.es/robots.txt> (referer: None)
2019-08-30 21:16:57 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico> from <GET https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico/>
2019-08-30 21:16:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico> (referer: None)
2019-08-30 21:16:58 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.zooplus.es/shop/tienda_perros/pienso_perros/pienso_hipoalergenico> (referer: None)
I think you haven't defined the fields in your items.py;
the error is coming from item['nombre'].
Either define the fields in items.py or simply replace
item = scrapy.Item()
with item = dict()
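For instance, a minimal items.py sketch, assuming you keep the three field names used in the spider (the class name ZooplusItem is just an example):
import scrapy

class ZooplusItem(scrapy.Item):
    # one Field per key assigned in the spider
    nombre = scrapy.Field()
    preciooriginal = scrapy.Field()
    preciorebaja = scrapy.Field()
Then create the item in the spider with item = ZooplusItem() instead of scrapy.Item().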

scraping site logos

I have sites and I want to scrape their logos.
PROBLEM:
I have an outer class in which I save all the data about the logos - urls, links - and everything is working correctly:
class PatternUrl:
    def __init__(self, path_to_img="", list_of_conditionals=[]):
        self.url_pattern = ""
        self.file_url = ""
        self.path_to_img = path_to_img
        self.list_of_conditionals = list_of_conditionals

    def find_obj(self, response):
        for el in self.list_of_conditionals:
            if el:
                if self.path_to_img:
                    url = response
                    file_url = str(self.path_to_img)
                    print(file_url)
                    yield LogoScrapeItem(url=url, file_url=file_url)

class LogoSpider(scrapy.Spider):
    ....
    def parse(self, response):
        a = PatternUrl(response.css("header").xpath("//a[@href='" + response.url + '/' + "']/img/@src").extract_first(), [response.css("header").xpath("//a[@href='" + response.url + '/' + "']")])
        a.find_obj(response)
The problem is in the yield line
yield LogoScrapeItem(url=url, file_url=file_url)
For some reason, when I comment this line out, all the lines in this method are executed.
Output when yield is commented out:
#yield LogoScrapeItem(url=url, file_url=file_url)
2017-12-25 11:09:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://time.com> (referer: None)
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAKQAAAAyCAYAAAD........
2017-12-25 11:09:32 [scrapy.core.engine] INFO: Closing spider (finished)
2017-12-25 11:09:32 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
Output when yield is not commented out:
yield LogoScrapeItem(url=url, file_url=file_url)
2017-12-25 11:19:28 [scrapy.core.engine] INFO: Spider opened
2017-12-25 11:19:28 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-12-25 11:19:28 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
2017-12-25 11:19:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://git-scm.com/robots.txt> (referer: None)
2017-12-25 11:19:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://git-scm.com/docs/git-merge> (referer: None)
2017-12-25 11:19:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://time.com/robots.txt> (referer: None)
2017-12-25 11:19:29 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://time.com> (referer: None)
2017-12-25 11:19:29 [scrapy.core.engine] INFO: Closing spider (finished)
2017-12-25 11:19:29 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 926,
QUESTION:
The function is not executed when there is a yield statement. Why?
yield is designed to produce a generator.
It looks like you should run your find_obj as:
for x in a.find_obj(response):
instead.
For details on yield, please see What does the "yield" keyword do?
Your find_obj method is actually a generator because of the yield keyword. For a thorough explanation of generators and yield I recommend this StackOverflow question.
In order to get results from your method, you should call it in a manner similar to this:
for logo_scrape_item in a.find_obj(response):
    # perform an action on your logo_scrape_item
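Concretely, a minimal sketch of how the spider's parse method could consume that generator and hand the items to Scrapy (the PatternUrl constructor arguments are elided; they stay exactly as in the question):
def parse(self, response):
    a = PatternUrl(...)  # constructed exactly as in the original parse method
    # loop over the generator and re-yield each LogoScrapeItem so Scrapy collects it
    for logo_scrape_item in a.find_obj(response):
        yield logo_scrape_item
    # equivalently, on Python 3: yield from a.find_obj(response)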

Get all URLs in an entire site using Scrapy

folks!
I'm trying to get all the internal URLs of an entire site for SEO purposes, and I recently discovered Scrapy to help me with this task. But my code always returns an error:
2017-10-11 10:32:00 [scrapy.core.engine] INFO: Spider opened
2017-10-11 10:32:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-10-11 10:32:00 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-10-11 10:32:01 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.**test**.com/> from <GET http://www.**test**.com/robots.txt>
2017-10-11 10:32:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.**test**.com/> (referer: None)
2017-10-11 10:32:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.**test**.com/> from <GET http://www.**test**.com>
2017-10-11 10:32:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.**test**.com/> (referer: None)
2017-10-11 10:32:03 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.**test**.com/> (referer: None)
Traceback (most recent call last):
File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\python27\lib\site-packages\scrapy\spiders\__init__.py", line 90, in parse
raise NotImplementedError
NotImplementedError
I changed the original URL.
Here's the code I'm running:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = ["http://www.test.com"]
    start_urls = ["http://www.test.com"]
    rules = [Rule(LinkExtractor(allow=['.*']))]
Thanks!
EDIT:
This worked for me:
rules = (
    Rule(LinkExtractor(), callback='parse_item', follow=True),
)

def parse_item(self, response):
    filename = response.url
    arquivo = open("file.txt", "a")
    string = str(filename)
    arquivo.write(string + '\n')
    arquivo.close()
=D
The error you are getting is caused by the fact that you haven't defined a parse method in your spider, which is mandatory if you base your spider on the scrapy.Spider class.
For your purpose (i.e. crawling a whole website) it's best to base your spider on the scrapy.CrawlSpider class. Also, in Rule, you have to define the callback attribute as the method that will parse every page you visit. One last cosmetic change: in LinkExtractor, if you want to visit every page, you can leave out allow, as its default value is an empty tuple, which means it will match all links found.
Consult a CrawlSpider example for concrete code.
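A rough sketch of how those pieces could fit together (the spider name, callback name, and yielded dict are placeholders, not taken from the question):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SiteSpider(CrawlSpider):
    name = "site"
    allowed_domains = ["test.com"]  # bare domain, no scheme
    start_urls = ["http://www.test.com"]

    rules = (
        # an empty LinkExtractor() matches every link; follow=True keeps crawling
        Rule(LinkExtractor(), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        yield {"url": response.url}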

SgmlLinkExtractor 'allow' definition not working with Scrapy

I am using Python.org version 2.7 64-bit on Windows Vista 64-bit. I have the following Scrapy code, where the way I have defined the SgmlLinkExtractor is not crawling the site correctly:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
import time

class ExampleSpider(CrawlSpider):
    name = "goal3"
    allowed_domains = ["whoscored.com"]
    start_urls = ["http://www.whoscored.com"]
    download_delay = 1

    #rules = [Rule(SgmlLinkExtractor(allow=()),
    #              follow=True),
    #         Rule(SgmlLinkExtractor(allow=()), callback='parse_item')
    #         ]

    rules = [
        Rule(
            SgmlLinkExtractor(allow=('Regions/252/Tournaments/2',)),
            callback='parse_item',
            follow=True,
        )
    ]

    def parse_item(self, response):
        self.log('A response from %s just arrived!' % response.url)
        scripts = response.selector.xpath("normalize-space(//title)")
        for scripts in scripts:
            body = response.xpath('//p').extract()
            body2 = "".join(body)
            print remove_tags(body2).encode('utf-8')

execute(['scrapy', 'crawl', 'goal3'])
I've tried a few different versions of the SgmlLinkExtractor definition, yet all that seems to get printed to the command shell is the following:
Contact Us | About Us | Glossary | Privacy Policy | WhoScored Ratings
Copyright © 2014 WhoScored.com
2014-07-20 00:14:38+0100 [goal3] DEBUG: Filtered duplicate request: <GET http://www.whoscored.com/Statistics> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2014-07-20 00:14:40+0100 [goal3] DEBUG: Crawled (200) <GET http://www.whoscored.com/Statistics/Teams> (referer: http://www.whoscored.com/Statistics)
2014-07-20 00:14:40+0100 [goal3] DEBUG: A response from http://www.whoscored.com/Statistics/Teams just arrived!
Contact Us | About Us | Glossary | Privacy Policy | WhoScored Ratings
Copyright © 2014 WhoScored.com
2014-07-20 00:14:41+0100 [goal3] DEBUG: Redirecting (302) to <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/3> from <GET http://www.whoscored.com/Statistics/3>
2014-07-20 00:14:42+0100 [goal3] DEBUG: Redirecting (302) to <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/2> from <GET http://www.whoscored.com/Statistics/2>
2014-07-20 00:14:43+0100 [goal3] DEBUG: Redirecting (302) to <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/1> from <GET http://www.whoscored.com/Statistics/1>
2014-07-20 00:14:45+0100 [goal3] DEBUG: Crawled (200) <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/3> (referer: http://www.whoscored.com/Statistics/Teams)
2014-07-20 00:14:45+0100 [goal3] DEBUG: A response from http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/3 just arrived!
2014-07-20 00:14:46+0100 [goal3] DEBUG: Crawled (200) <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/2> (referer: http://www.whoscored.com/Statistics/Teams)
2014-07-20 00:14:46+0100 [goal3] DEBUG: A response from http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/2 just arrived!
2014-07-20 00:14:47+0100 [goal3] DEBUG: Crawled (200) <GET http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/1> (referer: http://www.whoscored.com/Statistics/Teams)
2014-07-20 00:14:47+0100 [goal3] DEBUG: A response from http://www.whoscored.com/404.html?aspxerrorpath=/Statistics/1 just arrived!
Can anyone see anything obvious here as to why this is not working?
Thanks
