process_item pipeline not called - python

I'm new to Python and Scrapy.
After the scraping process I tried to save the scraped data to SQLite,
following this source: https://github.com/sunshineatnoon/Scrapy-Amazon-Sqlite
My problem is that the database is created successfully, but items can't be inserted because process_item is never called.
EDIT
I've pasted the source code from the GitHub link above.
settings.py
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3
import os

con = None

class AmazonPipeline(object):
    def __init__(self):
        self.setupDBCon()
        self.createTables()

    def process_item(self, item, spider):
        print('---------------process item----')
        self.storeInDb(item)
        return item

    def setupDBCon(self):
        self.con = sqlite3.connect(os.getcwd() + '/test.db')
        self.cur = self.con.cursor()

    def createTables(self):
        self.dropAmazonTable()
        self.createAmazonTable()

    def dropAmazonTable(self):
        # drop amazon table if it exists
        self.cur.execute("DROP TABLE IF EXISTS Amazon")

    def closeDB(self):
        self.con.close()

    def __del__(self):
        self.closeDB()

    def createAmazonTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Amazon(id INTEGER PRIMARY KEY NOT NULL, \
            name TEXT, \
            path TEXT, \
            source TEXT \
            )")
        self.cur.execute("INSERT INTO Amazon(name, path, source ) VALUES( 'Name1', 'Path1', 'Source1')")
        print('------------------------')
        self.con.commit()

    def storeInDb(self, item):
        # self.cur.execute("INSERT INTO Amazon(\
        #     name, \
        #     path, \
        #     source \
        #     ) \
        #     VALUES( ?, ?, ?)", \
        #     ( \
        #     item.get('Name', ''),
        #     item.get('Path', ''),
        #     item.get('Source', '')
        #     ))
        self.cur.execute("INSERT INTO Amazon(name, path, source ) VALUES( 'Name1', 'Path1', 'Source1')")
        print('------------------------')
        print('Data Stored in Database')
        print('------------------------')
        self.con.commit()
spiders/amazonspider.py
import scrapy
import urllib
from amazon.items import AmazonItem
import os

class amazonSpider(scrapy.Spider):
    imgcount = 1
    name = "amazon"
    allowed_domains = ["amazon.com"]
    '''
    start_urls = ["http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=backpack",
                  "http://www.amazon.com/s/ref=sr_pg_2?rh=i%3Aaps%2Ck%3Abackpack&page=2&keywords=backpack&ie=UTF8&qid=1442907452&spIA=B00YCRMZXW,B010HWLMMA"
                  ]
    '''

    def start_requests(self):
        yield scrapy.Request("http://www.amazon.com/s/ref=sr_ex_n_3?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&bbn=10445813011&ie=UTF8&qid=1442910853&ajr=0", self.parse)
        for i in range(2, 3):
            yield scrapy.Request("http://www.amazon.com/s/ref=lp_360832011_pg_2?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&page="+str(i)+"&bbn=10445813011&ie=UTF8&qid=1442910987", self.parse)

    def parse(self, response):
        #namelist = response.xpath('//a[@class="a-link-normal s-access-detail-page a-text-normal"]/@title').extract()
        #htmllist = response.xpath('//a[@class="a-link-normal s-access-detail-page a-text-normal"]/@href').extract()
        #imglist = response.xpath('//a[@class="a-link-normal a-text-normal"]/img/@src').extract()
        namelist = response.xpath('//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis a-text-normal"]/@title').extract()
        htmllist = response.xpath('//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis a-text-normal"]/@href').extract()
        imglist = response.xpath('//img[@class="s-access-image cfMarker"]/@src').extract()
        listlength = len(namelist)

        pwd = os.getcwd()+'/'
        if not os.path.isdir(pwd+'crawlImages/'):
            os.mkdir(pwd+'crawlImages/')

        for i in range(0, listlength):
            item = AmazonItem()
            item['Name'] = namelist[i]
            item['Source'] = htmllist[i]
            urllib.urlretrieve(imglist[i], pwd+"crawlImages/"+str(amazonSpider.imgcount)+".jpg")
            item['Path'] = pwd+"crawlImages/"+str(amazonSpider.imgcount)+".jpg"
            amazonSpider.imgcount = amazonSpider.imgcount + 1
            yield item
Result
After running scrapy crawl amazon,
test.db is created but no items are inserted (I've checked test.db), which means process_item was never run.
build result
2018-09-18 16:38:38 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: amazon)
2018-09-18 16:38:38 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 17.5.0, Python 3.6.5 |Anaconda, Inc.| (default, Apr 26 2018, 08:42:37) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2o 27 Mar 2018), cryptography 2.2.2, Platform Darwin-17.7.0-x86_64-i386-64bit
2018-09-18 16:38:38 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'amazon', 'NEWSPIDER_MODULE': 'amazon.spiders', 'SPIDER_MODULES': ['amazon.spiders']}
2018-09-18 16:38:38 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2018-09-18 16:38:38 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2018-09-18 16:38:38 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
------------------------
2018-09-18 16:38:38 [scrapy.middleware] INFO: Enabled item pipelines:
['amazon.pipelines.AmazonPipeline']
2018-09-18 16:38:38 [scrapy.core.engine] INFO: Spider opened
2018-09-18 16:38:38 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-09-18 16:38:38 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-09-18 16:38:38 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.amazon.com/s/ref=lp_360832011_pg_2?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&page=2&bbn=10445813011&ie=UTF8&qid=1442910987> from <GET http://www.amazon.com/s/ref=lp_360832011_pg_2?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&page=2&bbn=10445813011&ie=UTF8&qid=1442910987>
2018-09-18 16:38:38 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.amazon.com/s/ref=sr_ex_n_3?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&bbn=10445813011&ie=UTF8&qid=1442910853&ajr=0> from <GET http://www.amazon.com/s/ref=sr_ex_n_3?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&bbn=10445813011&ie=UTF8&qid=1442910853&ajr=0>
2018-09-18 16:38:39 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.amazon.com/backpacks/b?ie=UTF8&node=360832011> from <GET https://www.amazon.com/s/ref=sr_ex_n_3?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&bbn=10445813011&ie=UTF8&qid=1442910853&ajr=0>
2018-09-18 16:38:39 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.amazon.com/Backpacks-Luggage-Travel-Gear/s?ie=UTF8&page=2&rh=n%3A360832011> from <GET https://www.amazon.com/s/ref=lp_360832011_pg_2?rh=n%3A7141123011%2Cn%3A10445813011%2Cn%3A9479199011%2Cn%3A360832011&page=2&bbn=10445813011&ie=UTF8&qid=1442910987>
2018-09-18 16:38:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/Backpacks-Luggage-Travel-Gear/s?ie=UTF8&page=2&rh=n%3A360832011> (referer: None)
2018-09-18 16:38:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/backpacks/b?ie=UTF8&node=360832011> (referer: None)
2018-09-18 16:38:41 [scrapy.core.engine] INFO: Closing spider (finished)
2018-09-18 16:38:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1909,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 6,
'downloader/response_bytes': 140740,
'downloader/response_count': 6,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/301': 4,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 9, 18, 9, 38, 41, 53948),
'log_count/DEBUG': 7,
'log_count/INFO': 7,
'memusage/max': 52600832,
'memusage/startup': 52600832,
'response_received_count': 2,
'scheduler/dequeued': 6,
'scheduler/dequeued/memory': 6,
'scheduler/enqueued': 6,
'scheduler/enqueued/memory': 6,
'start_time': datetime.datetime(2018, 9, 18, 9, 38, 38, 677280)}
2018-09-18 16:38:41 [scrapy.core.engine] INFO: Spider closed (finished)
I've been searching around but have had no luck.
Thanks

If I crawl
https://www.amazon.com/backpacks/b?ie=UTF8&node=360832011
I don't get any results in namelist or htmllist; only imglist is filled.
Checking the HTML code:
... <a class="a-link-normal s-access-detail-page s-overflow-ellipsis s-color-twister-title-link a-text-normal" ...
I found an additional "s-color-twister-title-link" class, so your exact-match XPath does not match anything. You can add the s-color-twister-title-link class:
In [9]: response.xpath('//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis s-color-twister-title-link a-text-normal"]/@title').extract()
Out[9]:
['Anime Anti-theft Backpack, Luminous School Bag, Waterproof Laptop Backpack with USB Charging Port, Unisex 15.6 Inch College Daypack, Starry',
 'Anime Luminous Backpack Noctilucent School Bags Daypack USB chargeing Port Laptop Bag Handbag for Boys Girls Men Women',
or you can use a more robust selector like:
response.xpath('//a[contains(@class, "s-access-detail-page")]/@title').extract()
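A minimal sketch of how the spider's parse() could use these contains()-based selectors so that items actually get yielded (the class names are taken from the HTML above and may change whenever Amazon updates its markup, so treat them as assumptions):

def parse(self, response):
    # contains() keeps matching even if Amazon adds extra classes to the anchor
    namelist = response.xpath('//a[contains(@class, "s-access-detail-page")]/@title').extract()
    htmllist = response.xpath('//a[contains(@class, "s-access-detail-page")]/@href').extract()
    imglist = response.xpath('//img[contains(@class, "s-access-image")]/@src').extract()

    for name, url, img in zip(namelist, htmllist, imglist):
        item = AmazonItem()
        item['Name'] = name
        item['Source'] = url
        item['Path'] = img  # or download the image first, as the original spider does
        yield item

Once parse() actually yields items, Scrapy passes each one to AmazonPipeline.process_item, which is the call that was missing.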

Related

Custom Files Pipeline in Scrapy never downloads files even though logs show all functions being accessed

I have the following custom pipeline for downloading JSON files. It was functioning fine until I needed to add the __init__ function, in which I subclass the FilesPipeline class in order to add a few new properties. The pipeline takes URLs that point to API endpoints and downloads their responses. The folders are properly created when running the spider via scrapy crawl myspider, and the two print statements in the file_path function show the correct values (filename and filepath). However, the files are never actually downloaded.
I did find a few similar questions about custom file pipelines and files not downloading (here, where the solution was to yield the items instead of returning them, and here, where the solution was to adjust the ROBOTSTXT_OBEY setting), but those solutions did not work for me.
What am I doing wrong (or forgetting to do) when subclassing the FilesPipeline? I've been racking my brain over this issue for a good 3 hours and my google-fu has not yielded any resolutions for my case.
class LocalJsonFilesPipeline(FilesPipeline):
    FILES_STORE = "json_src"
    FILES_URLS_FIELD = "json_url"
    FILES_RESULT_FIELD = "local_json"

    def __init__(self, store_uri, use_response_url=False, filename_regex=None, settings=None):
        # super(LocalJsonFilesPipeline, self).__init__(store_uri)
        self.store_uri = store_uri
        self.use_response_url = use_response_url
        if filename_regex:
            self.filename_regex = re.compile(filename_regex)
        else:
            self.filename_regex = filename_regex
        super(LocalJsonFilesPipeline, self).__init__(store_uri, settings=settings)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.spider:
            return BasePipeline()
        store_uri = f'{cls.FILES_STORE}/{crawler.spider.name}'
        settings = crawler.spider.settings
        use_response_url = settings.get('JSON_FILENAME_USE_RESPONSE_URL', False)
        filename_regex = settings.get('JSON_FILENAME_REGEX')
        return cls(store_uri, use_response_url, filename_regex, settings)

    def parse_path(self, value):
        if self.filename_regex:
            try:
                return self.filename_regex.findall(value)[0]
            except IndexError:
                pass
        # fallback method in the event no regex is provided by the spider
        # example: /p/russet-potatoes-5lb-bag-good-38-gather-8482/-/A-77775602
        link_path = os.path.splitext(urlparse(value).path)[0]  # omit extension if there is one
        link_params = link_path.rsplit('/', 1)[1]  # preserve the last portion separated by forward-slash (A-77775602)
        return link_params if '=' not in link_params else link_params.split('=', 1)[1]

    def get_media_requests(self, item, info):
        json_url = item.get(self.FILES_URLS_FIELD)
        if json_url:
            filename_url = json_url if not self.use_response_url else item.get('url', '')
            return [Request(json_url, meta={'filename': self.parse_path(filename_url), 'spider': info.spider.name})]

    def file_path(self, request, response=None, info=None):
        final_path = f'{self.FILES_STORE}/{request.meta["spider"]}/{request.meta["filename"]}.json'
        print('url', request.url)
        print('downloading to', final_path)
        return final_path
And the custom settings of my spider
class MockSpider(scrapy.Spider):
    name = 'mock'
    custom_settings = {
        'ITEM_PIPELINES': {
            'mock.pipelines.LocalJsonFilesPipeline': 200
        },
        'JSON_FILENAME_REGEX': r'products\/(.+?)\/ProductInfo\+ProductDetails'
    }
Log with the level set to debug
C:\Users\Mike\Desktop\scrapy_test\pipeline_test>scrapy crawl testsite
2020-07-19 11:23:08 [scrapy.utils.log] INFO: Scrapy 2.2.1 started (bot: pipeline_test)
2020-07-19 11:23:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.6 (tags/v3.7.6:43364a7ae0, Dec 19 2019, 00:42:30) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.0.0 (OpenSSL 1.1.0i 14 Aug 2018), cryptography 2.3.1, Platform Windows-7-6.1.7601-SP1
2020-07-19 11:23:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-07-19 11:23:08 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'pipeline_test',
 'LOG_STDOUT': True,
 'NEWSPIDER_MODULE': 'pipeline_test.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['pipeline_test.spiders']}
2020-07-19 11:23:08 [scrapy.extensions.telnet] INFO: Telnet Password: 0454b083dfd2028a
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-07-19 11:23:08 [scrapy.middleware] INFO: Enabled item pipelines:
['pipeline_test.pipelines.LocalJsonFilesPipeline']
2020-07-19 11:23:08 [scrapy.core.engine] INFO: Spider opened
2020-07-19 11:23:08 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-07-19 11:23:08 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-07-19 11:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.[testsite].com/robots.txt> (referer: None)
2020-07-19 11:23:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://[testsite]/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails> (referer: None)
2020-07-19 11:23:08 [stdout] INFO: url
2020-07-19 11:23:08 [stdout] INFO: https://[testsite]/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails
2020-07-19 11:23:08 [stdout] INFO: downloading to
2020-07-19 11:23:08 [stdout] INFO: json_src/[testsite]/prod6149174-product.json
2020-07-19 11:23:09 [scrapy.core.scraper] DEBUG: Scraped from <200 https://[testsite]/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails>
{'json_url': 'https://[testsite].com/vpd/v1/products/prod6149174-product/ProductInfo+ProductDetails',
 'local_json': [],
 'url': 'https://[testsite].com/store/c/nature-made-super-b-complex,-tablets/ID=prod6149174-product'}
2020-07-19 11:23:09 [scrapy.core.engine] INFO: Closing spider (finished)
2020-07-19 11:23:09 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 506,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 5515,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 0.468001,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 7, 19, 15, 23, 9, 96399),
'item_scraped_count': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 14,
'response_received_count': 2,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2020, 7, 19, 15, 23, 8, 628398)}
2020-07-19 11:23:09 [scrapy.core.engine] INFO: Spider closed (finished)
I finally figured out the issue: the FilesPipeline class does not use a from_crawler method; instead it requires a from_settings method when you want to pass extra parameters to a subclassed/custom FilesPipeline. Below is my working version of the custom FilesPipeline.
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
import os
import re

class LocalFilesPipeline(FilesPipeline):
    FILES_STORE = "data_src"
    FILES_URLS_FIELD = "data_url"
    FILES_RESULT_FIELD = "local_file"

    def __init__(self, settings=None):
        """
        Attributes:
            use_response_url    indicates we want to grab the filename from the response url instead of json_url
            filename_regex      regexes to use for grabbing filenames out of urls
            filename_suffixes   suffixes to append to filenames when there are multiple files to download per item
            filename_extension  the file extension to append to each filename in the file_path function
        """
        self.use_response_url = settings.get('FILENAME_USE_RESPONSE_URL', False)
        self.filename_regex = settings.get('FILENAME_REGEX', [])
        self.filename_suffixes = settings.get('FILENAME_SUFFIXES', [])
        self.filename_extension = settings.get('FILENAME_EXTENSION', 'json')
        if isinstance(self.filename_regex, str):
            self.filename_regex = [self.filename_regex]
        if isinstance(self.filename_suffixes, str):
            self.filename_suffixes = [self.filename_suffixes]
        if self.filename_regex and self.filename_suffixes and len(self.filename_regex) != len(self.filename_suffixes):
            raise ValueError('FILENAME_REGEX and FILENAME_SUFFIXES settings must contain the same number of elements')
        if self.filename_regex:
            for i, f_regex in enumerate(self.filename_regex):
                self.filename_regex[i] = re.compile(f_regex)
        super(LocalFilesPipeline, self).__init__(self.FILES_STORE, settings=settings)

    @classmethod
    def from_settings(cls, settings):
        return cls(settings=settings)

    def parse_path(self, value, index):
        if self.filename_regex:
            try:
                return self.filename_regex[index-1].findall(value)[0]
            except IndexError:
                pass
        # fallback method in the event no regex is provided by the spider
        link_path = os.path.splitext(urlparse(value).path)[0]
        # preserve the last portion separated by forward-slash
        try:
            return link_path.rsplit('/', 1)[1]
        except IndexError:
            return link_path

    def get_media_requests(self, item, info):
        file_urls = item.get(self.FILES_URLS_FIELD)
        requests = []
        if file_urls:
            total_urls = len(file_urls)
            for i, file_url in enumerate(file_urls, 1):
                filename_url = file_url if not self.use_response_url else item.get('url', '')
                filename = self.parse_path(filename_url, i)
                if self.filename_suffixes:
                    current_suffix = self.filename_suffixes[i-1]
                    if current_suffix.startswith('/'):
                        # this will end up creating a separate folder for the different types of files
                        filename += current_suffix
                    else:
                        # this will keep all files in a single folder while still making it easy to differentiate each
                        # type of file. this comes in handy when searching for a file by the base name.
                        filename += f'_{current_suffix}'
                elif total_urls > 1:
                    # default to numbering files sequentially in the order they were added to the item
                    filename += f'_file{i}'
                requests.append(Request(file_url, meta={'spider': info.spider.name, 'filename': filename}))
        return requests

    def file_path(self, request, response=None, info=None):
        return f'{request.meta["spider"]}/{request.meta["filename"]}.{self.filename_extension}'
Then, to utilize the pipeline you can set the applicable values in a spider's custom_settings property
custom_settings = {
    'ITEM_PIPELINES': {
        'spins.pipelines.LocalFilesPipeline': 200
    },
    'FILENAME_REGEX': [r'products\/(.+?)\/ProductInfo\+ProductDetails']
}
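For reference, a stripped-down sketch that isolates the from_settings hook, assuming you only need to pull a couple of custom values out of the settings (the FILENAME_REGEX name and data folder mirror the answer above; everything else here is illustrative):

from scrapy.pipelines.files import FilesPipeline

class MinimalFilesPipeline(FilesPipeline):
    FILES_STORE = "data_src"  # assumption: downloads live under this folder

    def __init__(self, store_uri, filename_regex=None, settings=None):
        self.filename_regex = filename_regex
        super().__init__(store_uri, settings=settings)

    @classmethod
    def from_settings(cls, settings):
        # FilesPipeline is instantiated through from_settings, so custom
        # parameters have to be read here rather than in from_crawler
        return cls(cls.FILES_STORE, settings.get('FILENAME_REGEX'), settings=settings)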

Scrapy Spider not returning anything

I'm very new to the Scrapy library and I'm struggling with my spider. I'm trying to scrape data from this website: https://murderpedia.org/male.A/index.A.htm
What I'm trying to do is, for every link on the page, follow the link and scrape the image as well as the text [rows 3 - 11].
Any help here would be immensely appreciated.
Here is my code:
import scrapy
from scrapy.spiders import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
import re

BASE_URL = 'http://murderpedia.org/'
PROTOCOL = 'https:'

class SerialKillerItem(scrapy.Item):
    name = scrapy.Field()
    bio = scrapy.Field()
    images = scrapy.Field()
    link = scrapy.Field()
    image_urls = scrapy.Field()
    bio_image = scrapy.Field()
    classification = scrapy.Field()
    characteristics = scrapy.Field()
    number_of_victims = scrapy.Field()
    date_of_murders = scrapy.Field()
    date_of_birth = scrapy.Field()
    victims_profile = scrapy.Field()
    method_of_murder = scrapy.Field()
    location = scrapy.Field()
    status = scrapy.Field()

class SerialKillerBio(scrapy.Spider):
    name = 'serial_killer_bio'
    start_urls = ['http://murderpedia.org/male.A/index.A.htm']

    def parse(self, response):
        images = response.css("#AutoNumber3 > tbody > tr:nth-child(2) > td > font:nth-child(1) > div > center > table:nth-child(2) > tbody > tr > td > font > div > table > tbody > tr > td:nth-child(2) > p > img::attr(src)").extract_first()

        for row in response.css('#table4 > tbody'):
            text = {
                'Classification': row.css('tr[3]::text').extract_first(),
                'Characteristics': row.css('tr[4]::text').extract_first(),
                'Number of Victims': row.css('tr[5]::text').extract_first(),
                'Date of Murders': row.css('tr[6]::text').extract_first(),
                'Date of Birth': row.xpath('tr[7]::text').extract_first(),
                'Victims Profile': row.xpath('tr[8] ::text').extract_first(),
                'Method of Murder': row.xpath('tr[9] ::text').extract_first(),
                'Location': row.css('tr[10] ::text').extract_first(),
                'Status': row.css('tr[11] ::text').extract_first()}
            text2 = ''.join(text)
            print(text2)

            if images:
                yield {'text2': SerialKillerItem(classification=name['Classification'],
                                                 characteristics=name['Characteristics'],
                                                 number_of_victims=name['Number of Victims'],
                                                 date_of_murders=name['Date of Murders'],
                                                 date_of_birth=name['Date of Birth'],
                                                 victims_profile=name['Victims Profile'],
                                                 method_of_murder=name['Method of Murder'],
                                                 location=name['Location'],
                                                 status=name['Status']),
                       'image_urls': [PROTOCOL + images][:10]}
            else:
                yield {'text2': SerialKillerItem(classification=name['Classification'],
                                                 characteristics=name['Characteristics'],
                                                 number_of_victims=name['Number of Victims'],
                                                 date_of_murders=name['Date of Murders'],
                                                 date_of_birth=name['Date of Birth'],
                                                 victims_profile=name['Victims Profile'],
                                                 method_of_murder=name['Method of Murder'],
                                                 location=name['Location'],
                                                 status=name['Status']),
                       'image_urls': []}

        for next_page in response.css('#table2 > tbody > tr:nth-child(2) > td > font:nth-child(1) > div > table > tbody > tr > td:nth-child(2) > p > font > font > a::attr(href)').extract():
            print(BASE_URL + next_page)
            yield Request(BASE_URL + next_page, callback=self.parse)
Here is the crawl log:
2018-10-24 21:11:04 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: serial_killers)
2018-10-24 21:11:04 [scrapy.utils.log] INFO: Versions: lxml 4.2.3.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 18.9.0, Python 3.6.5 (default, Apr 25 2018, 14:22:56) - [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.42.1)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Darwin-15.2.0-x86_64-i386-64bit
2018-10-24 21:12:19 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: serial_killers)
2018-10-24 21:12:19 [scrapy.utils.log] INFO: Versions: lxml 4.2.3.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 18.9.0, Python 3.6.5 (default, Apr 25 2018, 14:22:56) - [GCC 4.2.1 Compatible Apple LLVM 8.0.0 (clang-800.0.42.1)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0h 27 Mar 2018), cryptography 2.2.2, Platform Darwin-15.2.0-x86_64-i386-64bit
2018-10-24 21:12:19 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'serial_killers', 'FEED_EXPORT_ENCODING': 'utf-8',
 'HTTPCACHE_ENABLED': True, 'LOG_FILE': 'output.log',
 'NEWSPIDER_MODULE': 'serial_killers.spiders', 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['serial_killers.spiders']}
2018-10-24 21:12:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2018-10-24 21:12:19 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats',
 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware']
2018-10-24 21:12:19 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2018-10-24 21:12:19 [scrapy.middleware] INFO: Enabled item pipelines:
['scrapy.pipelines.images.ImagesPipeline']
2018-10-24 21:12:19 [scrapy.core.engine] INFO: Spider opened
2018-10-24 21:12:19 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2018-10-24 21:12:19 [scrapy.extensions.httpcache] DEBUG: Using filesystem cache storage in /Users/app_10/serial_killers/.scrapy/httpcache
2018-10-24 21:12:19 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-10-24 21:12:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://murderpedia.org/robots.txt> (referer: None) ['cached']
2018-10-24 21:12:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://murderpedia.org/male.A/index.A.htm> (referer: None) ['cached']
2018-10-24 21:12:19 [scrapy.core.engine] INFO: Closing spider (finished)
2018-10-24 21:12:19 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 456,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 29306,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 10, 25, 1, 12, 19, 569830),
'httpcache/hit': 2,
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'memusage/max': 47525888,
'memusage/startup': 47525888,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2018, 10, 25, 1, 12, 19, 415905)}
2018-10-24 21:12:19 [scrapy.core.engine] INFO: Spider closed (finished)
It seems like your crawler is not chained correctly.
Your desired crawl logic is:
1. Go to the A listing page
2. Go to every listed person
3. Parse the HTML of every person
Right now your code is missing step #2.
Let's try this:
class MySpider(Spider):
    name = 'corn-flake-killers'
    start_urls = ['http://murderpedia.org/male.A/index.A.htm']

    def parse(self, response):
        # find the table
        # we can find the table by looking for text and then going up the xml tree
        table = response.xpath('//td[contains(font//font/text(),"Victims")]/../..')
        # find every url in the table
        urls = table.xpath('//a/@href').extract()
        for url in urls:
            # for every url, download the person's page to the parse_person callback
            yield Request(response.urljoin(url), self.parse_person)

    def parse_person(self, response):
        item = {}
        # parse person html here
        yield item
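
If it helps, here is one hypothetical way to flesh out parse_person for the fields listed in the question. The row labels and selectors are assumptions about the page markup, so verify them in scrapy shell before relying on them:

def parse_person(self, response):
    item = {}
    # assumption: each fact sits in a table cell whose label text we can match,
    # with the value in the following cell
    labels = ['Classification', 'Characteristics', 'Number of victims',
              'Date of murders', 'Date of birth', 'Victims profile',
              'Method of murder', 'Location', 'Status']
    for label in labels:
        value = response.xpath(
            '//td[contains(normalize-space(.), "%s")]/following-sibling::td//text()' % label
        ).extract_first()
        item[label] = value.strip() if value else None
    yield item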

pyquery response.body retrieve div elements

I am trying to write a web crawler using Scrapy and PyQuery. The full spider code is as follows.
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class gotspider(CrawlSpider):
    name = 'gotspider'
    allowed_domains = ['fundrazr.com']
    start_urls = ['https://fundrazr.com/find?category=Health']
    rules = [
        Rule(LinkExtractor(allow=('/find/category=Health')), callback='parse', follow=True)
    ]

    def parse(self, response):
        self.logger.info('A response from %s just arrived!', response.url)
        print(response.body)
The web page skeleton
<div id="header">
<h2 class="title"> Township </h2>
<p><strong>Client: </strong> Township<br>
<strong>Location: </strong>Pennsylvania<br>
<strong>Size: </strong>54,000 SF</p>
</div>
Output of the crawler: the crawler fetches the requested URL and hits the correct web target, but the parse_item or parse method does not get the response, and response.url is never printed. I tried to verify this by running the spider without logs (scrapy crawl rsscrach --nolog), but nothing is printed. The problem is very puzzling.
2017-11-26 18:07:12 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: rsscrach)
2017-11-26 18:07:12 [scrapy.utils.log] INFO: Overridden settings: {'BOT_NAME': 'rsscrach', 'NEWSPIDER_MODULE': 'rsscrach.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['rsscrach.spiders']}
2017-11-26 18:07:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2017-11-26 18:07:12 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-11-26 18:07:12 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-11-26 18:07:12 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-11-26 18:07:12 [scrapy.core.engine] INFO: Spider opened
2017-11-26 18:07:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-11-26 18:07:12 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
2017-11-26 18:07:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://fundrazr.com/robots.txt> (referer: None)
2017-11-26 18:07:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://fundrazr.com/find?category=Health> (referer: None)
2017-11-26 18:07:15 [scrapy.core.engine] INFO: Closing spider (finished)
2017-11-26 18:07:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 605,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 13510,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 11, 26, 10, 7, 15, 46516),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'memusage/max': 52465664,
'memusage/startup': 52465664,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 11, 26, 10, 7, 12, 198182)}
2017-11-26 18:07:15 [scrapy.core.engine] INFO: Spider closed (finished)
How do I get the Client, Location and Size attributes?
I made a standalone script with Scrapy which tests different methods of getting the data, and it works without problems. Maybe it helps you find your problem.
import scrapy
import pyquery

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://fundrazr.com/find?category=Health']

    def parse(self, response):
        print('--- css 1 ---')
        for title in response.css('h2'):
            print('>>>', title)

        print('--- css 2 ---')
        for title in response.css('h2'):
            print('>>>', title.extract())  # without _first()
            print('>>>', title.css('a').extract_first())
            print('>>>', title.css('a ::text').extract_first())
            print('-----')

        print('--- css 3 ---')
        for title in response.css('h2 a ::text'):
            print('>>>', title.extract())  # without _first()

        print('--- pyquery 1 ---')
        p = pyquery.PyQuery(response.body)
        for title in p('h2'):
            print('>>>', title, title.text, '<<<')  # `title.text` gives "\n"

        print('--- pyquery 2 ---')
        p = pyquery.PyQuery(response.body)
        for title in p('h2').text():
            print('>>>', title)
        print(p('h2').text())

        print('--- pyquery 3 ---')
        p = pyquery.PyQuery(response.body)
        for title in p('h2 a'):
            print('>>>', title, title.text)

# ---------------------------------------------------------------------

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(MySpider)
process.start()
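
As for Client, Location and Size: a minimal sketch against the #header skeleton posted in the question (assuming the live page really uses that structure; the selectors are easy to verify in scrapy shell first):

def parse(self, response):
    # labels come from the <strong> tags; values are the text nodes between them
    labels = response.css('#header p strong::text').extract()      # ['Client: ', 'Location: ', 'Size: ']
    values = response.xpath('//div[@id="header"]/p/text()').extract()
    values = [v.strip() for v in values if v.strip()]               # drop whitespace-only nodes
    info = dict(zip([l.strip(': ') for l in labels], values))
    # info -> {'Client': 'Township', 'Location': 'Pennsylvania', 'Size': '54,000 SF'}
    yield info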

Error with links in scrapy

I want to scrape some links on a news site and get the full news articles. However, the links are relative.
The news site is http://www.puntal.com.ar/v2/
and the links look like this:
<div class="article-title">
Barros Schelotto: "No somos River y vamos a tratar de pasar a la final"
</div>
then the relative link is "/v2/article.php?id=187222"
My spider is as follows (edit)
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
#from urlparse import urljoin
from scrapy.http.request import Request

try:
    from urllib.parse import urljoin  # Python 3.x
except ImportError:
    from urlparse import urljoin  # Python 2.7

from puntalcomar.items import PuntalcomarItem

class PuntalComArSpider(CrawlSpider):
    name = 'puntal.com.ar'
    allowed_domains = ['http://www.puntal.com.ar/v2/']
    start_urls = ['http://www.puntal.com.ar/v2/']

    rules = (
        Rule(LinkExtractor(allow=('')), callback="parse", follow=True),
    )

    def parse_url(self, response):
        hxs = Selector(response)
        urls = hxs.xpath('//div[@class="article-title"]/a/@href').extract()
        print 'enlace relativo ', urls
        for url in urls:
            urlfull = urljoin('http://www.puntal.com.ar', url)
            print 'enlace completo ', urlfull
            yield Request(urlfull, callback=self.parse_item)

    def parse_item(self, response):
        hxs = Selector(response)
        dates = hxs.xpath('//span[@class="date"]')
        title = hxs.xpath('//div[@class="title"]')
        subheader = hxs.xpath('//div[@class="subheader"]')
        body = hxs.xpath('//div[@class="body"]/p')
        items = []
        for date in dates:
            item = PuntalcomarItem()
            item["date"] = date.xpath('text()').extract()
            item["title"] = title.xpath("text()").extract()
            item["subheader"] = subheader.xpath('text()').extract()
            item["body"] = body.xpath("text()").extract()
            items.append(item)
        return items
But it does not work
I have Linux Mint with Python 2.7.6
Shell:
$ scrapy crawl puntal.com.ar
2016-07-10 13:39:15 [scrapy] INFO: Scrapy 1.1.0 started (bot: puntalcomar)
2016-07-10 13:39:15 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'puntalcomar.spiders', 'SPIDER_MODULES': ['puntalcomar.spiders'], 'ROBOTSTXT_OBEY': True, 'BOT_NAME': 'puntalcomar'}
2016-07-10 13:39:15 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.corestats.CoreStats']
2016-07-10 13:39:15 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-07-10 13:39:15 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-07-10 13:39:15 [scrapy] INFO: Enabled item pipelines:
['puntalcomar.pipelines.XmlExportPipeline']
2016-07-10 13:39:15 [scrapy] INFO: Spider opened
2016-07-10 13:39:15 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-10 13:39:15 [scrapy] DEBUG: Crawled (404) <GET http://www.puntal.com.ar/robots.txt> (referer: None)
2016-07-10 13:39:15 [scrapy] DEBUG: Redirecting (301) to <GET http://www.puntal.com.ar/v2/> from <GET http://www.puntal.com.ar/v2>
2016-07-10 13:39:15 [scrapy] DEBUG: Crawled (200) <GET http://www.puntal.com.ar/v2/> (referer: None)
enlace relativo [u'/v2/article.php?id=187334', u'/v2/article.php?id=187324', u'/v2/article.php?id=187321', u'/v2/article.php?id=187316', u'/v2/article.php?id=187335', u'/v2/article.php?id=187308', u'/v2/article.php?id=187314', u'/v2/article.php?id=187315', u'/v2/article.php?id=187317', u'/v2/article.php?id=187319', u'/v2/article.php?id=187310', u'/v2/article.php?id=187298', u'/v2/article.php?id=187300', u'/v2/article.php?id=187299', u'/v2/article.php?id=187306', u'/v2/article.php?id=187305']
enlace completo http://www.puntal.com.ar/v2/article.php?id=187334
2016-07-10 13:39:15 [scrapy] DEBUG: Filtered offsite request to 'www.puntal.com.ar': <GET http://www.puntal.com.ar/v2/article.php?id=187334>
enlace completo http://www.puntal.com.ar/v2/article.php?id=187324
enlace completo http://www.puntal.com.ar/v2/article.php?id=187321
enlace completo http://www.puntal.com.ar/v2/article.php?id=187316
enlace completo http://www.puntal.com.ar/v2/article.php?id=187335
enlace completo http://www.puntal.com.ar/v2/article.php?id=187308
enlace completo http://www.puntal.com.ar/v2/article.php?id=187314
enlace completo http://www.puntal.com.ar/v2/article.php?id=187315
enlace completo http://www.puntal.com.ar/v2/article.php?id=187317
enlace completo http://www.puntal.com.ar/v2/article.php?id=187319
enlace completo http://www.puntal.com.ar/v2/article.php?id=187310
enlace completo http://www.puntal.com.ar/v2/article.php?id=187298
enlace completo http://www.puntal.com.ar/v2/article.php?id=187300
enlace completo http://www.puntal.com.ar/v2/article.php?id=187299
enlace completo http://www.puntal.com.ar/v2/article.php?id=187306
enlace completo http://www.puntal.com.ar/v2/article.php?id=187305
2016-07-10 13:39:15 [scrapy] INFO: Closing spider (finished)
2016-07-10 13:39:15 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 660,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'downloader/response_bytes': 50497,
'downloader/response_count': 3,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 10, 16, 39, 15, 726952),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'offsite/domains': 1,
'offsite/filtered': 16,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 7, 10, 16, 39, 15, 121104)}
2016-07-10 13:39:15 [scrapy] INFO: Spider closed (finished)
I tried with the absolute links and they are correct, but it still was not working.
That i[1:] is strange and is the key problem. There is no need for slicing:
def parse(self, response):
    urls = response.xpath('//div[@class="article-title"]/a/@href').extract()
    for url in urls:
        yield Request(urlparse.urljoin(response.url, url), callback=self.parse_url)
Note that I've also fixed the XPath expression - it needs to start with // to look for the div elements at any level in the DOM tree.
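As a side note, since the log shows Scrapy 1.1.0, response.urljoin(url) is available and does the same join relative to the page that was crawled:

def parse(self, response):
    for url in response.xpath('//div[@class="article-title"]/a/@href').extract():
        yield Request(response.urljoin(url), callback=self.parse_url)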

Scrapy outputs [ into my .json file

A genuine Scrapy and Python noob here, so please be patient with any silly mistakes. I'm trying to write a spider to recursively crawl a news site and return the headline, date, and first paragraph of each article. I managed to crawl a single page for one item, but the moment I try to expand beyond that it all goes wrong.
My spider:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from basic.items import BasicItem

class BasicSpiderSpider(CrawlSpider):
    name = "basic_spider"
    allowed_domains = ["news24.com/"]
    start_urls = (
        'http://www.news24.com/SouthAfrica/News/56-children-hospitalised-for-food-poisoning-20150328',
    )

    rules = (Rule(SgmlLinkExtractor(allow=("", )), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = Selector(response)
        titles = hxs.xpath('//*[@id="aspnetForm"]')
        items = []
        item = BasicItem()
        item['Headline'] = titles.xpath('//*[@id="article_special"]//h1/text()').extract()
        item["Article"] = titles.xpath('//*[@id="article-body"]/p[1]/text()').extract()
        item["Date"] = titles.xpath('//*[@id="spnDate"]/text()').extract()
        items.append(item)
        return items
I am still getting the same problem, though I have noticed that there is a "[" every time I try to run the spider. To try and figure out what the issue is, I have run the following command:
c:\Scrapy Spiders\basic>scrapy parse --spider=basic_spider -c parse_items -d 2 -v http://www.news24.com/SouthAfrica/News/56-children-hospitalised-for-food-poisoning-20150328
which gives me the following output:
2015-03-30 15:28:21+0200 [scrapy] INFO: Scrapy 0.24.5 started (bot: basic)
2015-03-30 15:28:21+0200 [scrapy] INFO: Optional features available: ssl, http11
2015-03-30 15:28:21+0200 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'basic.spiders', 'SPIDER_MODULES': ['basic.spiders'], 'DEPTH_LIMIT': 1, 'DOWNLOAD_DELAY': 2, 'BOT_NAME': 'basic'}
2015-03-30 15:28:21+0200 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2015-03-30 15:28:21+0200 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-03-30 15:28:21+0200 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-03-30 15:28:21+0200 [scrapy] INFO: Enabled item pipelines:
2015-03-30 15:28:21+0200 [basic_spider] INFO: Spider opened
2015-03-30 15:28:21+0200 [basic_spider] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-03-30 15:28:21+0200 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-03-30 15:28:21+0200 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2015-03-30 15:28:22+0200 [basic_spider] DEBUG: Crawled (200) <GET http://www.news24.com/SouthAfrica/News/56-children-hospitalised-for-food-poisoning-20150328> (referer: None)
2015-03-30 15:28:22+0200 [basic_spider] INFO: Closing spider (finished)
2015-03-30 15:28:22+0200 [basic_spider] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 282,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 145301,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 3, 30, 13, 28, 22, 177000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 3, 30, 13, 28, 21, 878000)}
2015-03-30 15:28:22+0200 [basic_spider] INFO: Spider closed (finished)
>>> DEPTH LEVEL: 1 <<<
# Scraped Items ------------------------------------------------------------
[{'Article': [u'Johannesburg - Fifty-six children were taken to\nPietermaritzburg hospitals after showing signs of food poisoning while at\nschool, KwaZulu-Natal emergency services said on Friday.'],
'Date': [u'2015-03-28 07:30'],
'Headline': [u'56 children hospitalised for food poisoning']}]
# Requests -----------------------------------------------------------------
[]
So I can see that the item is being scraped, but no usable item data is put into the JSON file. This is how I'm running Scrapy:
scrapy crawl basic_spider -o test.json
I've been looking at the last line (return items); changing it to either yield or print still gives me no items scraped in the parse output.
This usually means nothing was scraped, no items were extracted.
In your case, fix your allowed_domains setting:
allowed_domains = ["news24.com"]
Aside from that, just a bit of cleaning up from a perfectionist:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class BasicSpiderSpider(CrawlSpider):
    name = "basic_spider"
    allowed_domains = ["news24.com"]
    start_urls = [
        'http://www.news24.com/SouthAfrica/News/56-children-hospitalised-for-food-poisoning-20150328',
    ]
    rules = [
        Rule(LinkExtractor(), callback="parse_items", follow=True),
    ]

    def parse_items(self, response):
        for title in response.xpath('//*[@id="aspnetForm"]'):
            item = BasicItem()
            item['Headline'] = title.xpath('//*[@id="article_special"]//h1/text()').extract()
            item["Article"] = title.xpath('//*[@id="article-body"]/p[1]/text()').extract()
            item["Date"] = title.xpath('//*[@id="spnDate"]/text()').extract()
            yield item
