scrapy - a weird bug where the pipeline is never called - python

I wrote a small spider, but when I run it the pipeline is never called.
After debugging for a while, I found the area of code that causes the bug.
The logic of the spider is: I crawl the first url to fetch a cookie, then I crawl the second url to download the CAPTCHA picture using that cookie, and I post some data I prepared to the third url. If the text I read from the picture is wrong, I download the picture again and post to the third url, repeatedly, until I get the right text.
Let me show you the code:
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem
class GetTeacherCourseSpider(scrapy.Spider):
name = 'TeacherCourse'
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'teacherCourse.pipelines.TeacherCoursePipeline': 300,
# }
# }
def __init__(self, selXNXQ='', titleCode=''):
self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
self.findSessionId = None # to save the cookies
self.XNXQ = selXNXQ
self.titleCode = titleCode
def start_requests(self):
request = scrapy.Request(self.getUrl,
callback = self.downloadPic)
yield request
def downloadPic(self, response):
# download the picture
# find the session id
self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
request = scrapy.Request(self.vcodeUrl,
cookies= {self.findSessionId[0]: self.findSessionId[1]},
callback = self.getAndHandleYzm)
yield request
def getAndHandleYzm(self, response):
yzm = handle(response.body)
yield FormRequest(self.postUrl,
formdata={'Sel_XNXQ': '20151',
'sel_zc': '011',
'txt_yzm': yzm,
'type': '2'},
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
},
callback=self.parse)
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
self.parseData(body)
# item = containItem()
# item['first'] = len(body)
# return item
# the parse-data part is a little bit long, but it doesn't matter.
# at the last line, I do yield an item
def parseData(self, body):
# parse body data
sel = Selector(text=body)
# get all the note text data
noteTables = sel.xpath('//table[@style="border:0px;"]').extract()
noteList = [] # to store all the note text
for noteTable in noteTables:
if '<b>' in noteTable:
sele = Selector(text = noteTable)
note = (sele.xpath('//table/tr/td/b/text()').extract())
noteText = (sele.xpath('//table/tr/td/text()').extract())
# combine note and noteText
if not noteText:
noteText.append('')
noteText.append('')
else:
if len(noteText) == 1:
noteText.append('')
noteList.append(noteText)
# get all the course data
courseTables = sel.xpath('//table[@class="page_table"]/tbody').extract()
AllDetailCourse = [] # all the teachers' course
for table in courseTables:
everyTeacherC = [] # every teacher's course
s = Selector(text = table)
trs = s.xpath('//tr').extract()
for tr in trs:
sel = Selector(text = tr)
snum = (sel.xpath('//td[1]/text()').extract())
course = (sel.xpath('//td[2]/text()').extract())
credit = (sel.xpath('//td[3]/text()').extract())
teachWay = (sel.xpath('//td[4]/text()').extract())
courseType = (sel.xpath('//td[5]/text()').extract())
classNum = (sel.xpath('//td[6]/text()').extract())
className = (sel.xpath('//td[7]/text()').extract())
stuNum = (sel.xpath('//td[8]/text()').extract())
week = (sel.xpath('//td[9]/text()').extract())
section = (sel.xpath('//td[10]/text()').extract())
location = (sel.xpath('//td[11]/text()').extract())
tmpList = []
tmpList.append(snum)
tmpList.append(course)
tmpList.append(credit)
tmpList.append(teachWay)
tmpList.append(courseType)
tmpList.append(classNum)
tmpList.append(className)
tmpList.append(stuNum)
tmpList.append(week)
tmpList.append(section)
tmpList.append(location)
# to know whether every variable is empty
detailCourse = []
for each in tmpList:
if not each:
each = ''
else:
each = each[0]
detailCourse.append(each)
everyTeacherC.append(detailCourse)
AllDetailCourse.append(everyTeacherC)
# get department, teacher, gender and title
sel = Selector(text = body)
temp1 = sel.xpath('//*[@group="group"]/table/tr/td/text()').extract()
# fill two tables, which will be stored in the database
i = 0
# every professor
for each in temp1:
tables = containItem() # all the data in every for loop to send to the pipeline
each = each.replace(u'\xa0', u' ')
each = each.split(' ')
depart = each[0].split('£º')
teacher = each[1].split('£º')
gender = each[2].split('£º')
title = each[3].split('£º')
# first table
profItem = DetailProfItem()
profItem['XNXQ'] = self.XNXQ
profItem['department'] = depart[1] # department
profItem['teacher'] = teacher[1] # teacher
profItem['gender'] = gender[1]
profItem['title'] = title[1]
profItem['note1'] = noteList[i][0]
profItem['note2'] = noteList[i][1]
tables['first'] = profItem # add the first table
# second table
# every professor's courses
profCourses = []
for j in range(len(AllDetailCourse[i])): # how many course for every professor
profCourseItem = DetailProfCourseItem() # every course for every professor
profCourseItem['snum'] = AllDetailCourse[i][j][0] # i means the i-th professor, j means the j-th course, and the third index is the position of the field within the course
profCourseItem['course'] = AllDetailCourse[i][j][1]
profCourseItem['credit'] = AllDetailCourse[i][j][2]
profCourseItem['teachWay'] = AllDetailCourse[i][j][3]
profCourseItem['courseType'] = AllDetailCourse[i][j][4]
profCourseItem['classNum'] = AllDetailCourse[i][j][5]
profCourseItem['className'] = AllDetailCourse[i][j][6]
profCourseItem['stuNum'] = AllDetailCourse[i][j][7]
profCourseItem['week'] = AllDetailCourse[i][j][8]
profCourseItem['section'] = AllDetailCourse[i][j][9]
profCourseItem['location'] = AllDetailCourse[i][j][10]
profCourses.append(profCourseItem) # every professor's courses
tables['second'] = profCourseItem # add the second table
i += 1
yield tables
Any suggestions would be appreciated!
settings.py: (pipeline part)
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'teacherCourse.pipelines.TeacherCoursePipeline': 300,
}
items.py: (I don't think it matters)
# detail professor course message
class DetailProfCourseItem(scrapy.Item):
snum = scrapy.Field() # serial number
course = scrapy.Field()
credit = scrapy.Field()
teachWay = scrapy.Field()
courseType = scrapy.Field()
classNum = scrapy.Field()
className = scrapy.Field()
stuNum = scrapy.Field()
week = scrapy.Field()
section = scrapy.Field()
location = scrapy.Field()
# the third item, which contains the first and second items
class containItem(scrapy.Item):
first = scrapy.Field() # for first table
second = scrapy.Field() # for second table
pipeline code:
class TeacherCoursePipeline(object):
def process_item(self, item, spider):
print('I am called!!!!!')
print(item)
return item
And when I run the spider with scrapy crawl TeacherCourse,
it outputs:
2016-07-19 17:39:18 [scrapy] INFO: Scrapy 1.1.0rc1 started (bot: teacherCourse)
2016-07-19 17:39:18 [scrapy] INFO: Overridden settings: {'BOT_NAME': 'teacherCourse', 'NEWSPIDER_MODULE': 'teacherCourse.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['teacherCourse.spiders']}
2016-07-19 17:39:18 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-07-19 17:39:18 [scrapy] INFO: Enabled item pipelines:
['teacherCourse.pipelines.TeacherCoursePipeline']
2016-07-19 17:39:18 [scrapy] INFO: Spider opened
2016-07-19 17:39:18 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (404) <GET http://jwxt.dgut.edu.cn/robots.txt> (referer: None)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx> (referer: None)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <POST http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] INFO: Closing spider (finished)
2016-07-19 17:39:19 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1330,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 230886,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 19, 9, 39, 19, 861620),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 4,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2016, 7, 19, 9, 39, 18, 774293)}
2016-07-19 17:39:19 [scrapy] INFO: Spider closed (finished)

The problem seems to be that the parse method only yields scrapy.Request objects, never scrapy.Item instances.
The else: branch calls the generator parseData(body) but doesn't use the data it can produce (namely containItem objects).
One way to solve this is to loop over the generator's results and yield them one by one:
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
for i in self.parseData(body):
yield i
# item = containItem()
# item['first'] = len(body)
# return item
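As a side note, on Python 3.3+ the explicit loop can be replaced with yield from, which delegates to the parseData generator directly. A minimal sketch of the same method with the CAPTCHA branch elided:

def parse(self, response):
    body = response.body.decode('gbk')
    if body.find('alert') != -1:
        ...  # re-request the CAPTCHA exactly as in the version above
    else:
        # delegate to the generator: every containItem it yields is passed
        # straight on to the engine, and therefore to the item pipeline
        yield from self.parseData(body)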

Related

Scrapy not working (noob level) - 0 pages crawled 0 items crawled

I've been trying to follow the Scrapy tutorial but I'm stuck and have no idea where the mistake is.
It runs, but no items are crawled.
I get the following output:
C:\Users\xxx\allegro>scrapy crawl AllegroPrices
2017-12-10 22:25:14 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: AllegroPrices)
2017-12-10 22:25:14 [scrapy.utils.log] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'allegro.spiders', 'SPIDER_MODULES': ['allegro.spiders'], 'ROBOTSTXT_OBEY': True, 'LOG_LEVEL': 'INFO', 'BOT_NAME': 'AllegroPrices'}
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'allegro.middlewares.AllegroSpiderMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled item pipelines:
['allegro.pipelines.AllegroPipeline']
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Spider opened
2017-12-10 22:25:15 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-12-10 22:25:15 [AllegroPrices] INFO: Spider opened: AllegroPrices
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Closing spider (finished)
2017-12-10 22:25:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 12, 10, 21, 25, 15, 527000),
'log_count/INFO': 8,
'start_time': datetime.datetime(2017, 12, 10, 21, 25, 15, 517000)}
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Spider closed (finished)
My spider file:
# -*- coding: utf-8 -*-
import scrapy
from allegro.items import AllegroItem
class AllegroPrices(scrapy.Spider):
name = "AllegroPrices"
allowed_domains = ["allegro.pl"]
#Use working product URL below
start_urls = [
"http://allegro.pl/diablo-ii-lord-of-destruction-2-pc-big-box-eng-i6896736152.html", "http://allegro.pl/diablo-ii-2-pc-dvd-box-eng-i6961686788.html",
"http://allegro.pl/star-wars-empire-at-war-2006-dvd-box-i6995651106.html", "http://allegro.pl/heavy-gear-ii-2-pc-eng-cdkingpl-i7059163114.html"
]
def parse(self, response):
items = AllegroItem()
title = response.xpath('//h1[@class="title"]//text()').extract()
sale_price = response.xpath('//div[@class="price"]//text()').extract()
seller = response.xpath('//div[@class="btn btn-default btn-user"]/span/text()').extract()
items['product_name'] = ''.join(title).strip()
items['product_sale_price'] = ''.join(sale_price).strip()
items['product_seller'] = ''.join(seller).strip()
yield items
Settings:
# -*- coding: utf-8 -*-
# Scrapy settings for allegro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'AllegroPrices'
SPIDER_MODULES = ['allegro.spiders']
NEWSPIDER_MODULE = 'allegro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'allegro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
'allegro.middlewares.AllegroSpiderMiddleware': 543,
}
LOG_LEVEL = 'INFO'
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'allegro.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'allegro.pipelines.AllegroPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Pipeline:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class AllegroPipeline(object):
def process_item(self, item, spider):
return item
Items:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class AllegroItem(scrapy.Item):
# define the fields for your item here like:
product_name = scrapy.Field()
product_sale_price = scrapy.Field()
product_seller = scrapy.Field()
I have no problem running it as a standalone script, without creating a project, and saving to a CSV file.
And I didn't have to change the USER-AGENT.
Maybe there is a problem with some of your settings. You didn't put the url of the tutorial in your question, so I can't check it.
Or you simply have wrong indentation and start_urls and parse() are not inside the class. Indentation is very important in Python.
BTW: you forgot /a/ in the xpath for the seller.
import scrapy
#class AllegroItem(scrapy.Item):
# product_name = scrapy.Field()
# product_sale_price = scrapy.Field()
# product_seller = scrapy.Field()
class AllegroPrices(scrapy.Spider):
name = "AllegroPrices"
allowed_domains = ["allegro.pl"]
start_urls = [
"http://allegro.pl/diablo-ii-lord-of-destruction-2-pc-big-box-eng-i6896736152.html",
"http://allegro.pl/diablo-ii-2-pc-dvd-box-eng-i6961686788.html",
"http://allegro.pl/star-wars-empire-at-war-2006-dvd-box-i6995651106.html",
"http://allegro.pl/heavy-gear-ii-2-pc-eng-cdkingpl-i7059163114.html"
]
def parse(self, response):
title = response.xpath('//h1[@class="title"]//text()').extract()
sale_price = response.xpath('//div[@class="price"]//text()').extract()
seller = response.xpath('//div[@class="btn btn-default btn-user"]/a/span/text()').extract()
title = title[0].strip()
print(title, sale_price, seller)
yield {'title': title, 'price': sale_price, 'seller': seller}
#items = AllegroItem()
#items['product_name'] = ''.join(title).strip()
#items['product_sale_price'] = ''.join(sale_price).strip()
#items['product_seller'] = ''.join(seller).strip()
#yield items
# --- run it as standalone script without project and save in CSV ---
from scrapy.crawler import CrawlerProcess
#c = CrawlerProcess()
c = CrawlerProcess({
# 'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv'
})
c.crawl(AllegroPrices)
c.start()
Result in CSV file:
title,price,seller
STAR WARS: EMPIRE AT WAR [2006] DVD BOX,"24,90 zł",CDkingpl
DIABLO II: LORD OF DESTRUCTION 2 PC BIG BOX ENG,"149,00 zł",CDkingpl
HEAVY GEAR II 2 | PC ENG CDkingpl,"19,90 zł",CDkingpl
DIABLO II 2 | PC DVD BOX | ENG,"24,90 zł",CDkingpl

Python / Scrapy: CrawlSpider stops after fetching start_urls

I have wasted days trying to get my mind around Scrapy, reading the docs and other Scrapy blogs and Q&As ... and now I am about to do what men hate most: ask for directions ;-) The problem is: my spider opens, fetches the start_urls, but apparently does nothing with them. Instead it closes immediately, and that was that. Apparently, I do not even get to the first self.log() statement.
What I've got so far is this:
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.shell import inspect_response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse, FormRequest, Request
from KiPieSpider.items import *
from KiPieSpider.settings import *
class KiSpider(CrawlSpider):
name = "KiSpider"
allowed_domains = ['www.kiweb.de', 'kiweb.de']
start_urls = (
# ST Regra start page:
'https://www.kiweb.de/default.aspx?pageid=206',
# follow ST Regra links in the form of:
# https://www.kiweb.de/default.aspx?pageid=206&page=\d+
# https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
# ST Thermo start page:
'https://www.kiweb.de/default.aspx?pageid=202&page=1',
# follow ST Thermo links in the form of:
# https://www.kiweb.de/default.aspx?pageid=202&page=\d+
# https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
)
rules = (
# First rule that matches a given link is followed / parsed.
# Follow category pagination without further parsing:
Rule(
LinkExtractor(
# Extract links in the form:
allow=r'Default\.aspx?pageid=(202|206])&page=\d+',
# but only within the pagination table cell:
restrict_xpaths=('//td[@id="ctl04_teaser_next"]'),
),
follow=True,
),
# Follow links to category (202|206) articles and parse them:
Rule(
LinkExtractor(
# Extract links in the form:
allow=r'Default\.aspx?pageid=299&docid=\d+',
# but only within article preview cells:
restrict_xpaths=("//td[@class='TOC-zelle TOC-text']"),
),
# and parse the resulting pages for article content:
callback='parse_init',
follow=False,
),
)
# Once an article page is reached, check whether a login is necessary:
def parse_init(self, response):
self.log('Parsing article: %s' % response.url)
if not response.xpath('input[@value="Logout"]'):
# Note: response.xpath() is a shortcut of response.selector.xpath()
self.log('Not logged in. Logging in...\n')
return self.login(response)
else:
self.log('Already logged in. Continue crawling...\n')
return self.parse_item(response)
def login(self, response):
self.log("Trying to log in...\n")
self.username = self.settings['KI_USERNAME']
self.password = self.settings['KI_PASSWORD']
return FormRequest.from_response(
response,
formname='Form1',
formdata={
# needs name, not id attributes!
'ctl04$Header$ctl01$textbox_username': self.username,
'ctl04$Header$ctl01$textbox_password': self.password,
'ctl04$Header$ctl01$textbox_logindaten_typ': 'Username_Passwort',
'ctl04$Header$ctl01$checkbox_permanent': 'True',
},
callback = self.parse_item,
)
def parse_item(self, response):
articles = response.xpath('//div[@id="artikel"]')
items = []
for article in articles:
item = KiSpiderItem()
item['link'] = response.url
item['title'] = articles.xpath("div[@class='ct1']/text()").extract()
item['subtitle'] = articles.xpath("div[@class='ct2']/text()").extract()
item['article'] = articles.extract()
item['published'] = articles.xpath("div[@class='biblio']/text()").re(r"(\d{2}.\d{2}.\d{4}) PIE")
item['artid'] = articles.xpath("div[@class='biblio']/text()").re(r"PIE \[(d+)-\d+\]")
item['lang'] = 'de-DE'
items.append(item)
# return(items)
yield items
# what is the difference between return and yield?? found both on web.
When doing scrapy crawl KiSpider, this results in:
2017-03-09 18:03:33 [scrapy.utils.log] INFO: Scrapy 1.3.2 started (bot: KiPieSpider)
2017-03-09 18:03:33 [scrapy.utils.log] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'KiPieSpider.spiders', 'DEPTH_LIMIT': 3, 'CONCURRENT_REQUESTS': 8, 'SPIDER_MODULES': ['KiPieSpider.spiders'], 'BOT_NAME': 'KiPieSpider', 'DOWNLOAD_TIMEOUT': 60, 'USER_AGENT': 'KiPieSpider (info@defrent.de)', 'DOWNLOAD_DELAY': 0.25}
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-03-09 18:03:33 [scrapy.core.engine] INFO: Spider opened
2017-03-09 18:03:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-03-09 18:03:33 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-03-09 18:03:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.kiweb.de/default.aspx?pageid=206> (referer: None)
2017-03-09 18:03:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.kiweb.de/default.aspx?pageid=202&page=1> (referer: None)
2017-03-09 18:03:34 [scrapy.core.engine] INFO: Closing spider (finished)
2017-03-09 18:03:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 465,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 48998,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 3, 9, 17, 3, 34, 235000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2017, 3, 9, 17, 3, 33, 295000)}
2017-03-09 18:03:34 [scrapy.core.engine] INFO: Spider closed (finished)
Is it that the login routine should not end with a callback, but with some kind of return/yield statement? Or what am I doing wrong? Unfortunately, the docs and tutorials I have seen so far only give me a vague idea of how every bit connects to the others; Scrapy's docs in particular seem to be written as a reference for people who already know a lot about Scrapy.
Somewhat frustrated greetings
Christopher
rules = (
# First rule that matches a given link is followed / parsed.
# Follow category pagination without further parsing:
Rule(
LinkExtractor(
# Extract links in the form:
# allow=r'Default\.aspx?pageid=(202|206])&page=\d+',
# but only within the pagination table cell:
restrict_xpaths=('//td[@id="ctl04_teaser_next"]'),
),
follow=True,
),
# Follow links to category (202|206) articles and parse them:
Rule(
LinkExtractor(
# Extract links in the form:
# allow=r'Default\.aspx?pageid=299&docid=\d+',
# but only within article preview cells:
restrict_xpaths=("//td[@class='TOC-zelle TOC-text']"),
),
# and parse the resulting pages for article content:
callback='parse_init',
follow=False,
),
)
You do not need the allow parameter, because there is only one link in the table cell selected by the XPath.
I do not understand the regex in the allow parameter, but at the very least you should escape the ?.
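For reference, a corrected pagination rule might look roughly like the sketch below. This is only an illustration: it assumes the real links contain default.aspx?pageid=...&page=..., so the pattern's spelling and case should be checked against the actual site.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# sketch: '?' and '.' are escaped so they are matched literally,
# and the stray ']' after 206 is removed
pagination_rule = Rule(
    LinkExtractor(
        allow=r'default\.aspx\?pageid=(202|206)&page=\d+',
        restrict_xpaths='//td[@id="ctl04_teaser_next"]',
    ),
    follow=True,
)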

Scrapy spider close prematurely

I've programmed Scrapy to scrape a couple of thousand url links that I've stored in a database. The spider calls scrapy.Request, passing it a url from the database. However, after scraping 1-2 pages the spider closes prematurely (without an error). I don't know why this happens.
Code:
# -*- coding: utf-8 -*-
import scrapy
import olsDBUtil
import tokopediautil
from datetime import datetime
import time
import logging
from scrapy.utils.log import configure_logging
class DataproductSpider(scrapy.Spider):
dbObj = olsDBUtil.olsDBUtil()
name = "dataProduct"
allowed_domains = ["tokopedia.com"]
newProductLink = list(dbObj.getNewProductLinks(10))
start_urls = list(newProductLink.pop())
# start_urls = dbObj.getNewProductLinks(NumOfLinks=2)
tObj = tokopediautil.TokopediaUtil()
configure_logging(install_root_handler=False)
logging.basicConfig(
filename='log.txt',
format='%(levelname)s: %(message)s',
level=logging.INFO
)
def parse(self, response):
if response.status == 200:
thisIsProductPage = response.selector.xpath("/html/head/meta[@property='og:type']/@content").extract()[
0] == 'product'
if thisIsProductPage:
vProductID = self.dbObj.getProductIDbyURL(response.url)
vProductName = \
response.selector.xpath("//input[@type='hidden'][@name='product_name']/@value").extract()[0]
vProductDesc = response.selector.xpath("//p[@itemprop='description']/text()").extract()[0]
vProductPrice = \
response.selector.xpath("/html/head/meta[@property='product:price:amount']/@content").extract()[0]
vSiteProductID = \
response.selector.xpath("//input[@type='hidden'][@name='product_id']/@value").extract()[0]
vProductCategory = response.selector.xpath("//ul[@itemprop='breadcrumb']//text()").extract()[1:-1]
vProductCategory = ' - '.join(vProductCategory)
vProductUpdated = \
response.selector.xpath("//small[@class='product-pricelastupdated']/i/text()").extract()[0][26:36]
vProductUpdated = datetime.strptime(vProductUpdated, '%d-%M-%Y')
vProductVendor = response.selector.xpath("//a[@id='shop-name-info']/text()").extract()[0]
vProductStats = self.tObj.getItemSold(vSiteProductID)
vProductSold = vProductStats['item_sold']
vProductViewed = self.tObj.getProductView(vSiteProductID)
vSpecificPortalData = "item-sold - %s , Transaction Sucess - %s , Transaction Rejected - %s " % (
vProductStats['item_sold'], vProductStats['success'], vProductStats['reject'])
print "productID : " + str(vProductID)
print "product Name : " + vProductName
print "product Desc : " + vProductDesc
print "Product Price : " + str(vProductPrice)
print "Product SiteID : " + str(vSiteProductID)
print "Category : " + vProductCategory
print "Product Updated: " + vProductUpdated.strftime('%Y-%m-%d')
print "Product Vendor : " + vProductVendor
print "Product Sold : " + str(vProductSold)
print "Product Viewed : " + str(vProductViewed)
print "Site Specific Info: " + vSpecificPortalData
self.dbObj.storeNewProductData(
productID=vProductID,
productName=vProductName,
productPrice=vProductPrice,
productSiteProdID=vSiteProductID,
productVendor=vProductVendor,
productDesc=vProductDesc,
productQtyDilihat=vProductViewed,
productTerjual=vProductSold,
productCategory=vProductCategory,
productSiteSpecificInfo=vSpecificPortalData
)
self.dbObj.storeProductRunningData(
productID=vProductID,
productDilihat=str(vProductViewed),
productTerjual=str(vProductSold)
)
else:
print "Error Logged : Page Call Error"
LinkText = str(self.newProductLink.pop())
print "LinkText : %s" % LinkText
print "Total newProductLink is %s" % str(len(self.newProductLink))
yield scrapy.Request(url=LinkText, callback=self.parse)
Here's the Scrapy log:
INFO: Scrapy 1.3.0 started (bot: tokopedia)
INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tokopedia.spiders', 'HTTPCACHE_EXPIRATION_SECS': 1800, 'SPIDER_MODULES': ['tokopedia.spiders'], 'HTTPCACHE_ENABLED': True, 'BOT_NAME': 'tokopedia', 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats',
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware']
INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
INFO: Enabled item pipelines:
[]
INFO: Spider opened
INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
DEBUG: Telnet console listening on 127.0.0.1:6023
DEBUG: Crawled (200) <GET https://www.tokopedia.com/karmedia/penjelasan-pembatal-keislaman> (referer: None)
DEBUG: Starting new HTTPS connection (1): js.tokopedia.com
DEBUG: https://js.tokopedia.com:443 "GET /productstats/check?pid=27455429 HTTP/1.1" 200 61
DEBUG: Starting new HTTPS connection (1): www.tokopedia.com
DEBUG: https://www.tokopedia.com:443 "GET /provi/check?pid=27455429&callback=show_product_view HTTP/1.1" 200 31
INFO: Closing spider (finished)
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 333,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 20815,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 2, 10, 18, 4, 10, 355000),
'httpcache/firsthand': 1,
'httpcache/miss': 1,
'httpcache/store': 1,
'log_count/DEBUG': 6,
'log_count/INFO': 7,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 2, 10, 18, 4, 8, 922000)}
INFO: Spider closed (finished)
I changed the scrapy.Request call to use an absolute url link for the next product, and it worked. I don't understand why this happens; somehow the list.pop() statement doesn't work, even though I've changed it to a string.
Try dont_filter=True in your scrapy.Request(). I was having a similar issue where the duplicate filter caused my spider (also using pop()) to close prematurely, and I see you have an 'offsite/filtered': 1 in your stats that may indicate a filtering issue.
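Applied to the request at the end of parse() above, that suggestion is roughly the following change (a sketch only; LinkText still comes from the spider's own newProductLink list):

# last lines of parse(), with filtering disabled for the follow-up request
LinkText = str(self.newProductLink.pop())
yield scrapy.Request(
    url=LinkText,
    callback=self.parse,
    # dont_filter=True bypasses the duplicate filter and the offsite filter,
    # so the next request is never silently dropped
    dont_filter=True,
)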

scrapy - spider module def functions not getting invoked

My intention is to invoke the start_requests method to log in to the website and, after login, scrape the website. But based on the log messages, I see that:
1. start_requests is not invoked.
2. The parse callback function is also not invoked.
What's actually happening is that the spider only loads the urls in start_urls.
Question:
Why is the spider not crawling through the other pages (say page 2, 3, 4)?
Why is logging from the spider not working?
Note:
My method to calculate the page numbers and create the urls is correct. I verified it.
I referred to this link to write this code: Using loginform with scrapy
My code:
zauba.py (spider)
#!/usr/bin/env python
from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from loginform import fill_login_form
import logging
logger = logging.getLogger('Zauba')
class zauba(CrawlSpider):
name = 'Zauba'
login_url = 'https://www.zauba.com/user'
login_user = 'scrapybot1@gmail.com'
login_password = 'scrapybot1'
logger.info('zauba')
start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']
def start_requests(self):
logger.info('start_request')
# let's start by sending a first request to login page
yield scrapy.Request(self.login_url, callback = self.parse_login)
def parse_login(self, response):
logger.warning('parse_login')
# got the login page, let's fill the login form...
data, url, method = fill_login_form(response.url, response.body,
self.login_user, self.login_password)
# ... and send a request with our login data
return FormRequest(url, formdata=dict(data),
method=method, callback=self.start_crawl)
def start_crawl(self, response):
logger.warning('start_crawl')
# OK, we're in, let's start crawling the protected pages
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
logger.info('parse')
text = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div[@style="width:920px; margin-bottom:12px;"]/span/text()').extract_first()
total_entries = int(text.split()[0].replace(',', ''))
total_pages = int(math.ceil((total_entries*1.0)/30))
logger.warning('*************** : ' + total_pages)
print('*************** : ' + total_pages)
for page in xrange(1, (total_pages + 1)):
url = 'https://www.zauba.com/import-gold/p-' + page +'-hs-code.html'
log.msg('url%d : %s' % (pages,url))
yield scrapy.Request(url, callback=self.extract_entries)
def extract_entries(self, response):
logger.warning('extract_entries')
row_trs = response.xpath('//div[@id="block-system-main"]/div[@class="content"]/div/table/tr')
for row_tr in row_trs[1:]:
row_content = row_tr.xpath('.//td/text()').extract()
if (row_content.__len__() == 9):
print row_content
yield {
'date' : row_content[0].replace(' ', ''),
'hs_code' : int(row_content[1]),
'description' : row_content[2],
'origin_country' : row_content[3],
'port_of_discharge' : row_content[4],
'unit' : row_content[5],
'quantity' : int(row_content[6].replace(',', '')),
'value_inr' : int(row_content[7].replace(',', '')),
'per_unit_inr' : int(row_content[8].replace(',', '')),
}
loginform.py
#!/usr/bin/env python
import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html
__version__ = '1.0' # also update setup.py
def _form_score(form):
score = 0
# In case of user/pass or user/pass/remember-me
if len(form.inputs.keys()) in (2, 3):
score += 10
typecount = defaultdict(int)
for x in form.inputs:
type_ = (x.type if isinstance(x, html.InputElement) else 'other'
)
typecount[type_] += 1
if typecount['text'] > 1:
score += 10
if not typecount['text']:
score -= 10
if typecount['password'] == 1:
score += 10
if not typecount['password']:
score -= 10
if typecount['checkbox'] > 1:
score -= 10
if typecount['radio']:
score -= 10
return score
def _pick_form(forms):
"""Return the form most likely to be a login form"""
return sorted(forms, key=_form_score, reverse=True)[0]
def _pick_fields(form):
"""Return the most likely field names for username and password"""
userfield = passfield = emailfield = None
for x in form.inputs:
if not isinstance(x, html.InputElement):
continue
type_ = x.type
if type_ == 'password' and passfield is None:
passfield = x.name
elif type_ == 'text' and userfield is None:
userfield = x.name
elif type_ == 'email' and emailfield is None:
emailfield = x.name
return (userfield or emailfield, passfield)
def submit_value(form):
"""Returns the value for the submit input, if any"""
for x in form.inputs:
if x.type == 'submit' and x.name:
return [(x.name, x.value)]
else:
return []
def fill_login_form(
url,
body,
username,
password,
):
doc = html.document_fromstring(body, base_url=url)
form = _pick_form(doc.xpath('//form'))
(userfield, passfield) = _pick_fields(form)
form.fields[userfield] = username
form.fields[passfield] = password
form_values = form.form_values() + submit_value(form)
return (form_values, form.action or form.base_url, form.method)
def main():
ap = ArgumentParser()
ap.add_argument('-u', '--username', default='username')
ap.add_argument('-p', '--password', default='secret')
ap.add_argument('url')
args = ap.parse_args()
try:
import requests
except ImportError:
print 'requests library is required to use loginform as a tool'
r = requests.get(args.url)
(values, action, method) = fill_login_form(args.url, r.text,
args.username, args.password)
print '''url: {0}
method: {1}
payload:'''.format(action, method)
for (k, v) in values:
print '- {0}: {1}'.format(k, v)
if __name__ == '__main__':
sys.exit(main())
The Log Message:
2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.throttle.AutoThrottle']
2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-02 23:31:28 [scrapy] INFO: Spider opened
2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 558,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 136267,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
I figured out the crappy mistake I made!!!!
I didn't place the functions inside the class. That's why things didn't work as expected. Once I added a level of indentation to all the functions, things started to work fine.
Thanks @user2989777 and @Granitosaurus for coming forward to debug.
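In other words, the def blocks have to be indented one level so that they become methods of the spider class. A minimal sketch of the corrected layout (method bodies trimmed to the essentials):

import scrapy
from scrapy.spiders import CrawlSpider

class zauba(CrawlSpider):
    name = 'Zauba'
    start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']

    # indented one level under the class: these are now methods,
    # so Scrapy actually finds and calls them
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        self.logger.info('parse called for %s', response.url)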
Scrapy already has a form request manager called FormRequest.
In most cases it will find the correct form by itself. You can try:
>>> scrapy shell "https://www.zauba.com/import-gold/p-1-hs-code.html"
from scrapy import FormRequest
login_data = {'name': 'mylogin', 'pass': 'mypass'}
request = FormRequest.from_response(response, formdata=login_data)
print(request.body)
# b'form_build_id=form-Lf7bFJPTN57MZwoXykfyIV0q3wzZEQqtA5s6Ce-bl5Y&form_id=user_login_block&op=Log+in&pass=mypass&name=mylogin'
Once you log in, any requests chained afterwards will have the session cookie attached to them, so you only need to log in once at the beginning of your chain.
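Put into the spider itself, that flow might look roughly like the sketch below. The spider name, the 'name'/'pass' field names and the follow-up URL are illustrative assumptions taken from the shell session and the question, not verified against the live site.

import scrapy
from scrapy import FormRequest

class ZaubaLoginSketch(scrapy.Spider):
    name = 'zauba_login_sketch'
    start_urls = ['https://www.zauba.com/user']

    def parse(self, response):
        # let FormRequest locate the login form and merge in the credentials
        yield FormRequest.from_response(
            response,
            formdata={'name': 'mylogin', 'pass': 'mypass'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # the session cookie is attached automatically to every request
        # in this chain, so just keep crawling the protected pages
        yield scrapy.Request(
            'https://www.zauba.com/import-gold/p-1-hs-code.html',
            callback=self.parse_listing,
        )

    def parse_listing(self, response):
        self.logger.info('fetched while logged in: %s', response.url)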

Scrapy CrawlSpider is not following Links

I'm trying to use Scrapy to crawl a page that uses next buttons to move to new pages. I'm using a crawl spider and have defined a LinkExtractor to extract the new pages to follow. However, the spider just crawls the start url and stops there. I've added the spider code and the log. Does anyone have any idea why the spider is not able to crawl the pages?
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from realcommercial.items import RealcommercialItem
from scrapy.selector import Selector
from scrapy.http import Request
class RealCommercial(CrawlSpider):
name = "realcommercial"
allowed_domains = ["realcommercial.com.au"]
start_urls = [
"http://www.realcommercial.com.au/for-sale/in-vic/list-1?nearbySuburb=false&autoSuggest=false&activeSort=list-date"
]
rules = [Rule(LinkExtractor( allow = ['/for-sale/in-vic/list-\d+?activeSort=list-date']),
callback='parse_response',
process_links='process_links',
follow=True),
Rule(LinkExtractor( allow = []),
callback='parse_response',
process_links='process_links',
follow=True)]
def parse_response(self, response):
sel = Selector(response)
sites = sel.xpath("//a[@class='details']")
#items = []
for site in sites:
item = RealcommercialItem()
link = site.xpath('@href').extract()
#print link, '\n\n'
item['link'] = link
link = 'http://www.realcommercial.com.au/' + str(link[0])
#print 'link!!!!!!=', link
new_request = Request(link, callback=self.parse_file_page)
new_request.meta['item'] = item
yield new_request
#items.append(item)
yield item
return
def process_links(self, links):
print 'inside process links'
for i, w in enumerate(links):
print w.url,'\n\n\n'
w.url = "http://www.realcommercial.com.au/" + w.url
print w.url,'\n\n\n'
links[i] = w
return links
def parse_file_page(self, response):
#item passed from request
#print 'parse_file_page!!!'
item = response.meta['item']
#selector
sel = Selector(response)
title = sel.xpath('//*[@id="listing_address"]').extract()
#print title
item['title'] = title
return item
Log
2015-11-29 15:42:55 [scrapy] INFO: Scrapy 1.0.3 started (bot: realcommercial)
2015-11-29 15:42:55 [scrapy] INFO: Optional features available: ssl, http11, bot
o
2015-11-29 15:42:55 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 're
alcommercial.spiders', 'FEED_FORMAT': 'csv', 'SPIDER_MODULES': ['realcommercial.
spiders'], 'FEED_URI': 'aaa.csv', 'BOT_NAME': 'realcommercial'}
2015-11-29 15:42:56 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter
, TelnetConsole, LogStats, CoreStats, SpiderState
2015-11-29 15:42:57 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddl
eware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultH
eadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMidd
leware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-11-29 15:42:57 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddlewa
re, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-11-29 15:42:57 [scrapy] INFO: Enabled item pipelines:
2015-11-29 15:42:57 [scrapy] INFO: Spider opened
2015-11-29 15:42:57 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 i
tems (at 0 items/min)
2015-11-29 15:42:57 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-11-29 15:42:59 [scrapy] DEBUG: Crawled (200) <GET http://www.realcommercial
.com.au/for-sale/in-vic/list-1?nearbySuburb=false&autoSuggest=false&activeSort=l
ist-date> (referer: None)
2015-11-29 15:42:59 [scrapy] INFO: Closing spider (finished)
2015-11-29 15:42:59 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 303,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 30599,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 11, 29, 10, 12, 59, 418000),
'log_count/DEBUG': 2,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 11, 29, 10, 12, 57, 780000)}
2015-11-29 15:42:59 [scrapy] INFO: Spider closed (finished)
I got the answer myself. There were two issues:
process_links was prepending "http://www.realcommercial.com.au/" although it was already there; I thought the link extractor would give back relative urls.
The regular expression in the link extractor was not correct.
I made changes to both of these and it worked.
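For reference, the corrected pagination rule might be sketched like this, assuming the site already emits absolute URLs (so no process_links rewriting is needed) and that the pagination links all contain /for-sale/in-vic/list-<number>:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# sketch: a deliberately loose pattern that matches the list pages;
# it can be tightened once the real query strings are known
pagination_rule = Rule(
    LinkExtractor(allow=[r'/for-sale/in-vic/list-\d+']),
    callback='parse_response',
    follow=True,
)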
