I am writing a scrapy project which works perfectly. I have converted it to an executable using pyinstaller. Now I was expecting some trouble as to importing modules as I have read that a lot of people are having trouble with that. But for some reason I don't even get that far. As soon as I run the main.exe file, the console opens up and shows the following message:
Traceback (most recent call last):
File "rascraper\main.py", line 1,
This is the corresponding main.py file
from rascraper.spiders.spiderone import PostsSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def main():
process = CrawlerProcess(get_project_settings())
process.crawl(PostsSpider)
process.start()
if __name__ == '__main__':
main()
And This is my spider class
import scrapy
class PostsSpider(scrapy.Spider):
name = 'posts'
# artist = input(f'Artist Name:')
# filter = input(f'filter on Country? (y/n):')
#
# if filter == 'y':
# country = input(f'Country:')
# start_urls = [
# f'https://ra.co/dj/{artist}/past-events?country={country}'
# ]
#
# elif filter == 'n':
# start_urls = [
# f'https://ra.co/dj/{artist}/past-events'
# ]
HEADERS = {
'accept': '/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7,fr;q=0.6',
'authorization': 'df67dacc9c704696b908a618dd4f59be',
'cache-control': 'max-age=0',
'content-type': 'application/json',
'origin': 'https://ra.co',
'referer': 'https://ra.co/',
'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
}
def parse(self, response):
for post in response.css('li.Column-sc-18hsrnn-0.inVJeD'):
date = post.css('.Text-sc-1t0gn2o-0.jmZufm::text').get()
event = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dXQVFW::text').get()
location = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.echVma::text').get()
venue = post.css('.Text-sc-1t0gn2o-0.Link__StyledLink-k7o46r-0.dxNiKF::text').get()
acts = post.css('.Text-sc-1t0gn2o-0.bYvpkM::text').get()
item = {}
item['Date'] = date
item['Event'] = event
item['Location'] = location
item['Venue'] = venue
item['Acts'] = acts
yield item
Where does this error come from and how can I solve it?
Making a standalone executable from scrapy project with PyInstaller
In order to create a single executable file you'll need to do the following steps:
Add this to all of your spiders (source):
import scrapy.utils.misc
import scrapy.core.scraper
def warn_on_generator_with_return_value_stub(spider, callable):
pass
scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
So in my example spider.py will look like this:
import scrapy
import scrapy.utils.misc
import scrapy.core.scraper
def warn_on_generator_with_return_value_stub(spider, callable):
pass
scrapy.utils.misc.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
scrapy.core.scraper.warn_on_generator_with_return_value = warn_on_generator_with_return_value_stub
class ExampleSpider(scrapy.Spider):
name = 'example_spider'
allowed_domains = ['scrapingclub.com']
start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
def parse(self, response):
item = dict()
item['title'] = response.xpath('//h3/text()').get()
item['price'] = response.xpath('//div[#class="card-body"]/h4/text()').get()
yield item
Add this to main.py (if you don't add this then you'll get error whenever you try to run the executable from a directory outside of your project's directory):
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', PATH_TO_SETTINGS)
In this example main.py:
import os
from rascraper.spiders.spider import ExampleSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def main():
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings')
process = CrawlerProcess(get_project_settings())
process.crawl(ExampleSpider)
process.start()
if __name__ == '__main__':
main()
Run pyinstaller to generate a spec file: python -m PyInstaller --onefile --name example_exe main.py.
Change the spec file: Add all the files in your project to datas list.
Before:
# -*- mode: python ; coding: utf-8 -*-
block_cipher = None
a = Analysis(['main.py'],
pathex=[],
binaries=[],
datas=[],
hiddenimports=[],
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False)
pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher)
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name='example_exe',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
upx_exclude=[],
runtime_tmpdir=None,
console=True,
disable_windowed_traceback=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None )
After:
# -*- mode: python ; coding: utf-8 -*-
block_cipher = None
a = Analysis(['main.py'],
pathex=[],
binaries=[],
datas=[('items.py','.'),
('middlewares.py','.'),
('pipelines.py','.'),
('settings.py','.'),
('spiders','spiders'),
('..\\scrapy.cfg', '.')],
hiddenimports=[],
hookspath=[],
hooksconfig={},
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False)
pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher)
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name='example_exe',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
upx_exclude=[],
runtime_tmpdir=None,
console=True,
disable_windowed_traceback=False,
target_arch=None,
codesign_identity=None,
entitlements_file=None )
Build the spec file: python -m PyInstaller example_exe.spec
Result:
Now should have a standalone executable that you can run in any directory:
C:\Users\MY_USER\Desktop>example_exe.exe
...
...
[scrapy.core.engine] INFO: Spider opened
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
[scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
[scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
[scrapy.core.scraper] DEBUG: Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
{'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
[scrapy.core.engine] INFO: Closing spider (finished)
[scrapy.statscollectors] INFO: Dumping Scrapy stats:
...
...
Specifically for OP's project:
The project tree looks like this:
C:.
│ main.py
│ scrapy.cfg
│
└───rascraper
│ items.py
│ middlewares.py
│ pipelines.py
│ settings.py
│ __init__.py
│
├───spiders
│ │ spiderone.py
│ │ __init__.py
│ │
│ └───__pycache__
│ spiderone.cpython-310.pyc
│ __init__.cpython-310.pyc
│
└───__pycache__
middlewares.cpython-310.pyc
pipelines.cpython-310.pyc
settings.cpython-310.pyc
__init__.cpython-310.pyc
So the datas list should be:
datas=[('rascraper\\items.py', '.'),
('rascraper\\middlewares.py', '.'),
('rascraper\\pipelines.py', '.'),
('rascraper\\settings.py', '.'),
('rascraper\\spiders', 'spiders'),
('scrapy.cfg', '.')],
Correction: In main.py it should just be os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'settings').
Related
I'm trying to extract data (title, price and description) from ajax but it doesn't work even by changing the user agent
Link : https://scrapingclub.com/exercise/detail_header/
Ajax (data want to extract) : https://scrapingclub.com/exercise/ajaxdetail_header/
import scrapy
class UseragentSpider(scrapy.Spider):
name = 'useragent'
allowed_domains = ['scrapingclub.com/exercise/ajaxdetail_header/']
start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
def parse(self, response):
cardb= response.xpath("//div[#class='card-body']")
for thing in cardb:
title= thing.xpath(".//h3")
yield {'title' : title}
Error log :
2020-09-07 20:34:39 [scrapy.core.engine] INFO: Spider opened
2020-09-07 20:34:39 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-07 20:34:39 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://scrapingclub.com/exercise/ajaxdetail_header/> (referer: None)
2020-09-07 20:34:40 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://scrapingclub.com/exercise/ajaxdetail_header/>: HTTP status code is not handled or not allowed
AJAX requests should send header
'X-Requested-With': 'XMLHttpRequest'
but not all servers check it. But this server check it. But it doesn't check User-Agent.
Server sends data as JSON so xpath will be useless.
I tested it with requests instead of scrapy because it was simpler for me.
import requests
headers = {
#'User-Agent': 'Mozilla/5.0',
'X-Requested-With': 'XMLHttpRequest',
}
url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'
response = requests.get(url, headers=headers)
data = response.json()
print(data)
print('type:', type(data))
print('keys:', data.keys())
print('--- manually ---')
print('price:', data['price'])
print('title:', data['title'])
print('--- for-loop ---')
for key, value in data.items():
print('{}: {}'.format(key, value))
Result:
{'img_path': '/static/img/00959-A.jpg', 'price': '$24.99', 'description': 'Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.', 'title': 'Crinkled Flounced Blouse'}
type: <class 'dict'>
keys: dict_keys(['img_path', 'price', 'description', 'title'])
--- manually ---
price: $24.99
title: Crinkled Flounced Blouse
--- for-loop ---
img_path: /static/img/00959-A.jpg
price: $24.99
description: Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.
title: Crinkled Flounced Blouse
EDIT:
The same with Scrapy. I use function start_requests() to create Request() with header 'X-Requested-With'
You can put all code in one file and run python script.py without creating project.
import scrapy
import json
class MySpider(scrapy.Spider):
name = 'myspider'
def start_requests(self):
url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'
headers = {
#'User-Agent': 'Mozilla/5.0',
'X-Requested-With': 'XMLHttpRequest',
}
yield scrapy.http.Request(url, headers=headers)
def parse(self, response):
print('url:', response.url)
data = response.json()
print(data)
print('type:', type(data))
print('keys:', data.keys())
print('--- manually ---')
print('price:', data['price'])
print('title:', data['title'])
print('--- for-loop ---')
for key, value in data.items():
print('{}: {}'.format(key, value))
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
#'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
#'FEED_FORMAT': 'csv', # csv, json, xml
#'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
EDIT:
The same using setting DEFAULT_REQUEST_HEADERS
import scrapy
import json
class MySpider(scrapy.Spider):
name = 'myspider'
start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']
def parse(self, response):
print('url:', response.url)
#print('headers:', response.request.headers)
data = response.json()
print(data)
print('type:', type(data))
print('keys:', data.keys())
print('--- manually ---')
print('price:', data['price'])
print('title:', data['title'])
print('--- for-loop ---')
for key, value in data.items():
print('{}: {}'.format(key, value))
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
#'USER_AGENT': 'Mozilla/5.0',
'DEFAULT_REQUEST_HEADERS': {
#'User-Agent': 'Mozilla/5.0',
'X-Requested-With': 'XMLHttpRequest',
}
# save in file CSV, JSON or XML
#'FEED_FORMAT': 'csv', # csv, json, xml
#'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
I've been trying to follow the Scrapy tutorial but I stuck and have no idea where is mistake.
It is working but no items are crawled.
I get the following output:
C:\Users\xxx\allegro>scrapy crawl AllegroPrices
2017-12-10 22:25:14 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: AllegroPrices)
2017-12-10 22:25:14 [scrapy.utils.log] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'allegro.spiders', 'SPIDER_MODULES': ['allegro.spiders'], 'ROBOTSTXT_OBEY': True, 'LOG_LEVEL': 'INFO', 'BOT_NAME': 'AllegroPrices'}
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'allegro.middlewares.AllegroSpiderMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-12-10 22:25:15 [scrapy.middleware] INFO: Enabled item pipelines:
['allegro.pipelines.AllegroPipeline']
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Spider opened
2017-12-10 22:25:15 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-12-10 22:25:15 [AllegroPrices] INFO: Spider opened: AllegroPrices
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Closing spider (finished)
2017-12-10 22:25:15 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 12, 10, 21, 25, 15, 527000),
'log_count/INFO': 8,
'start_time': datetime.datetime(2017, 12, 10, 21, 25, 15, 517000)}
2017-12-10 22:25:15 [scrapy.core.engine] INFO: Spider closed (finished)
My spider file:
# -*- coding: utf-8 -*-
import scrapy
from allegro.items import AllegroItem
class AllegroPrices(scrapy.Spider):
name = "AllegroPrices"
allowed_domains = ["allegro.pl"]
#Use working product URL below
start_urls = [
"http://allegro.pl/diablo-ii-lord-of-destruction-2-pc-big-box-eng-i6896736152.html", "http://allegro.pl/diablo-ii-2-pc-dvd-box-eng-i6961686788.html",
"http://allegro.pl/star-wars-empire-at-war-2006-dvd-box-i6995651106.html", "http://allegro.pl/heavy-gear-ii-2-pc-eng-cdkingpl-i7059163114.html"
]
def parse(self, response):
items = AllegroItem()
title = response.xpath('//h1[#class="title"]//text()').extract()
sale_price = response.xpath('//div[#class="price"]//text()').extract()
seller = response.xpath('//div[#class="btn btn-default btn-user"]/span/text()').extract()
items['product_name'] = ''.join(title).strip()
items['product_sale_price'] = ''.join(sale_price).strip()
items['product_seller'] = ''.join(seller).strip()
yield items
Settings:
# -*- coding: utf-8 -*-
# Scrapy settings for allegro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'AllegroPrices'
SPIDER_MODULES = ['allegro.spiders']
NEWSPIDER_MODULE = 'allegro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'allegro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
'allegro.middlewares.AllegroSpiderMiddleware': 543,
}
LOG_LEVEL = 'INFO'
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'allegro.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'allegro.pipelines.AllegroPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Pipeline:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class AllegroPipeline(object):
def process_item(self, item, spider):
return item
Items:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class AllegroItem(scrapy.Item):
# define the fields for your item here like:
product_name = scrapy.Field()
product_sale_price = scrapy.Field()
product_seller = scrapy.Field()
I have no problem to run it as standalone script without creating project and save to CSV file.
And I don't have to change USER-AGENT.
Maybe there is problem with some settings. You didn't put url to tutorial to check it.
Or simply you have wrong indentions and start_urls and parse() in not inside class. Indentions are very important in Python.
BTW: you forgot /a/ in xpath for seller.
import scrapy
#class AllegroItem(scrapy.Item):
# product_name = scrapy.Field()
# product_sale_price = scrapy.Field()
# product_seller = scrapy.Field()
class AllegroPrices(scrapy.Spider):
name = "AllegroPrices"
allowed_domains = ["allegro.pl"]
start_urls = [
"http://allegro.pl/diablo-ii-lord-of-destruction-2-pc-big-box-eng-i6896736152.html",
"http://allegro.pl/diablo-ii-2-pc-dvd-box-eng-i6961686788.html",
"http://allegro.pl/star-wars-empire-at-war-2006-dvd-box-i6995651106.html",
"http://allegro.pl/heavy-gear-ii-2-pc-eng-cdkingpl-i7059163114.html"
]
def parse(self, response):
title = response.xpath('//h1[#class="title"]//text()').extract()
sale_price = response.xpath('//div[#class="price"]//text()').extract()
seller = response.xpath('//div[#class="btn btn-default btn-user"]/a/span/text()').extract()
title = title[0].strip()
print(title, sale_price, seller)
yield {'title': title, 'price': sale_price, 'seller': seller}
#items = AllegroItem()
#items['product_name'] = ''.join(title).strip()
#items['product_sale_price'] = ''.join(sale_price).strip()
#items['product_seller'] = ''.join(seller).strip()
#yield items
# --- run it as standalone script without project and save in CSV ---
from scrapy.crawler import CrawlerProcess
#c = CrawlerProcess()
c = CrawlerProcess({
# 'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv'
})
c.crawl(AllegroPrices)
c.start()
Result in CSV file:
title,price,seller
STAR WARS: EMPIRE AT WAR [2006] DVD BOX,"24,90 zł",CDkingpl
DIABLO II: LORD OF DESTRUCTION 2 PC BIG BOX ENG,"149,00 zł",CDkingpl
HEAVY GEAR II 2 | PC ENG CDkingpl,"19,90 zł",CDkingpl
DIABLO II 2 | PC DVD BOX | ENG,"24,90 zł",CDkingpl
I have wasted days to get my mind around Scrapy, reading the docs and other Scrapy Blogs and Q&A ... and now I am about to do what men hate most: Ask for directions ;-) The problem is: My spider opens, fetches the start_urls, but apparently does nothing with them. Instead it closes immediately and that was that. Apparently, I do not even get to the first self.log() statement.
What I've got so far is this:
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.shell import inspect_response
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse, FormRequest, Request
from KiPieSpider.items import *
from KiPieSpider.settings import *
class KiSpider(CrawlSpider):
name = "KiSpider"
allowed_domains = ['www.kiweb.de', 'kiweb.de']
start_urls = (
# ST Regra start page:
'https://www.kiweb.de/default.aspx?pageid=206',
# follow ST Regra links in the form of:
# https://www.kiweb.de/default.aspx?pageid=206&page=\d+
# https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
# ST Thermo start page:
'https://www.kiweb.de/default.aspx?pageid=202&page=1',
# follow ST Thermo links in the form of:
# https://www.kiweb.de/default.aspx?pageid=202&page=\d+
# https://www.kiweb.de/default.aspx?pageid=299&docid=\d{6}
)
rules = (
# First rule that matches a given link is followed / parsed.
# Follow category pagination without further parsing:
Rule(
LinkExtractor(
# Extract links in the form:
allow=r'Default\.aspx?pageid=(202|206])&page=\d+',
# but only within the pagination table cell:
restrict_xpaths=('//td[#id="ctl04_teaser_next"]'),
),
follow=True,
),
# Follow links to category (202|206) articles and parse them:
Rule(
LinkExtractor(
# Extract links in the form:
allow=r'Default\.aspx?pageid=299&docid=\d+',
# but only within article preview cells:
restrict_xpaths=("//td[#class='TOC-zelle TOC-text']"),
),
# and parse the resulting pages for article content:
callback='parse_init',
follow=False,
),
)
# Once an article page is reached, check whether a login is necessary:
def parse_init(self, response):
self.log('Parsing article: %s' % response.url)
if not response.xpath('input[#value="Logout"]'):
# Note: response.xpath() is a shortcut of response.selector.xpath()
self.log('Not logged in. Logging in...\n')
return self.login(response)
else:
self.log('Already logged in. Continue crawling...\n')
return self.parse_item(response)
def login(self, response):
self.log("Trying to log in...\n")
self.username = self.settings['KI_USERNAME']
self.password = self.settings['KI_PASSWORD']
return FormRequest.from_response(
response,
formname='Form1',
formdata={
# needs name, not id attributes!
'ctl04$Header$ctl01$textbox_username': self.username,
'ctl04$Header$ctl01$textbox_password': self.password,
'ctl04$Header$ctl01$textbox_logindaten_typ': 'Username_Passwort',
'ctl04$Header$ctl01$checkbox_permanent': 'True',
},
callback = self.parse_item,
)
def parse_item(self, response):
articles = response.xpath('//div[#id="artikel"]')
items = []
for article in articles:
item = KiSpiderItem()
item['link'] = response.url
item['title'] = articles.xpath("div[#class='ct1']/text()").extract()
item['subtitle'] = articles.xpath("div[#class='ct2']/text()").extract()
item['article'] = articles.extract()
item['published'] = articles.xpath("div[#class='biblio']/text()").re(r"(\d{2}.\d{2}.\d{4}) PIE")
item['artid'] = articles.xpath("div[#class='biblio']/text()").re(r"PIE \[(d+)-\d+\]")
item['lang'] = 'de-DE'
items.append(item)
# return(items)
yield items
# what is the difference between return and yield?? found both on web.
When doing scrapy crawl KiSpider, this results in:
2017-03-09 18:03:33 [scrapy.utils.log] INFO: Scrapy 1.3.2 started (bot: KiPieSpider)
2017-03-09 18:03:33 [scrapy.utils.log] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'KiPieSpider.spiders', 'DEPTH_LIMIT': 3, 'CONCURRENT_REQUESTS': 8, 'SPIDER_MODULES': ['KiPieSpider.spiders'], 'BOT_NAME': 'KiPieSpider', 'DOWNLOAD_TIMEOUT': 60, 'USER_AGENT': 'KiPieSpider (info#defrent.de)', 'DOWNLOAD_DELAY': 0.25}
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-03-09 18:03:33 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-03-09 18:03:33 [scrapy.core.engine] INFO: Spider opened
2017-03-09 18:03:33 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-03-09 18:03:33 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-03-09 18:03:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.kiweb.de/default.aspx?pageid=206> (referer: None)
2017-03-09 18:03:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.kiweb.de/default.aspx?pageid=202&page=1> (referer: None)
2017-03-09 18:03:34 [scrapy.core.engine] INFO: Closing spider (finished)
2017-03-09 18:03:34 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 465,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 48998,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 3, 9, 17, 3, 34, 235000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2017, 3, 9, 17, 3, 33, 295000)}
2017-03-09 18:03:34 [scrapy.core.engine] INFO: Spider closed (finished)
Is it that the login routine should not end with a callback, but some kind of return/yield statement? Or what am I doing wrong? Unfortunately, the docs and tutorials I have seen so far only give me a vague idea of how every bit connects to the others, especially Scrapy's docs seem to be written as a reference for people who already know a lot about Scrapy.
Somewhat frustrated greetings
Christopher
rules = (
# First rule that matches a given link is followed / parsed.
# Follow category pagination without further parsing:
Rule(
LinkExtractor(
# Extract links in the form:
# allow=r'Default\.aspx?pageid=(202|206])&page=\d+',
# but only within the pagination table cell:
restrict_xpaths=('//td[#id="ctl04_teaser_next"]'),
),
follow=True,
),
# Follow links to category (202|206) articles and parse them:
Rule(
LinkExtractor(
# Extract links in the form:
# allow=r'Default\.aspx?pageid=299&docid=\d+',
# but only within article preview cells:
restrict_xpaths=("//td[#class='TOC-zelle TOC-text']"),
),
# and parse the resulting pages for article content:
callback='parse_init',
follow=False,
),
)
you do not need allow parameter, because there is only one link in the tag selected by XPath.
I do not understand the regex in allow parameter but at least you should escape the ?.
My intention is to invoke start_requests method to login to the website. After login, scrape the website. Based on the log message, I see that
1. But, I see that start_request is not invoked.
2. call_back function of the parse is also not invoking.
Whats actually happening is spider is only loading the urls in the start_urls.
Question:
Why the spider is not crawling through other pages(say page 2, 3, 4)?
Why looking from spider is not working?
Note:
My method to calculate page number and url creation is correct. I verified it.
I referred this link to write this code Using loginform with scrapy
My code:
zauba.py (spider)
#!/usr/bin/env python
from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from loginform import fill_login_form
import logging
logger = logging.getLogger('Zauba')
class zauba(CrawlSpider):
name = 'Zauba'
login_url = 'https://www.zauba.com/user'
login_user = 'scrapybot1#gmail.com'
login_password = 'scrapybot1'
logger.info('zauba')
start_urls = ['https://www.zauba.com/import-gold/p-1-hs-code.html']
def start_requests(self):
logger.info('start_request')
# let's start by sending a first request to login page
yield scrapy.Request(self.login_url, callback = self.parse_login)
def parse_login(self, response):
logger.warning('parse_login')
# got the login page, let's fill the login form...
data, url, method = fill_login_form(response.url, response.body,
self.login_user, self.login_password)
# ... and send a request with our login data
return FormRequest(url, formdata=dict(data),
method=method, callback=self.start_crawl)
def start_crawl(self, response):
logger.warning('start_crawl')
# OK, we're in, let's start crawling the protected pages
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
logger.info('parse')
text = response.xpath('//div[#id="block-system-main"]/div[#class="content"]/div[#style="width:920px; margin-bottom:12px;"]/span/text()').extract_first()
total_entries = int(text.split()[0].replace(',', ''))
total_pages = int(math.ceil((total_entries*1.0)/30))
logger.warning('*************** : ' + total_pages)
print('*************** : ' + total_pages)
for page in xrange(1, (total_pages + 1)):
url = 'https://www.zauba.com/import-gold/p-' + page +'-hs-code.html'
log.msg('url%d : %s' % (pages,url))
yield scrapy.Request(url, callback=self.extract_entries)
def extract_entries(self, response):
logger.warning('extract_entries')
row_trs = response.xpath('//div[#id="block-system-main"]/div[#class="content"]/div/table/tr')
for row_tr in row_trs[1:]:
row_content = row_tr.xpath('.//td/text()').extract()
if (row_content.__len__() == 9):
print row_content
yield {
'date' : row_content[0].replace(' ', ''),
'hs_code' : int(row_content[1]),
'description' : row_content[2],
'origin_country' : row_content[3],
'port_of_discharge' : row_content[4],
'unit' : row_content[5],
'quantity' : int(row_content[6].replace(',', '')),
'value_inr' : int(row_content[7].replace(',', '')),
'per_unit_inr' : int(row_content[8].replace(',', '')),
}
loginform.py
#!/usr/bin/env python
import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html
__version__ = '1.0' # also update setup.py
def _form_score(form):
score = 0
# In case of user/pass or user/pass/remember-me
if len(form.inputs.keys()) in (2, 3):
score += 10
typecount = defaultdict(int)
for x in form.inputs:
type_ = (x.type if isinstance(x, html.InputElement) else 'other'
)
typecount[type_] += 1
if typecount['text'] > 1:
score += 10
if not typecount['text']:
score -= 10
if typecount['password'] == 1:
score += 10
if not typecount['password']:
score -= 10
if typecount['checkbox'] > 1:
score -= 10
if typecount['radio']:
score -= 10
return score
def _pick_form(forms):
"""Return the form most likely to be a login form"""
return sorted(forms, key=_form_score, reverse=True)[0]
def _pick_fields(form):
"""Return the most likely field names for username and password"""
userfield = passfield = emailfield = None
for x in form.inputs:
if not isinstance(x, html.InputElement):
continue
type_ = x.type
if type_ == 'password' and passfield is None:
passfield = x.name
elif type_ == 'text' and userfield is None:
userfield = x.name
elif type_ == 'email' and emailfield is None:
emailfield = x.name
return (userfield or emailfield, passfield)
def submit_value(form):
"""Returns the value for the submit input, if any"""
for x in form.inputs:
if x.type == 'submit' and x.name:
return [(x.name, x.value)]
else:
return []
def fill_login_form(
url,
body,
username,
password,
):
doc = html.document_fromstring(body, base_url=url)
form = _pick_form(doc.xpath('//form'))
(userfield, passfield) = _pick_fields(form)
form.fields[userfield] = username
form.fields[passfield] = password
form_values = form.form_values() + submit_value(form)
return (form_values, form.action or form.base_url, form.method)
def main():
ap = ArgumentParser()
ap.add_argument('-u', '--username', default='username')
ap.add_argument('-p', '--password', default='secret')
ap.add_argument('url')
args = ap.parse_args()
try:
import requests
except ImportError:
print 'requests library is required to use loginform as a tool'
r = requests.get(args.url)
(values, action, method) = fill_login_form(args.url, r.text,
args.username, args.password)
print '''url: {0}
method: {1}
payload:'''.format(action, method)
for (k, v) in values:
print '- {0}: {1}'.format(k, v)
if __name__ == '__main__':
sys.exit(main())
The Log Message:
2016-10-02 23:31:28 [scrapy] INFO: Scrapy 1.1.3 started (bot: scraptest)
2016-10-02 23:31:28 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scraptest.spiders', 'FEED_URI': 'medic.json', 'SPIDER_MODULES': ['scraptest.spiders'], 'BOT_NAME': 'scraptest', 'ROBOTSTXT_OBEY': True, 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:39.0) Gecko/20100101 Firefox/39.0', 'FEED_FORMAT': 'json', 'AUTOTHROTTLE_ENABLED': True}
2016-10-02 23:31:28 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.throttle.AutoThrottle']
2016-10-02 23:31:28 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-02 23:31:28 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-02 23:31:28 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-02 23:31:28 [scrapy] INFO: Spider opened
2016-10-02 23:31:28 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-02 23:31:28 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2016-10-02 23:31:29 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/robots.txt> (referer: None)
2016-10-02 23:31:38 [scrapy] DEBUG: Crawled (200) <GET https://www.zauba.com/import-gold/p-1-hs-code.html> (referer: None)
2016-10-02 23:31:38 [scrapy] INFO: Closing spider (finished)
2016-10-02 23:31:38 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 558,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 136267,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 3, 6, 31, 38, 560012),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 10, 3, 6, 31, 28, 927872)}
2016-10-02 23:31:38 [scrapy] INFO: Spider closed (finished)
I figured out the crapy mistake i did!!!!
I didn't place the functions inside the class. Thats why .... things didnt work as expected. Now, I added a tab space to all the fuctions and things started to work fine
Thanks #user2989777 and #Granitosaurus for coming forward to debug
Scrapy already has form request manager called FormRequest.
In most of the cases it will find the correct form by itself. You can try:
>>> scrapy shell "https://www.zauba.com/import-gold/p-1-hs-code.html"
from scrapy import FormRequest
login_data={'name':'mylogin', 'pass':'mypass'})
request = FormRequest.from_response(response, formdata=login_data)
print(request.body)
# b'form_build_id=form-Lf7bFJPTN57MZwoXykfyIV0q3wzZEQqtA5s6Ce-bl5Y&form_id=user_login_block&op=Log+in&pass=mypass&name=mylogin'
Once you log in any requests chained afterwards will have a session cookie attached to them so you only need to login once at the beginning of your chain.
I write a small spider, when I run and it can't call pipeline.
After debug for a while, I find the bug code area.
The logic of the spider is that I crawl the first url to fetch cookie, then I crawl the second url to download the code picture with cookie, and I post some data I prepare to the third url. And If the text I get from the picture wrong then I download again to post the third url repeatedly, until I got the right text.
Let me show you the code:
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem
class GetTeacherCourseSpider(scrapy.Spider):
name = 'TeacherCourse'
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'teacherCourse.pipelines.TeacherCoursePipeline': 300,
# }
# }
def __init__(self, selXNXQ='', titleCode=''):
self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
self.findSessionId = None # to save the cookies
self.XNXQ = selXNXQ
self.titleCode = titleCode
def start_requests(self):
request = scrapy.Request(self.getUrl,
callback = self.downloadPic)
yield request
def downloadPic(self, response):
# download the picture
# find the session id
self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
request = scrapy.Request(self.vcodeUrl,
cookies= {self.findSessionId[0]: self.findSessionId[1]},
callback = self.getAndHandleYzm)
yield request
def getAndHandleYzm(self, response):
yzm = handle(response.body)
yield FormRequest(self.postUrl,
formdata={'Sel_XNXQ': '20151',
'sel_zc': '011',
'txt_yzm': yzm,
'type': '2'},
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
},
callback=self.parse)
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
self.parseData(body)
# item = containItem()
# item['first'] = len(body)
# return item
# the parse data part is a little bit long, but it doesn't matter.
# At the last line, I did yield a item
def parseData(self, body):
# parse body data
sel = Selector(text=body)
# get all the note text data
noteTables = sel.xpath('//table[#style="border:0px;"]').extract()
noteList = [] # to store all the note text
for noteTable in noteTables:
if '<b>' in noteTable:
sele = Selector(text = noteTable)
note = (sele.xpath('//table/tr/td/b/text()').extract())
noteText = (sele.xpath('//table/tr/td/text()').extract())
# combine note and noteText
if not noteText:
noteText.append('')
noteText.append('')
else:
if len(noteText) == 1:
noteText.append('')
noteList.append(noteText)
# get all the course data
courseTables = sel.xpath('//table[#class="page_table"]/tbody').extract()
AllDetailCourse = [] # all the teachers' course
for table in courseTables:
everyTeacherC = [] # every teacher's course
s = Selector(text = table)
trs = s.xpath('//tr').extract()
for tr in trs:
sel = Selector(text = tr)
snum = (sel.xpath('//td[1]/text()').extract())
course = (sel.xpath('//td[2]/text()').extract())
credit = (sel.xpath('//td[3]/text()').extract())
teachWay = (sel.xpath('//td[4]/text()').extract())
courseType = (sel.xpath('//td[5]/text()').extract())
classNum = (sel.xpath('//td[6]/text()').extract())
className = (sel.xpath('//td[7]/text()').extract())
stuNum = (sel.xpath('//td[8]/text()').extract())
week = (sel.xpath('//td[9]/text()').extract())
section = (sel.xpath('//td[10]/text()').extract())
location = (sel.xpath('//td[11]/text()').extract())
tmpList = []
tmpList.append(snum)
tmpList.append(course)
tmpList.append(credit)
tmpList.append(teachWay)
tmpList.append(courseType)
tmpList.append(classNum)
tmpList.append(className)
tmpList.append(stuNum)
tmpList.append(week)
tmpList.append(section)
tmpList.append(location)
# to know whether every variable is empty
detailCourse = []
for each in tmpList:
if not each:
each = ''
else:
each = each[0]
detailCourse.append(each)
everyTeacherC.append(detailCourse)
AllDetailCourse.append(everyTeacherC)
# get department, teacher, gender and title
sel = Selector(text = body)
temp1 = sel.xpath('//*[#group="group"]/table/tr/td/text()').extract()
# fill two tables, which will store in the database
i = 0
# every professor
for each in temp1:
tables = containItem() # all the data in every for loop to send to the pipeline
each = each.replace(u'\xa0', u' ')
each = each.split(' ')
depart = each[0].split('£º')
teacher = each[1].split('£º')
gender = each[2].split('£º')
title = each[3].split('£º')
# first table
profItem = DetailProfItem()
profItem['XNXQ'] = self.XNXQ
profItem['department'] = depart[1] # department
profItem['teacher'] = teacher[1] # teacher
profItem['gender'] = gender[1]
profItem['title'] = title[1]
profItem['note1'] = noteList[i][0]
profItem['note2'] = noteList[i][1]
tables['first'] = profItem # add the first table
# second table
# every professor's courses
profCourses = []
for j in range(len(AllDetailCourse[i])): # how many course for every professor
profCourseItem = DetailProfCourseItem() # every course for every professor
profCourseItem['snum'] = AllDetailCourse[i][j][0] # i means i-th professor, j means j-th course, third num means what position of the course
profCourseItem['course'] = AllDetailCourse[i][j][1]
profCourseItem['credit'] = AllDetailCourse[i][j][2]
profCourseItem['teachWay'] = AllDetailCourse[i][j][3]
profCourseItem['courseType'] = AllDetailCourse[i][j][4]
profCourseItem['classNum'] = AllDetailCourse[i][j][5]
profCourseItem['className'] = AllDetailCourse[i][j][6]
profCourseItem['stuNum'] = AllDetailCourse[i][j][7]
profCourseItem['week'] = AllDetailCourse[i][j][8]
profCourseItem['section'] = AllDetailCourse[i][j][9]
profCourseItem['location'] = AllDetailCourse[i][j][10]
profCourses.append(profCourseItem) # every professor's courses
tables['second'] = profCourseItem # add the second table
i += 1
yield tables
Any suggestions would be appreciate!
settings.py: (pipeline part)
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'teacherCourse.pipelines.TeacherCoursePipeline': 300,
}
items.py: (I don't think it's matter)
# detail professor course message
class DetailProfCourseItem(scrapy.Item):
snum = scrapy.Field() # serial number
course = scrapy.Field()
credit = scrapy.Field()
teachWay = scrapy.Field()
courseType = scrapy.Field()
classNum = scrapy.Field()
className = scrapy.Field()
stuNum = scrapy.Field()
week = scrapy.Field()
section = scrapy.Field()
location = scrapy.Field()
# the third item which contain first and second item
class containItem(scrapy.Item):
first = scrapy.Field() # for fist table
second = scrapy.Field() # for second table
pipeline code:
class TeacherCoursePipeline(object):
def process_item(self, item, spider):
print('I am called!!!!!')
print(item)
return item
And When I run spider scrapy crawl TeacherCourse
it output:
2016-07-19 17:39:18 [scrapy] INFO: Scrapy 1.1.0rc1 started (bot: teacherCourse)
2016-07-19 17:39:18 [scrapy] INFO: Overridden settings: {'BOT_NAME': 'teacherCourse', 'NEWSPIDER_MODULE': 'teacherCourse.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['teacherCourse.spiders']}
2016-07-19 17:39:18 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-07-19 17:39:18 [scrapy] INFO: Enabled item pipelines:
['teacherCourse.pipelines.TeacherCoursePipeline']
2016-07-19 17:39:18 [scrapy] INFO: Spider opened
2016-07-19 17:39:18 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (404) <GET http://jwxt.dgut.edu.cn/robots.txt> (referer: None)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx> (referer: None)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <POST http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] INFO: Closing spider (finished)
2016-07-19 17:39:19 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1330,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 230886,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 19, 9, 39, 19, 861620),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 4,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2016, 7, 19, 9, 39, 18, 774293)}
2016-07-19 17:39:19 [scrapy] INFO: Spider closed (finished)
The problem seems to be that the parse method only yields scrapy.Request objects, never scrapy.Item instances.
The else: branch calls the generator parseData(body) but doesn't use the data it can produce (namely containItem objects).
One way to solve this is to loop on the generator results and yield them one-by-one:
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
for i in self.parseData(body):
yield i
# item = containItem()
# item['first'] = len(body)
# return item