I'm building a simple spider to crawl a structured site and download *.txt files. I've managed to get everything working except for a custom FilesPipeline class.
My goal is to download *.txt files into directories that mirror their URL location. I can achieve this if I edit the Scrapy class directly (shown below):
files.py -> FilesPipeline::file_path()
...
# return 'full/%s%s' % (media_guid, media_ext)
return url.split('example.com/')[1]
I want to override the class properly but haven't been successful. I'm not sure what I should be doing differently. The spider runs with no warnings or errors but won't download files.
settings.py
ITEM_PIPELINES = {
'myspider.pipelines.MySpiderFilesPipeline': 1,
'myspider.pipelines.MySpiderPipeline': 300,
}
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from myspider.items import MySpiderItem
class SpideySpider(CrawlSpider):
name = 'spidey'
allowed_domains = ['example.com']
start_urls = ['http://example.com/']
rules = (
Rule(LinkExtractor(allow='', restrict_xpaths='//tr/td/a', deny_extensions='html'), callback='parse_item', follow=True),
)
def parse_item(self, response):
links = response.xpath('//tr/td/a')
for link in links:
i = MySpiderItem()
i['title'] = response.xpath('//title/text()').extract()
i['href'] = link.xpath('@href').extract()
i['text'] = link.xpath('text()').extract()
i["current_url"] = response.url
referring_url = response.request.headers.get('Referer', None)
i['referring_url'] = referring_url
i['depth'] = response.meta['depth']
if i['text'][0]:
if re.match('^#.*\.txt$', i['text'][0]) is not None:
i['file_urls'] = [ response.urljoin(i['href'][0]) ]
yield i
pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline, FSFilesStore
import json
import re
class MySpiderPipeline(object):
def __init__(self):
self.file = open('items.json', 'w')
def process_item(self, item, spider):
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing {0}!".format(data))
if re.match('^#.*\.html$', item['text'][0]) is not None:
valid = False
raise DropItem("HTML File")
if re.match('^#.*\.txt$', item['text'][0]) is not None:
pass
line = json.dumps(dict(item)) + "\n"
self.file.write(line)
return item
class MySpiderFilesPipeline(FilesPipeline):
_url_breakstring = "example.com/"
def get_media_requests(self, item, info):
return [Request(x) for x in item.get(self.files_urls_field, [])]
def file_path(self, request, response=None, info=None):
return request.url.split(self._url_breakstring)[1]
# media_guid = hashlib.sha1(to_bytes(url)).hexdigest() # change to request.url after deprecation
# media_ext = os.path.splitext(url)[1] # change to request.url after deprecation
# return 'full/%s%s' % (media_guid, media_ext)
For the pipeline class, add an __init__ method, for example:
class GCSFilePipeline(ImagesPipeline):
def __init__(self, store_uri, download_func=None, settings=None):
super().__init__(store_uri, settings=settings, download_func=download_func)
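Putting that together, here is a minimal sketch of how the whole subclass could look on a recent Scrapy, where FilesPipeline lives in scrapy.pipelines.files and file_path accepts an item keyword argument (the example.com break string is taken from the question):
from scrapy.pipelines.files import FilesPipeline

class MySpiderFilesPipeline(FilesPipeline):
    _url_breakstring = "example.com/"

    def __init__(self, store_uri, download_func=None, settings=None):
        # forward the arguments Scrapy passes when it instantiates the pipeline
        super().__init__(store_uri, settings=settings, download_func=download_func)

    def file_path(self, request, response=None, info=None, *, item=None):
        # mirror the URL structure after the break string as the local path
        return request.url.split(self._url_breakstring)[1]
Also double-check that FILES_STORE is set in settings.py; without it the files pipeline is not enabled at all, which can look exactly like the spider running cleanly but downloading nothing.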
Related
I have a problem with my script: the same file name and PDF are downloaded over and over. I have checked the output of my results without the file download and I get unique data; it's only when I use the pipeline that it somehow produces duplicates for download.
Here's my script:
import scrapy
from environment.items import fcpItem
class fscSpider(scrapy.Spider):
name = 'fsc'
start_urls = ['https://fsc.org/en/members']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() >1]")
loader = fcpItem()
names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
pdf = [response.urljoin(x) for x in url if x != '#']
names = [x.split(' ')[0] for x in names_add]
for nm, pd in zip(names, pdf):
loader['names'] = nm
loader['pdfs'] = [pd]
yield loader
items.py
class fcpItem(scrapy.Item):
names = Field()
pdfs = Field()
results = Field()
pipelines.py
class DownfilesPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None, item=None):
items = item['names']+'.pdf'
return items
settings.py
from pathlib import Path
import os
BASE_DIR = Path(__file__).resolve().parent.parent
FILES_STORE = os.path.join(BASE_DIR, 'fsc')
ROBOTSTXT_OBEY = False
FILES_URLS_FIELD = 'pdfs'
FILES_RESULT_FIELD = 'results'
ITEM_PIPELINES = {
'environment.pipelines.pipelines.DownfilesPipeline': 150
}
I am using CSS selectors instead of XPath.
Looking at the Chrome debug panel, the div.field__item.resource-item element is the root of each item in the PDF list.
Under that div sit the title of the PDF and the a tag with the file download URL.
Because the title and the link are separated by a couple of child/sibling levels, XPath is not a clean method here; CSS is much better because it can pick descendants straight from the root without spelling out the relationship path, and it avoids the problem of keeping a URL array and a title array synchronized by index.
The other key points are that the URL path needs decoding, and that file_urls must be a list even when there is only a single item.
fsc_spider.py
import scrapy
import urllib.parse
from quotes.items import fcpItem
class fscSpider(scrapy.Spider):
name = 'fsc'
start_urls = [
'https://fsc.org/en/members',
]
def parse(self, response):
for book in response.css('div.field__item.resource-item'):
url = urllib.parse.unquote(book.css('div.actions a::attr(href)').get(), encoding='utf-8', errors='replace')
url_left = url[0:url.rfind('/')]+'/'
title = book.css('span.media-caption.file-caption::text').get()
item = fcpItem()
item['original_file_name'] = title.replace(' ','_')
item['file_urls'] = ['https://fsc.org'+url_left+title.replace(' ','%20')]
yield item
items.py
import scrapy
class fcpItem(scrapy.Item):
file_urls = scrapy.Field()
files = scrapy.Field()
original_file_name = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.files import FilesPipeline
class fscPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None):
file_name: str = request.url.split("/")[-1].replace('%20','_')
return file_name
settings.py
BOT_NAME = 'quotes'
FILES_STORE = 'downloads'
SPIDER_MODULES = ['quotes.spiders']
NEWSPIDER_MODULE = 'quotes.spiders'
FEED_EXPORT_ENCODING = 'utf-8'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = { 'quotes.pipelines.fscPipeline': 1}
file structure
execution
quotes>scrapy crawl fsc
result
The problem is that you are overwriting the same scrapy item every iteration.
What you need to do is create a new item for each time your parse method yields. I have tested this and confirmed that it does produce the results you desire.
I added an inline note in my example below on the line that needs to be changed.
For example:
import scrapy
from environment.items import fcpItem
class fscSpider(scrapy.Spider):
name = 'fsc'
start_urls = ['https://fsc.org/en/members']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() >1]")
names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
pdf = [response.urljoin(x) for x in url if x != '#']
names = [x.split(' ')[0] for x in names_add]
for nm, pd in zip(names, pdf):
loader = fcpItem() # Here you create a new item each iteration
loader['names'] = nm
loader['pdfs'] = [pd]
yield loader
My project uses SerpAPI to generate a list of sites, scrapes them for any about/contact pages, and then scrapes the emails from those pages.
It had been working completely fine until I decided to pickle the list of urls generated, and then load the list into my spider.
My main.py:
# Search google using SerpAPI
search = GoogleSearch({"q": input("What are you searching? "), "location": input("Where is the location? "),
"api_key": input("What is your API key? "), "output": "html",
"num": "200", "gl": "us"})
# Filter html response for links
results = search.get_dict()
organic_results = results['organic_results']
links = []
for result in organic_results:
links.append(str(result['link']))
# Filter links to remove unwanted sites
to_remove = [
'wikipedia', 'yelp', 'google', 'britannica', 'tripadvisor', 'amazon', 'ebay', 'craigslist', 'apple',
'microsoft', 'homeadvisor', 'bing', 'businessinsider'
]
links = [i for i in links if not re.search("|".join(to_remove), i)]
links = list(set(links))  # de-duplicate
# Pickle lists and dump into separate txt files
base_path = Path(__file__).parent
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'wb') as fp:
pickle.dump(links, fp)
# process = CrawlerProcess(get_project_settings())
#
# process.crawl(EmailSpider)
#
# process.start()
Spider:
import pickle
import re
import tldextract
from pathlib import Path
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import EmailscrapeItem
class EmailSpider(CrawlSpider):
name = 'email'
start_urls = []
allowed_domains = []
base_path = Path(__file__).parents[2]
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'rb') as fp:
for i in pickle.load(fp):
start_urls.append(i)
for url in start_urls:
extracted_domain = tldextract.extract(url)
domain = "{}.{}".format(extracted_domain.domain, extracted_domain.suffix)
allowed_domains.append(domain)
rules = [
Rule(LinkExtractor(allow=r'contact/'), callback='parse'),
Rule(LinkExtractor(allow=r'contact-us/'), callback='parse'),
Rule(LinkExtractor(allow=r'about'), callback='parse'),
Rule(LinkExtractor(allow=r'about-us'), callback='parse')
]
def parse(self, response, **kwargs):
items = EmailscrapeItem()
regex = re.compile(
r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
)
# extract emails with mailto:: attachment
for res in response.xpath("//a[starts-with(@href, 'mailto')]/text()"):
items['email'] = res.get()
yield items
# extract emails using regex
html = str(response.text)
mail_list = re.findall(regex, html)
for mail in mail_list:
items['email'] = mail
yield items
And pipelines:
import re
from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem
class EmailscrapePipeline(object):
def __init__(self):
self.exporter = None
self.email_list = set()
self.file = {}
@classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
self.file = open('emails.csv', 'w+b')
self.exporter = CsvItemExporter(self.file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
regex = re.compile(
r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
)
if not item['email']:
raise DropItem("Item is None or empty")
if not re.search(regex, str(item['email'])):
raise DropItem("Item is not an email.")
if item['email'] in self.email_list:
raise DropItem("Duplicate item email found: %s" % item)
else:
self.email_list.add(item['email'])
return item
No errors appear when I run the spider from the command line, and "most" sites return a DEBUG (200).
I've reduced the download timeout to 15 seconds, so I'm not sure why it freezes.
If anyone could point me in a good direction, that'd be great.
My spider looks like this
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def parse_item(self, response):
item = ProjectName()
css_1 = 'css1::text'
item['1'] = response.css(css_1).extract()
item = ProjectName()
css_2 = 'css2::text'
item['2'] = response.css(css_2).extract()
return item
and my pipeline like this:
from scrapy.exceptions import DropItem
class RemoveIncompletePipeline(object):
def process_item(self, item, spider):
if item['1']:
return item
else:
raise DropItem("Missing content in %s" % item)
Everything works fine: when the value for field 1 is missing, the corresponding item is taken out of the output.
But when I change start_urls, in order to do the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
Then the output contains items with a missing value for field 1.
What's going on, and how can I avoid that?
For the record queries.txt looks like that:
webaddress/query1
webaddress/query2
According to the docs you should override the start_requests method.
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the make_requests_from_url() is used instead to create the
Requests. This method is also called only once from Scrapy, so it’s
safe to implement it as a generator.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def start_requests(self):
return [Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
def parse_item(self, response):
item = ProjectName()
css_1 = 'css1::text'
item['1'] = response.css(css_1).extract()
item = ProjectName()
css_2 = 'css2::text'
item['2'] = response.css(css_2).extract()
return item
UPD:
Just put this code into your spider class
def start_requests(self):
return [Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
UPD:
You have a logic error in your parse_item method. You need to fix it.
def parse_item(self, response):
for job in response.css('div.card-top'):
item = ProjectName()
# just a quick example.
item['city'] = job.xpath('string(//span[@class="serp-location"])').extract()[0].replace(' ', '').replace('\n', '')
# TODO: you should fill the other item fields
# ...
yield item
This is my program to download images through the image pipeline. It works well and downloads images, but the problem is that it renames the images with a SHA1 hash, after which I am unable to identify them. Is there any solution so that I can use the model_name as the name of the images to be downloaded?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
class CompItem(scrapy.Item):
model_name = scrapy.Field()
images = scrapy.Field()
image_urls = scrapy.Field()
image_name = scrapy.Field()
class criticspider(CrawlSpider):
name = "buysmaart_images"
allowed_domains = ["http://buysmaart.com/"]
start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4", "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye", "http://buysmaart.com/productdetails/506/OPPO-N1", "http://buysmaart.com/productdetails/342/LG-G2-D802T"]
def __init__(self, *args, **kwargs):
super(criticspider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Firefox()
self.browser.implicitly_wait(2)
def parse_start_url(self, response):
self.browser.get(response.url)
time.sleep(8)
sel = Selector(text=self.browser.page_source)
item = CompItem()
photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
print len(photos)
all_photo_urls = []
for photo in photos:
item['image_name'] = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()[0].encode('ascii','ignore')
#tmp_url = photo.xpath('.//img/@src').extract()[0].encode('ascii','ignore')
image_url = photo.xpath('.//img/@src').extract()[0]
all_photo_urls.append(image_url)
item['image_urls'] = all_photo_urls
yield item
pipeline
from scrapy.contrib.pipeline.images import DownloadImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DownloadImagesPipeline(object):
def process_item(self, item, spider):
def get_media_requests(self, item, info):
return [Request(x, meta={'image_names': item["image_name"]})
for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
if re.compile('^[0-9,a-f]+.jpg$').match(key):
key = self.change_filename(key, response)
yield key, image, buf
def change_filename(self, key, response):
return "%s.jpg" % response.meta['image_name'][0]
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
settings
BOT_NAME = 'download_images'
SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE= '/home/john/Desktop/download_images/31_jul'
Scrapy 1.3.3 solution (override the image_downloaded method):
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url, meta={'image_names': item["image_names"]})
def image_downloaded(self, response, request, info):
checksum = None
for path, image, buf in self.get_images(response, request, info):
if checksum is None:
buf.seek(0)
checksum = md5sum(buf)
width, height = image.size
path = 'full/%s' % response.meta['image_names'][0]  # changed here: use the name from meta instead of the checksum
self.store.persist_file(
path, buf, info,
meta={'width': width, 'height': height},
headers={'Content-Type': 'image/jpeg'})
return checksum
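If you use a custom pipeline like this, it also has to be registered in settings.py in place of the stock ImagesPipeline. The module path below is an assumption based on the project name used in the question, and IMAGES_STORE is copied from the question's settings:
ITEM_PIPELINES = {'download_images.pipelines.MyImagesPipeline': 1}
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'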
The solution is to override the image_key method of your DownloadImagesPipeline class.
def image_key(self, url):
return 'image_name.here'
For example if you want the image name of the URL you can use
url.split('/')[-1]
as the name of the image.
Note that this method is deprecated and can be removed in a future release.
Alternatively you can set the image_name for your image in your Spider:
item['image_name'] = ['whatever_you_want']
In this case you have to extend your pipeline a bit more to utilize the name of the image you provided:
def get_media_requests(self, item, info):
return [Request(x, meta={'image_names': item["image_name"]})
for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
if re.compile('^[0-9,a-f]+.jpg$').match(key):
key = self.change_filename(key, response)
yield key, image, buf
def change_filename(self, key, response):
return "%s.jpg" % response.meta['image_name'][0]
And of course your pipeline should extend ImagesPipeline.
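Tied together, the pieces above could look roughly like this. It is only a sketch against the old scrapy.contrib API used in the question, and it assumes item['image_name'] is set as a one-element list as shown earlier:
import re
from scrapy.http import Request
from scrapy.contrib.pipeline.images import ImagesPipeline

class DownloadImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # carry the desired file name along with each image request
        return [Request(x, meta={'image_names': item['image_name']})
                for x in item.get('image_urls', [])]

    def get_images(self, response, request, info):
        # swap the SHA1-based key for the name carried in the request meta
        for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
            if re.match(r'^[0-9a-f]+\.jpg$', key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        return "%s.jpg" % response.meta['image_names'][0]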
This answer covers custom image names as well as the custom-named folders such images are saved to.
#spider.py
import scrapy
from ..items import DusharaItem
class DusharaSpider(scrapy.Spider):
name='dushara'
start_urls=['https://www.indiaglitz.com/dushara-photos-tamil-actress-3129970-8771']
def parse(self,response):
selector = response.xpath('//div[@class="gallmain gallerycontainer-8771"]/div[@class="gallery_detail gal-8771"]')
for sel in selector:
item = DusharaItem()
item['image_urls'] = sel.xpath('./img/@src').extract_first()
#item['image_urls'] = [sel.xpath('./img/@src').extract_first()] # for default scraping process
item['folder_names_1'] = 'Actress'
item['folder_names_2'] = 'Tamil'
item['image_names'] = sel.xpath('./img/@src').extract_first().split('/')[-1] # it should contain the image extension, e.g. .jpg
yield item
#items.py
import scrapy
class DusharaItem(scrapy.Item):
image_urls = scrapy.Field()
images = scrapy.Field()
folder_names_1 = scrapy.Field()
folder_names_2 = scrapy.Field()
image_names = scrapy.Field()
#pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class DusharaPipeline(ImagesPipeline):
def get_media_requests(self, item,info):
url = item['image_urls']
folder_names_1 = item['folder_names_1']
folder_names_2 = item['folder_names_2']
image_names = item['image_names']
yield scrapy.Request(url=url, meta={'folder_names_1': folder_names_1, 'folder_names_2': folder_names_2, 'image_names': image_names})
def file_path(self, request, response=None, info=None, *, item=None):
folder_names_1 = request.meta['folder_names_1']
folder_names_2 = request.meta['folder_names_2']
image_names = request.meta['image_names']
return '/'+folder_names_1+'/'+folder_names_2+'/'+image_names
#settings.py
ITEM_PIPELINES = {'dushara.pipelines.DusharaPipeline': 300}
#ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1} # for default scraping process
IMAGES_STORE = r'D:\Scraped'
class AljazeeraSpider(XMLFeedSpider):
name = "aljazeera"
allowed_domains = ["aljazeera.com"]
start_urls = [
'http://www.aljazeera.com/',
]
def parse(self, response):
hxs = HtmlXPathSelector(response) # The xPath selector
titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
if not titles:
MailNotify().send_mail("Aljazeera", "Scraper Report")
items = []
for titles in titles:
item = NewsItem()
item['title'] = escape(''.join(titles.select('a/text()').extract()))
item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/#href').extract()))
item['description'] = ''
item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
items.append(item)
return items
def parse_detail(self, response):
item = response.meta['item']
sel = HtmlXPathSelector(response)
detail = sel.select('//td[@class = "DetailedSummary"]')
item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
item['location'] = ''
published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
item['published_date'] = escape(''.join(published_date.select('text()').extract()))
return item
I am currently working with Scrapy to crawl the website, and I have some knowledge of unittest in Python. But how can I write a unit test to check that the link is working and that item['location'] and item['details'] return values? I have looked at Scrapy contracts but cannot understand them, so how can I write the unit test in this case?
If we are talking specifically about how to test the spiders (not pipelines, or loaders), then what we did is provided a "fake response" from a local HTML file. Sample code:
import os
from scrapy.http import Request, TextResponse
def fake_response(file_name=None, url=None):
"""Create a Scrapy fake HTTP response from a HTML file"""
if not url:
url = 'http://www.example.com'
request = Request(url=url)
if file_name:
if not file_name[0] == '/':
responses_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(responses_dir, file_name)
else:
file_path = file_name
file_content = open(file_path, 'r').read()
else:
file_content = ''
response = TextResponse(url=url, request=request, body=file_content,
encoding='utf-8')
return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase
class MyTestCase(TestCase):
def setUp(self):
self.spider = MySpider()
def test_parse(self):
response = fake_response('input.html')
item = self.spider.parse(response)
self.assertEqual(item['title'], 'My Title')
# ...
Aside from that, you should definitely start using Item Loaders with input and output processors; this would help to achieve better modularity and, hence, isolation: the spider would just yield item instances, while data preparation and modification would be encapsulated inside the loader, which you would test separately.
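For example, a loader for the NewsItem fields used above might look roughly like this (illustrative only; on current Scrapy the processors come from the itemloaders package, on older versions from scrapy.loader.processors):
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst, Join

class NewsItemLoader(ItemLoader):
    # keep only the first extracted value for each field by default
    default_output_processor = TakeFirst()
    # strip whitespace from every title fragment on the way in
    title_in = MapCompose(str.strip)
    # merge description fragments into a single string on the way out
    description_out = Join()

# In the spider, extraction then becomes declarative and the loader
# can be unit-tested on its own:
#   loader = NewsItemLoader(item=NewsItem(), response=response)
#   loader.add_xpath('title', '//div[contains(@class, "skyscLines")]/a/text()')
#   item = loader.load_item()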