Scrapy stops scraping yet continues to run - python

My project uses SerpAPI to generate a list of sites, scrapes them for any about/contact pages, and then scrapes the emails from those pages.
It had been working completely fine until I decided to pickle the generated list of URLs and then load that list into my spider.
My main.py:
import pickle
import re
from pathlib import Path

from serpapi import GoogleSearch

# Search google using SerpAPI
search = GoogleSearch({"q": input("What are you searching? "), "location": input("Where is the location? "),
                       "api_key": input("What is your API key? "), "output": "html",
                       "num": "200", "gl": "us"})
# Filter html response for links
results = search.get_dict()
organic_results = results['organic_results']
links = []
for result in organic_results:
    links.append(str(result['link']))
# Filter links to remove unwanted sites
to_remove = [
    'wikipedia', 'yelp', 'google', 'britannica', 'tripadvisor', 'amazon', 'ebay', 'craigslist', 'apple',
    'microsoft', 'homeadvisor', 'bing', 'businessinsider'
]
links = [i for i in links if not re.search("|".join(to_remove), i)]
links = list(set(links))  # deduplicate
# Pickle the list and dump it into a txt file
base_path = Path(__file__).parent
file_path = (base_path / "../sites1.txt").resolve()
with open(file_path, 'wb') as fp:
    pickle.dump(links, fp)
# process = CrawlerProcess(get_project_settings())
#
# process.crawl(EmailSpider)
#
# process.start()
Spider:
import pickle
import re
import tldextract
from pathlib import Path
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import EmailscrapeItem


class EmailSpider(CrawlSpider):
    name = 'email'
    start_urls = []
    allowed_domains = []

    base_path = Path(__file__).parents[2]
    file_path = (base_path / "../sites1.txt").resolve()
    with open(file_path, 'rb') as fp:
        for i in pickle.load(fp):
            start_urls.append(i)
    for url in start_urls:
        extracted_domain = tldextract.extract(url)
        domain = "{}.{}".format(extracted_domain.domain, extracted_domain.suffix)
        allowed_domains.append(domain)

    rules = [
        Rule(LinkExtractor(allow=r'contact/'), callback='parse'),
        Rule(LinkExtractor(allow=r'contact-us/'), callback='parse'),
        Rule(LinkExtractor(allow=r'about'), callback='parse'),
        Rule(LinkExtractor(allow=r'about-us'), callback='parse')
    ]

    def parse(self, response, **kwargs):
        items = EmailscrapeItem()
        regex = re.compile(
            r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        )
        # extract emails from mailto: links
        for res in response.xpath("//a[starts-with(@href, 'mailto')]/text()"):
            items['email'] = res.get()
            yield items
        # extract emails using regex
        html = str(response.text)
        mail_list = re.findall(regex, html)
        for mail in mail_list:
            items['email'] = mail
            yield items
And pipelines:
import re
from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.exceptions import DropItem


class EmailscrapePipeline(object):
    def __init__(self):
        self.exporter = None
        self.email_list = set()
        self.file = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('emails.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        regex = re.compile(
            r'([A-Za-z0-9]+[.-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+'
        )
        if not item['email']:
            raise DropItem("Item is None or empty")
        if not re.search(regex, str(item['email'])):
            raise DropItem("Item is not an email.")
        if item['email'] in self.email_list:
            raise DropItem("Duplicate item email found: %s" % item)
        else:
            self.email_list.add(item['email'])
        return item
No errors appear when I run the spider from the command line, and "most" sites return a DEBUG (200).
I've reduced the download timeout to 15 seconds, so I'm not sure why the crawl freezes. If anyone could point me in a good direction, that'd be great.
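For reference, a quick standalone check of what the spider will actually receive (a sketch; it assumes the same ../sites1.txt path that main.py writes and the tldextract logic used in the spider):

# Sketch: confirm the pickled list round-trips and the derived domains look right.
import pickle
from pathlib import Path

import tldextract

file_path = (Path(__file__).parent / "../sites1.txt").resolve()
with open(file_path, 'rb') as fp:
    urls = pickle.load(fp)

print(len(urls), "start URLs loaded")
for url in urls:
    ext = tldextract.extract(url)
    print(url, "->", "{}.{}".format(ext.domain, ext.suffix))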

Related

Scrapy to print results in real time rather than waiting for crawl to finish

Is it possible for Scrapy to print results in real time? I'm planning to crawl large sites and fear that if my VPN connection cuts off, the crawl effort will be wasted, since nothing will have been printed.
I'm currently using a VPN with rotating user agents. I know it's ideal to use rotating proxies instead of a VPN, but that will be a future upgrade to the script.
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

results = open('results.csv', 'w')


class TestSpider(CrawlSpider):
    name = "test"
    with open("domains.txt", "r") as d:
        allowed_domains = [url.strip() for url in d.readlines()]
    with open("urls.txt", "r") as f:
        start_urls = [url.strip() for url in f.readlines()]
        f.close()
    rules = (Rule(LinkExtractor(allow=('/'), deny=('9', '10')), follow=True, callback='parse_item'),)

    def parse_item(self, response):
        for pattern in ['Albert Einstein', 'Bob Marley']:
            result = re.findall(pattern, response.text)
            print(response.url, ">", pattern, '>', len(result), file=results)
Many thanks in advance.
Updates
The script from harada works perfectly without any changes at all, apart from the save file. All I needed to do was make the modifications below to my current files in order for everything to work.
spider - defined items
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import TestItem


class TestSpider(CrawlSpider):
    name = "test"
    with open("domains.txt", "r") as d:
        allowed_domains = [url.strip() for url in d.readlines()]
    with open("urls.txt", "r") as f:
        start_urls = [url.strip() for url in f.readlines()]
        f.close()
    rules = (Rule(LinkExtractor(allow=('/'), deny=('9', '10')), follow=True, callback='parse_item'),)

    def parse_item(self, response):
        items = TestItem()
        for pattern in ['Albert Einstein', 'Bob Marley']:
            result = re.findall(pattern, response.text)
            url = response.url
            count = len(result)
            items['url'] = url
            items['pattern'] = pattern
            items['count'] = count
            yield items
items.py - added items as fields
import scrapy


class TestItem(scrapy.Item):
    url = scrapy.Field()
    pattern = scrapy.Field()
    count = scrapy.Field()
settings.py - uncommented ITEM_PIPELINES
ITEM_PIPELINES = {
    'test.pipelines.TestPipeline': 300,
}
You can add logic to your pipeline that saves the data collected so far to a file. Add a counter to the pipeline as a variable, and when it reaches a certain threshold (say, every 1000 items yielded), write to a file. The code would look something like this; I tried to make it as general as possible.
class MyPipeline:
    def __init__(self):
        # variable that keeps track of the total number of items yielded
        self.total_count = 0
        self.data = []

    def process_item(self, item, spider):
        self.data.append(item)
        self.total_count += 1
        if self.total_count % 1000 == 0:
            # write to your file of choice....
            # I'm not sure how your data is stored throughout the crawling process.
            # If it's a variable of the pipeline like self.data,
            # then just write that to the file.
            with open("test.txt", "w") as myfile:
                myfile.write(f'{self.data}')
        return item
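A small variant of the same idea that appends each batch to results.csv instead of rewriting self.data every time (a sketch; the class name and batch size are illustrative, and it assumes items carry the url, pattern and count fields from TestItem above):

import csv


class BatchCsvPipeline:
    """Sketch: flush items to results.csv in batches so partial results survive a dropped connection."""

    def __init__(self):
        self.buffer = []

    def process_item(self, item, spider):
        self.buffer.append(item)
        if len(self.buffer) >= 1000:
            self._flush()
        return item

    def close_spider(self, spider):
        # write whatever is left when the crawl ends
        self._flush()

    def _flush(self):
        if not self.buffer:
            return
        with open('results.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            for item in self.buffer:
                writer.writerow([item['url'], item['pattern'], item['count']])
        self.buffer = []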

Scrapy MultiCSVItemPipeline exports some empty items

I have multiple spiders with different items and I want to export each item type into a different csv file. I used the code example from How can scrapy export items to separate csv files per item, but there is a problem.
Right now my spider only writes the "page" item. All items are filled in the shell, but the files stay empty. I debugged the pipeline, but I haven't found an error so far.
Here is my spider:
import csv
import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule
from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'
    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set starting point for the spider and start crawling from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)

    def parse(self, response):
        """
        Parse for Links Page Body. Follow allowed Domains by adding them to the request. Parse the current page with
        callback and the method parse_page.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response:
        :return:
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()
        # check if page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item

    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse HTML Page with table and save to csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create Filename from title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract each table header and cell and extract the text to a line for each row
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for row in row.findAll('td'):
                        if ' ' in row.text:
                            line.append('')
                        else:
                            line.append(row.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass
SuperSpider:
import urlparse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URL with response
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))
        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # extract response body
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that contain any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all files to lokaloItem
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()
        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)
        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem
ScrapingItems:
from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()
And the Multi CSV Pipeline:
class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
I haven't found a solution to this yet, but I have tried several things that failed:
yielding a list of items, which doesn't work with Scrapy;
yielding only one item and creating two parse methods for page_item and trier_item;
deleting all SaveTypes except 'trierde', after which the spider didn't write anything.
Based on the options I tried, I believe there is some error in the pipeline itself...
I appreciate any help anybody can offer.
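For debugging, one small change to process_item shows which item types actually reach the pipeline, since anything whose class name is not in SaveTypes is skipped silently (a sketch of a logging variant, not a fix):

import logging


class LoggingMultiCSVItemPipeline(MultiCSVItemPipeline):
    # Sketch: same export logic as above, but log any item type that would be skipped
    def process_item(self, item, spider):
        what = item_type(item)
        if what not in set(self.SaveTypes):
            # items landing here are never exported, which would leave their csv files empty
            logging.warning("Unexported item type %r from spider %s", what, spider.name)
            return item
        self.exporters[what].export_item(item)
        return item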
Additional Info:
Before changing my pipeline to MultiCSV I was able to save each item to csv.
Since I wasn't able to fix the problem with the Scrapy exporter, I decided to create my own.
Here's the code for everyone who wants to export multiple different items to different csv files in one or more spiders. It has worked for me so far, but I'm still checking the code for errors. Feel free to reply if you have some ideas for improvement.
class MultiCSVItemPipeline(object):
    # Subfolder path, where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List for already checked csv headers
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check if the csv files exist and create new ones if not
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        # not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check if the csv file contains a header, but only for files that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item to a row in csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the item value is not a list, join the element to a string, remove delimiters and encode as utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains header elements and create if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If file is empty, create new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read first row and check header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are set in the csv file, print warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
            return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write header of a csv file.
        Header is written from each key in the scrapy item
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each key to a string, remove delimiters and encode as utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, if it's contained in the first line of the csv.
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
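For completeness, the pipeline also has to be enabled in settings.py. Assuming it lives in a pipelines.py module inside the DataSpiders package used in the imports above (an assumption about the project layout), the entry would look like:

# settings.py -- the module path is an assumption based on the DataSpiders package above
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}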

Scrapy File Downloads with Custom Names - Subclass Inheritance Issue

I'm building a simple spider to crawl a structured site and download *.txt files. I've managed to get everything working except for a custom FilesPipeline class.
My goal is to download the *.txt files into directories that mirror their URL location. I can achieve this if I edit the Scrapy class directly (shown below).
files.py -> FilesPipeline::file_path()
...
# return 'full/%s%s' % (media_guid, media_ext)
return url.split('example.com/')[1]
I want to override the class properly in my own pipeline, but I haven't been successful. I'm not sure what I should be doing differently. The spider runs with no warnings or errors but won't download files.
settings.py
ITEM_PIPELINES = {
    'myspider.pipelines.MySpiderFilesPipeline': 1,
    'myspider.pipelines.MySpiderPipeline': 300,
}
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from myspider.items import MySpiderItem


class SpideySpider(CrawlSpider):
    name = 'spidey'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']
    rules = (
        Rule(LinkExtractor(allow='', restrict_xpaths='//tr/td/a', deny_extensions='html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        links = response.xpath('//tr/td/a')
        for link in links:
            i = MySpiderItem()
            i['title'] = response.xpath('//title/text()').extract()
            i['href'] = link.xpath('@href').extract()
            i['text'] = link.xpath('text()').extract()
            i["current_url"] = response.url
            referring_url = response.request.headers.get('Referer', None)
            i['referring_url'] = referring_url
            i['depth'] = response.meta['depth']
            if i['text'][0]:
                if re.match('^#.*\.txt$', i['text'][0]) is not None:
                    i['file_urls'] = [response.urljoin(i['href'][0])]
            yield i
pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline, FSFilesStore
import json
import re


class MySpiderPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if re.match('^#.*\.html$', item['text'][0]) is not None:
            valid = False
            raise DropItem("HTML File")
        if re.match('^#.*\.txt$', item['text'][0]) is not None:
            pass
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


class MySpiderFilesPipeline(FilesPipeline):
    _url_breakstring = "example.com/"

    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        return url.split(_url_breakstring)[1]
        # media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        # media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        # return 'full/%s%s' % (media_guid, media_ext)
For the pipeline class, add an __init__ method, for example:
class GCSFilePipeline(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, settings=settings, download_func=download_func)
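Putting that together with the path logic from the question, a complete subclass might look like the sketch below. It keeps the split-on-"example.com/" idea from the question, uses request.url in place of the undefined url variable, and imports FilesPipeline from the modern scrapy.pipelines.files path (older versions expose it as scrapy.contrib.pipeline.files, as in the question):

from scrapy.pipelines.files import FilesPipeline


class MySpiderFilesPipeline(FilesPipeline):
    """Sketch: store each downloaded file under the part of its URL that follows example.com/."""
    _url_breakstring = "example.com/"

    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, settings=settings, download_func=download_func)

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # e.g. http://example.com/docs/notes.txt -> docs/notes.txt
        return request.url.split(self._url_breakstring)[1]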

How can I start writing unit tests for a Scrapy spider in Python?

class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # The xPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")
        items = []
        for titles in titles:
            item = NewsItem()
            item['title'] = escape(''.join(titles.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/@href').extract()))
            item['description'] = ''
            item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(item)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl the website, and I have some knowledge of unittest in Python. But how can I write unit tests to check that the link is working and that item['location'] and item['details'] are returning values? I have looked at Scrapy contracts but cannot make sense of them. So, how can I write unit tests in this case?
If we are talking specifically about how to test the spiders (not pipelines or loaders), then what we did is provide a "fake response" from a local HTML file. Sample code:
import os
from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a Scrapy fake HTTP response from a HTML file"""
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        file_content = open(file_path, 'r').read()
    else:
        file_content = ''
    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase


class MyTestCase(TestCase):
    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
Aside from that, you should definitely start using Item Loaders with input and output processors. This helps achieve better modularity and, hence, isolation: the spider would just yield item instances, while data preparation and modification would be encapsulated inside the loader, which you would test separately.
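A minimal sketch of that idea, using the field names from the spider above (the processors chosen here and the itemloaders import path are assumptions; older Scrapy versions ship the processors as scrapy.loader.processors):

from itemloaders.processors import Join, MapCompose, TakeFirst
from scrapy.loader import ItemLoader


class NewsItemLoader(ItemLoader):
    # strip whitespace on the way in, collapse each field to a single value on the way out
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    details_out = Join()


# in the spider, parse_detail could then become:
def parse_detail(self, response):
    loader = NewsItemLoader(item=response.meta['item'], response=response)
    loader.add_xpath('details', '//td[@class="DetailedSummary"]/p//text()')
    loader.add_xpath('published_date', '//span[@id="ctl00_cphBody_lblDate"]/text()')
    return loader.load_item()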

Scrapy isn't extracting data

This is Scrapy code for scraping data from mouthshut.com, where the values I want are wrapped in a strong tag. I am able to run it and the title field comes back, but it is blank. Why isn't it extracting any data?
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem


class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        items = []
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            items.append(item)
        return items
You should use a pipeline to export the data from your spider! Here is a sample that exports data to JSON files:
pipelines.py
# -*- coding: utf-8 -*-
# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os

# project import
from items import tgju
from pymongo import MongoClient

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


def get_items(module):
    md = module.__dict__
    return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))


class JsonPipeline(object):
    def __init__(self):
        self.files = dict()
        self.exporter = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        for key in get_items(tgju):
            path = os.path.join('temp', key)
            if not os.path.exists(path):
                os.makedirs(path)
            self.files[key] = open(os.path.join(path,
                                                '%s_%s_%s.json' % (spider.name,
                                                                   key.lower(),
                                                                   datetime.now().strftime('%Y%m%dT%H%M%S'))),
                                   'w+b')
            self.exporter[key] = JsonItemExporter(self.files[key])
            self.exporter[key].start_exporting()

    def spider_closed(self, spider):
        for key in get_items(tgju):
            self.exporter[key].finish_exporting()
            self.files.pop(key).close()

    def process_item(self, item, spider):
        try:
            log.msg('-----------------%s------------------' % item.__class__.__name__)
            self.exporter[item.__class__.__name__].export_item(item)
        except KeyError:
            pass
        return item
Add this to your settings file:
ITEM_PIPELINES = {
    'pipelines.JsonPipeline': 800,
}
And try yielding each item instead of returning a list.
Update:
Also change your spider to this one...
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem


class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            yield item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="reviewtitle fl"]')
    for site in sites:
        item = ShutItem()
        item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]/a/text()').extract()
        #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
        #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
        yield item
This works well:
2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
{'title': [u'Vodafone 3G - Useless in Bangalore',
u'Worst Mobile Operator Ever',
u'Worst 3g connectivity of vodafone in bangalore',
u'Pathetic Network 3G',
u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
u'Bad customer service',
u'Vodafone Kolkata \u2013 My worst ever experience.',
u'Network connectivity - permanent nemesis',
u'VODAFONE MOBILE OPERATOR',
u'Beware of Vodafone billing plans',
u'Vodafone changed my billing plan without my notice',
u'Pathetic service. They deduct balance unnecessari',
u'Worst service from Vodafone',
u'Forget Vodafone',
u'Vodafone Data Services sucks',
u'Outgoing calls has been barred',
u'Vodafone Sucks',
u'Worst Customer satisfaction I have ever Faced',
u'Untrained Customer Care... Seems like headline de',
u'3rd Party downloads - shameless way to make money!']}
Here you should know:
1. yield is much better than building a list in Scrapy.
2. The li node is not the parent of the strong node.
3. The value of the strong node's style attribute contains extra whitespace.
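To make points 2 and 3 concrete, a sketch of a drop-in parse method that stays relative to each review block and does not depend on the exact whitespace inside the style attribute:

def parse(self, response):
    # select relative to each review block (note the leading dot) and match the
    # style attribute loosely instead of byte-for-byte
    for site in response.xpath('//div[@class="reviewtitle fl"]'):
        title = site.xpath('.//strong[contains(@style, "font-weight: 700")]/a/text()').get()
        yield {'title': title}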
