Why does my Scrapy spider only scrape some of my data?

I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. After I run the code below, I export it to CSV via the command line with "scrapy crawl svu -o svu.csv".
The code below successfully pulls episode information, but the CSV does not contain the cast list. How do I fix the code to extract and export both the episode information and the cast list?
My thoughts & attempts:
I believe that the cast list is extracted, because it is visible in the terminal when the spider runs, so it may be an export issue.
If I comment out my first yield statement (episode information), the cast list is successfully exported. This makes me think it isn't just an export issue.
Thanks for the help!
import scrapy

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        }

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            yield {
                'actor': response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                'character': response.xpath("//td[@class='character']/a/text()").extract()
            }

I made some changes to your code. In addition, I show you how to use Items and Pipelines so that the episode data and the cast data are written to separate CSV files. That separation matters because the CSV exporter builds its columns from the first item it sees, so when episode dicts and cast dicts are mixed in one export, the cast fields never make it into svu.csv.
spiders/svu.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            character = response.xpath("//td[@class='character']/a/text()").extract()
            character.extend(response.xpath("//td[@class='character']/text()").extract())
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            item = CastItem(
                actor=response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                character=character
            )
            yield item
items.py
from scrapy import Item, Field

class EpisodeItem(Item):
    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()

class CastItem(Item):
    actor = Field()
    character = Field()
pipelines.py
from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One CSV file and one exporter per item type
        item_names = ['episode', 'cast']
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        # Route each item to the exporter that matches its type
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)
        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)
        return item
Add to settings file:
ITEM_PIPELINES = {
    'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}
Be careful: you need to replace PROJECT_NAME with the name of your own project.
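With the pipeline enabled you can run the spider with plain "scrapy crawl svu" (no -o flag needed), since the pipeline itself writes episode.csv and cast.csv. As a quick sanity check after the crawl, a small standalone script (a minimal sketch, assuming the file names used above) can confirm that both exports contain data:

# Quick check of the two CSV files written by IMDBPipeline.
# Assumes the default names used above: episode.csv and cast.csv.
import csv

for name in ('episode.csv', 'cast.csv'):
    with open(name, newline='') as f:
        rows = list(csv.DictReader(f))
    columns = list(rows[0].keys()) if rows else []
    print('%s: %d rows, columns: %s' % (name, len(rows), columns))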

Related

Scrapy yield only last data and merge scrapy data into one

I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated through the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code
import scrapy

class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy

from ..items import ScrapedItem

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")

            items['title'] = title
            items['source'] = source
            items['date'] = date

            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the date, title, and source to be updated through the loop,
and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary inside the loop and put those variables within it, so every request carries its own copy instead of all of them sharing (and overwriting) a single item. Moreover, I've made some minor changes to your XPath and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
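The paragraph field above is still a list of text fragments. Since the goal was to store a single string in MySQL, a minimal tweak to parseparagraph (same selectors, only the join added) could look like this:

def parseparagraph(self, response):
    items_old = response.meta['item']
    # Join the individual <p> text fragments into one string before storing it
    items_old['paragraph'] = " ".join(
        p.strip() for p in response.xpath("//p/text()").getall() if p.strip()
    )
    yield items_old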

Is it possible to run pipelines and crawl multiple URL at the same time in scrapy?

My spider looks like this
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

from ProjectName.items import ProjectName

class SpidernameSpider(CrawlSpider):
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']

    rules = (
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def parse_item(self, response):
        item = ProjectName()
        css_1 = 'css1::text'
        item['1'] = response.css(css_1).extract()

        item = ProjectName()
        css_2 = 'css2::text'
        item['2'] = response.css(css_2).extract()

        return item
and my pipeline like this:
from scrapy.exceptions import DropItem

class RemoveIncompletePipeline(object):
    def reminc_item(self, item, spider):
        if item['1']:
            return item
        else:
            raise DropItem("Missing content in %s" % item)
Everything works fine: when the value for field 1 is missing, the corresponding item is dropped from the output.
But when I change start_urls in order to run the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
Then the output contains items with a missing value for field 1.
What's going on? And how can I avoid that?
For the record, queries.txt looks like this:
webaddress/query1
webaddress/query2
According to the docs, you should override the start_requests method:
This method must return an iterable with the first Requests to crawl for this spider.
This is the method called by Scrapy when the spider is opened for scraping when no particular URLs are specified. If particular URLs are specified, the make_requests_from_url() is used instead to create the Requests. This method is also called only once from Scrapy, so it's safe to implement it as a generator.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

from ProjectName.items import ProjectName

class SpidernameSpider(CrawlSpider):
    name = 'spidername'
    allowed_domains = ['webaddress']
    start_urls = ['webaddress/query1']

    rules = (
        Rule(LinkExtractor(restrict_css='horizontal css')),
        Rule(LinkExtractor(restrict_css='vertical css'),
             callback='parse_item')
    )

    def start_requests(self):
        return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]

    def parse_item(self, response):
        item = ProjectName()
        css_1 = 'css1::text'
        item['1'] = response.css(css_1).extract()

        item = ProjectName()
        css_2 = 'css2::text'
        item['2'] = response.css(css_2).extract()

        return item
UPD:
Just put this code into your spider class
def start_requests(self):
    return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
UPD:
You have wrong logic in your parse_item method: item = ProjectName() is called twice, so the second assignment throws away the item that already holds field 1. You need to fix it, for example:

def parse_item(self, response):
    for job in response.css('div.card-top'):
        item = ProjectName()
        # just a quick example
        item['city'] = job.xpath('string(//span[@class="serp-location"])').extract()[0].replace(' ', '').replace('\n', '')
        # TODO: you should fill the other item fields
        # ...
        yield item
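As a side note, the one-line start_requests above never closes queries.txt; a slightly safer sketch (same behavior otherwise, just explicit file handling and skipping blank lines) would be:

def start_requests(self):
    # Read the query URLs once, close the file, and yield one Request per non-empty line
    with open('queries.txt') as f:
        for line in f:
            url = line.strip()
            if url:
                yield scrapy.Request(url, callback=self.parse_item)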

Limit how many elements Scrapy can collect

I am using Scrapy to collect some data. My Scrapy program collects 100 elements in one session. I need to limit it to 50 or any arbitrary number. How can I do that? Any solution is welcome. Thanks in advance.
# -*- coding: utf-8 -*-
import re

import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]

    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/ral/bab/" + item_id

            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])

            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what the CloseSpider extension and the CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes more than that amount of items and those items are passed by the item pipeline, the spider will be closed with the reason closespider_itemcount. If zero (or not set), spiders won't be closed by number of passed items.
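For example, assuming the limit of 50 mentioned in the question, a one-line sketch of how to apply it in settings.py (or in the spider's custom_settings, as the next answer shows):

# settings.py -- close the spider after roughly 50 items have passed the pipelines.
# Requests already in flight are still processed, so a few extra items may slip through.
CLOSESPIDER_ITEMCOUNT = 50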
I tried alecxe's answer, but I had to combine all three limits to make it work, so I'm leaving it here in case someone else has the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""

    name = "generic_website"

    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15
    }
    ...

Scrapy pipeline to export csv file in the right format

I made the improvement according to the suggestion from alexce below. What I need is shown in the picture below; however, each row/line should be one review: date, rating, review text, and link.
I need to let the item processor process each review of every page.
Currently, TakeFirst() only takes the first review of the page, so with 10 pages I only have 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy

from amazon.items import AmazonItem

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1,114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch, and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV file in a spreadsheet shows the data one review per row, as wanted.
Hope this helps :-)
import scrapy

class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
If using -t csv (as proposed by Frank in the comments) does not work for you for some reason, you can always use the built-in CsvItemExporter directly in a custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class AmazonPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose

class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)

        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')

        yield loader.load_item()
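For intuition, here is a tiny, self-contained illustration (not tied to the Amazon page, using the same processor imports as above) of what MapCompose and Join do to a scraped list of text fragments:

from scrapy.contrib.loader.processor import Join, MapCompose

# A scraped 'review' field usually arrives as a list of text fragments
fragments = ["Great product\n", "works as advertised."]

clean = MapCompose(lambda x: x.replace("\n", " "))(fragments)  # strip newlines from each fragment
print(Join()(clean))  # -> "Great product  works as advertised."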

Scrapy isn't extracting data

This is my Scrapy code. I want to scrape data from mouthshut.com, and the data I need sits inside strong tags. I am able to run it and the title field appears in the output, but it is blank. Why isn't it extracting any data?
import scrapy
from scrapy.selector import Selector

from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        items = []
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            items.append(item)
        return items
You should use a pipeline to extract data from your spider! Here is a sample that exports data to JSON files:
pipelines.py
# -*- coding: utf-8 -*-

# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os

# project import
from items import tgju
from pymongo import MongoClient

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

def get_items(module):
    # Return the names of all Item classes defined in the given items module
    md = module.__dict__
    return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))

class JsonPipeline(object):

    def __init__(self):
        self.files = dict()
        self.exporter = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One timestamped JSON file and exporter per item class
        for key in get_items(tgju):
            path = os.path.join('temp', key)
            if not os.path.exists(path):
                os.makedirs(path)

            self.files[key] = open(os.path.join(path,
                                                '%s_%s_%s.json' % (spider.name,
                                                                   key.lower(),
                                                                   datetime.now().strftime('%Y%m%dT%H%M%S'))),
                                   'w+b')
            self.exporter[key] = JsonItemExporter(self.files[key])
            self.exporter[key].start_exporting()

    def spider_closed(self, spider):
        for key in get_items(tgju):
            self.exporter[key].finish_exporting()
            self.files.pop(key).close()

    def process_item(self, item, spider):
        try:
            log.msg('-----------------%s------------------' % item.__class__.__name__)
            self.exporter[item.__class__.__name__].export_item(item)
        except KeyError:
            pass
        return item
Add this to your settings file:
ITEM_PIPELINES = {
    'pipelines.JsonPipeline': 800,
}
And try yielding each item instead of returning the list.
Update:
Also change your spider to this one...
import scrapy
from scrapy.selector import Selector

from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            yield item
def parse(self, response):
    # requires: from scrapy.selector import HtmlXPathSelector
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="reviewtitle fl"]')
    for site in sites:
        item = ShutItem()
        item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]/a/text()').extract()
        #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
        #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
        yield item

This works well:
2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
{'title': [u'Vodafone 3G - Useless in Bangalore',
u'Worst Mobile Operator Ever',
u'Worst 3g connectivity of vodafone in bangalore',
u'Pathetic Network 3G',
u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
u'Bad customer service',
u'Vodafone Kolkata \u2013 My worst ever experience.',
u'Network connectivity - permanent nemesis',
u'VODAFONE MOBILE OPERATOR',
u'Beware of Vodafone billing plans',
u'Vodafone changed my billing plan without my notice',
u'Pathetic service. They deduct balance unnecessari',
u'Worst service from Vodafone',
u'Forget Vodafone',
u'Vodafone Data Services sucks',
u'Outgoing calls has been barred',
u'Vodafone Sucks',
u'Worst Customer satisfaction I have ever Faced',
u'Untrained Customer Care... Seems like headline de',
u'3rd Party downloads - shameless way to make money!']}
Here is what you should know:
1. yield is much better than building up a list in Scrapy.
2. The li node is not the parent of the strong element (the working version selects div[@class="reviewtitle fl"] instead).
3. The value of the strong element's style attribute contains extra whitespace, so an exact match is fragile (see the sketch below).
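Regarding point 3, one way to make the selector less brittle is to match only a fragment of the style attribute rather than the exact, whitespace-sensitive value. A minimal sketch of the idea:

# Match on a stable fragment of the style attribute instead of the exact,
# whitespace-sensitive value used above.
titles = response.xpath(
    '//strong[contains(@style, "font-weight: 700")]/a/text()'
).extract()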
