Saving data to separate csv files in scrapy - python

I made a scraper for yellow pages. There is a categories.txt file that is read by the script and then it generates links according to those categories:
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
categories = settings.get('CATEGORIES')
links = []
for category in categories:
    link = 'https://www.yellowpages.com/search?search_terms=' + category + '&geo_location_terms=NY&page=1'
    links.append(link)
This links list is then passed to start_urls:
class YpSpider(CrawlSpider):
    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    name = 'yp'
    allowed_domains = ['yellowpages.com']
    start_urls = links
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@class="business-name"]', allow=''), callback='parse_item',
             follow=True),
        Rule(LinkExtractor(restrict_xpaths='//a[@class="next ajax-page"]', allow=''),
             follow=True),
    )
It saves all the data from all the links in a csv file named parent.csv. This parent.csv file has a column named keyword, which helps in separating the data from different categories and making a separate csv file for each of them. This is implemented in the spider_closed function:
def spider_closed(self, spider):
    with open('parent.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            with open('{}.csv'.format(row[0]), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(row)
The problem I am facing is getting the category name corresponding to every link in my parse method, so that it can be saved in parent.csv and used to separate the different categories afterwards:
def parse_item(self, response):
    item = YellowItem()
    item['keyword'] = # here I need the corresponding category for every link

I think you should change the way you generate links. You can, for example, override the start_requests method and pass the category to the request through either its cb_kwargs or meta attribute.
I would also suggest that you change the implementation to get the settings from the crawler running the spider by overriding from_crawler.
Here's how I would do it:
class YpSpider(CrawlSpider):
    def __init__(self, crawler, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.crawler = crawler
        self.categories = crawler.settings.get('CATEGORIES')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_requests(self):
        for category in self.categories:
            yield Request(
                f'https://www.yellowpages.com/search?search_terms={category}&geo_location_terms=NY&page=1',
                self.parse,
                cb_kwargs={'category': category}
            )

    def parse(self, response, category):
        # do sth
        ...
        for url in response.xpath('//a[@class="business-name"]/@href').extract():
            yield Request(
                response.urljoin(url),  # listing links are relative, so join them with the base URL
                self.parse_item,
                cb_kwargs={'category': category}
            )
        next_page = response.xpath('//a[@class="next ajax-page"]/@href').extract_first()
        if next_page:  # the last page has no "next" link
            yield Request(
                response.urljoin(next_page),
                self.parse,
                cb_kwargs={'category': category}
            )

    def parse_item(self, response, category):
        item = YellowItem()
        item['keyword'] = category
        # do sth else
        ...
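If the end goal is simply one CSV per category, another option is to let a pipeline write the per-category files directly while the spider runs, instead of post-processing parent.csv in spider_closed. Below is a minimal sketch using Scrapy's built-in CsvItemExporter; it assumes the item carries the keyword field set in parse_item above, and the pipeline name and file handling are only illustrative:
from scrapy.exporters import CsvItemExporter

class PerCategoryCsvPipeline:
    # sketch: keep one exporter per category, keyed by item['keyword']
    def open_spider(self, spider):
        self.files = {}
        self.exporters = {}

    def close_spider(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        keyword = item['keyword']
        if keyword not in self.exporters:
            # CsvItemExporter expects a file opened in binary mode
            file = open('{}.csv'.format(keyword), 'wb')
            exporter = CsvItemExporter(file)
            exporter.start_exporting()
            self.files[keyword] = file
            self.exporters[keyword] = exporter
        self.exporters[keyword].export_item(item)
        return item
The pipeline still has to be enabled in ITEM_PIPELINES, like the pipeline examples further down.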

Related

Scrapy saving 200 status urls with empty items in a file

I notice in the scrapy log that some urls returned 200 status but contained no items. It seems to be a stability issue with the site, as re-crawling these urls 1-2 times yields items. I would like to save these urls in a separate file for re-crawling.
I tried to create a dictionary in the spider class to store these urls but there is no easy way to save the dictionary into a file.
Another way I tried is to create a second item class for urls and use an item pipeline. It still outputs an empty file, though. I am not advanced enough to write my own pipeline. Here is my code.
import scrapy

class MyItem(scrapy.Item):
    productCode = scrapy.Field()
    productName = scrapy.Field()
    ...

class UrlItem(scrapy.Item):
    eurl = scrapy.Field()
parse
class MySpider(scrapy.Spider):
    custom_settings = {
        'FEEDS': {
            '%(filename)s.csv': {'format': 'csv', 'encoding': 'utf-8'},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    }

    def parsePage(self, response):
        products = response.xpath(...)
        if len(products) == 0:
            url = UrlItem()
            url['eurl'] = response.url
            yield url
        else:
            item = MyItem()
            item['...'] = ...
            ...
            yield item
pipeline
from .items import MyItem, UrlItem
import csv

class UrlPipeline:
    def open_spider(self, spider):
        self.file = open('%s.csv' % "noProductsUrls", 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, url, spider):
        if isinstance(url, UrlItem):
            csvWriter = csv.writer(self.file)
            csvWriter.writerow(ItemAdapter(url))
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class MyPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if isinstance(item, MyItem):
            adapter = ItemAdapter(item)
            if adapter['productCode'] in self.ids_seen:
                raise DropItem(f"Duplicate item found: {item!r}")
            else:
                self.ids_seen.add(adapter['productCode'])
                return item
settings file
ITEM_PIPELINES = {
    'project.pipelines.MyPipeline': 300,
    'project.pipelines.UrlPipeline': 300,
}
The thing is, I already have the feed exporter setting in the spider class that saves a csv. In the pipeline I just want to add one more csv. Do the two conflict? Or is it better to produce both csv files in the pipeline?
Update: I opted for @marcos' solution below, which is superior.
There is also a way to save a csv from within the spider class, based on this post.
def __init__(self):
    self.outfile = open("urls.csv", "w", newline="")
    self.writer = csv.writer(self.outfile)

def closed(self, reason):
    self.outfile.close()
add the following in def parse
if len(products) == 0:
    self.writer.writerow([response.url])
I would suggest you just yield a retry request whenever no product is found on the page, unless you have a very specific reason to store those URLs.
The code would look like:
class MySpider(scrapy.Spider):
    custom_settings = {
        'FEEDS': {
            '%(filename)s.csv': {'format': 'csv', 'encoding': 'utf-8'},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    }

    def parsePage(self, response):
        products = response.xpath(...)
        if not len(products):
            yield self._retry_request(response)
            return
        item = MyItem()
        item['...'] = ...
        ...
        yield item

    def _retry_request(self, response, max_retries=5):
        # re-schedule the same request, keeping count of the attempts in its meta;
        # dont_filter=True is needed so the dupefilter does not drop the repeated URL
        retries = response.meta.get('retry_time', 0)
        if retries < max_retries:
            return response.request.replace(
                meta={**response.meta, 'retry_time': retries + 1},
                dont_filter=True,
            )
        else:
            self.logger.warning(f'Max retries reached for {response.url}')

Why does my Scrapy spider only scrape some of my data?

I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. After I run the code below, I export it to CSV via the command line with "scrapy crawl svu -o svu.csv".
The code below successfully pulls episode information, but the CSV does not contain the cast list. How do I fix the code to extract and export both the episode information and the cast list?
My thoughts & attempts:
I believe that the cast list is extracted, because it is visible in the terminal when the spider runs, so it may be an export issue.
If I comment out my first yield statement (episode information), the cast list is successfully exported. This makes me think it isn't just an export issue.
Thanks for the help!
import scrapy

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        }

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            yield {
                'actor': response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                'character': response.xpath("//td[@class='character']/a/text()").extract()
            }
I added changes to your code. In addition, I show you how to use Items and Pipelines.
spiders/svu.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            character = response.xpath("//td[@class='character']/a/text()").extract()
            character.extend(response.xpath("//td[@class='character']/text()").extract())
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            item = CastItem(
                actor=response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                character=character
            )
            yield item
items.py
from scrapy import Item, Field

class EpisodeItem(Item):
    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()

class CastItem(Item):
    actor = Field()
    character = Field()
pipelines.py
from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        item_names = ['episode', 'cast']
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)
        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)
        return item
Add to settings file:
ITEM_PIPELINES = {
    'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}
Be careful: you need to replace PROJECT_NAME with your own project name.
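With the pipeline registered, you no longer need the -o option; running
scrapy crawl svu
should produce episode.csv and cast.csv, the file names hard-coded in spider_opened.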

Scrapy: Checking the value in a csv file before adding

I want to check the title of an item against the csv file, and add it to the csv file only if it does not exist. I searched almost all the responses related to duplicate values. Mostly, they are about DuplicatesPipeline, and the others did not work for me.
This is my custom pipeline, in pipelines.py:
import csv

from scrapy.exceptions import DropItem

class CheckCsvPipeline(object):
    def __init__(self):
        csv_path = r"C:\Users\HP\PycharmProjects\ToScrape\book\items.csv"
        self.csvfile = open(csv_path, 'r')
        self.readCsv = csv.reader(self.csvfile, delimiter=',')

    def process_item(self, item, spider):
        for row in self.readCsv:
            if item['title'] in row:
                raise DropItem("This title exists: %s" % item)
            else:
                return item
Here is my spider:
import scrapy

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
I run the spider with the following command, but it still adds the existing values.
scrapy crawl books -o items.csv
I suggest you maintain a list of titles in your spider, and then, inside the pipeline, check whether the title already exists in that list; if it does, do not yield it.
from scrapy.exceptions import DropItem

class CheckCsvPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        if item['title'] in spider.allTitles:
            raise DropItem("This title exists: %s" % item)
        else:
            return item
in your spider, do this
class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    allTitles = []

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        self.allTitles.extend([title])
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
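If the titles also need to be checked against an items.csv left over from a previous run (which is what the question asks for), one option is to load the existing titles once when the spider opens. A rough sketch, assuming the title is stored in the first column of items.csv and the path fits your setup:
import csv
import os

from scrapy.exceptions import DropItem

class CheckCsvPipeline(object):
    def open_spider(self, spider):
        # load titles already present in items.csv (if any) into a set, once
        self.seen_titles = set()
        if os.path.exists('items.csv'):
            with open('items.csv', 'r', encoding='utf-8') as f:
                for row in csv.reader(f):
                    if row:
                        self.seen_titles.add(row[0])

    def process_item(self, item, spider):
        if item['title'] in self.seen_titles:
            raise DropItem("This title exists: %s" % item)
        self.seen_titles.add(item['title'])
        return item
Reading the file once also avoids the problem in the original pipeline, where the csv.reader iterator is consumed as items pass through, so each item is only compared against a single row before the method returns.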

Scrapy pipeline to export csv file in the right format

I made the improvement according to the suggestion from alexce below. What I need is like the picture below. However, each row/line should be one review: with date, rating, review text and link.
I need to let the item processor process each review of every page.
Currently TakeFirst() only takes the first review of the page, so for 10 pages I only have 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1, 114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV file with a spreadsheet shows, for me, one review per row with date, rating, review text and link.
Hope this helps :-)
import scrapy

class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use the built-in CsvItemExporter directly in a custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter  # in recent Scrapy versions: from scrapy.exporters import CsvItemExporter

class AmazonPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader  # in recent Scrapy versions: from scrapy.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose

class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)

        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')

        yield loader.load_item()

Scrapy crawl in order

I can't figure out how to make scrapy crawl links in order.
I've got a page with articles, and each one has a title, but the article doesn't match the title.
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
I've got something like this:
class Getgot(Spider):
    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    el = '//div[@article]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        s = hxs.select('//article')
        filename = ("links.txt")
        filly = open(filename, "w")
        for i in s:
            t = i.select('a/@href').extract()
            filly.write(str(t[0]) + '\n')
            yield Request(str(t[0]), callback=self.parse_page)

    def parse_page(self, res):
        hxs = HtmlXPathSelector(res)
        s = hxs.select('//iframe').extract()
        if s:
            filename = ("frames.txt")
            filly = open(filename, "a")
            filly.write(str(s[0]) + '\n')
        else:
            filename = ("/frames.txt")
            filly = open(filename, "a")
            filly.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title?
A few tips: 1) update your scrapy syntax to the latest version; 2) don't write any files from the spider, write them from a pipeline or an export feed; 3) if you need to transfer data from one function to the next, use the meta attribute.
def parse(self, response):
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
And then you export it to csv or json or whatever, to store the link and the frame together.
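For example, one way to get both fields into a single file is to run the spider with a command-line feed export (using the spider name from the question):
scrapy crawl getem -o output.csv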
