I am new to python and scrapy. I am not getting item data in pipeline. Nothing is being written in csv. Error is
DmozSpider' object has no attribute getitem
Any help will be appreciated:
spider file
import scrapy
import sys
import os
from tutorial.items import TutorialItem
from pprint import pprint
class DmozSpider(scrapy.Spider):
name = "myspider"
allowed_domains = ["www.xyz.co.id"]
start_urls = ["http://www.xyz.co.id/search?q=abc"]
def parse(self, response):
var = response.xpath("//a[#class='img']/#href").extract()[0]
item = TutorialItem()
item['title'] = var
yield item
pipeline file
import csv
class TutorialPipeline(object):
def __init__(self):
self.csvwriter = csv.writer(open('items.csv', 'wb'))
def process_item(self, domain, item):
print item['title']
self.csvwriter.writerow([item['title']])
return item
items file
import scrapy
class TutorialItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
desc = scrapy.Field()
price = scrapy.Field()
Settings file
ITEM_PIPELINES = {
'tutorial.pipelines.TutorialPipeline': 300,
}
The definition of your pipeline method process_item() is incorrect. The bug is in the stated parameters self, domain, item. The official description in the documentation is:
process_item(self, item, spider)
Change the method in your class TutorialPipeline accrodingly to:
def process_item(self, item, spider):
print item['title']
self.csvwriter.writerow([item['title']])
return item
Try item.get('title') instead of item['title']
Related
I notice in scrapy log that some urls returned 200 status but contained no items. It seems to be the stability of the site as re-crawling these urls 1-2 times again yield items. I would like to save these urls in a separate file for re-crawling.
I tried to create a dictionary in the spider class to store these urls but there is no easy way to save the dictionary into a file.
Another way I tried is to create a 2nd item class for urls and use item pipeline. It still outputs empty file though. I am not too advanced to write my own pipeline. Here is my code.
import scrapy
class MyItem(scrapy.Item):
productCode = scrapy.Field()
productName = scrapy.Field()
...
class UrlItem(scrapy.Item):
eurl = scrapy.Field()
parse
class MySpider(scrapy.Spider):
custom_settings = {
'FEEDS':{
'%(filename)s.csv':{'format':'csv', 'encoding':'utf-8',},
},
'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter',},}
def parsePage(self, response):
products = response.xpath(...)
if len(products) == 0:
url = UrlItem()
url['eurl'] = response.url
yield url
else:
item = MyItem()
item['...'] = ...
...
yield item
pipeline
from .items import MyItem, UrlItem
import csv
class UrlPipeline:
def open_spider(self, spider):
self.file = open('%s.csv' % "noProductsUrls", 'w')
def close_spider(self, spider):
self.file.close()
def process_item(self, url, spider):
if isinstance(url, UrlItem):
csvWriter = csv.writer(self.file)
csvWriter.writerow(ItemAdapter(url))
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class MyPipeline:
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
if isinstance(item, MyItem):
adapter = ItemAdapter(item)
if adapter['productCode'] in self.ids_seen:
raise DropItem(f"Duplicate item found: {item!r}")
else:
self.ids_seen.add(adapter['productCode'])
return item
settings file
'project.pipelines.MyPipeline': 300,
'project.pipelines.UrlPipeline': 300,
The thing is also I have the exporter setting in spider class already that saves a csv. In the pipeline I just want to add one more csv. Do the two conflict? Or it is better to structure both csv files in the pipeline?
Update: I opted for #marcos solution below which is superior.
There is a way to save csv in the spider class which is based on this post.
def __init__(self):
self.outfile = open("urls.csv", "w", newline = "")
self.writer = csv.writer(self.outfile)
def closed(self,reason):
self.outfile.close()
add the following in def parse
if len(products) == 0:
self.writer.writerow([response.url])
I would suggest you just yield a retry request whenever no product is found on the page, unless you have a very specific reason to store those URLs.
The code would look like:
class MySpider(scrapy.Spider):
custom_settings = {
'FEEDS':{
'%(filename)s.csv':{'format':'csv', 'encoding':'utf-8',},
},
'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter',},}
def parsePage(self, response):
products = response.xpath(...)
if not len(products):
yield self._retry_request(response)
return
item = MyItem()
item['...'] = ...
...
yield item
def _retry_request(self, response, max_retries=5):
retries = response.meta.get('retry_time', 0)
if retries < max_retries:
return response.request.replace(
meta={**response.meta, 'retry_time': retries + 1},
dont_filter=True,
)
else:
self.logger.warning(f'Max retries reached for {response.url}')
I've scrapped the urls i want from a page. Now I want to filter them for keywords using a pipeline:
class GumtreeCouchesPipeline(object):
keywords = ['leather', 'couches']
def process_item(self, item, spider):
if any(key in item['url'] for key in keywords):
return item
Problem is its returning nothing now.
The spider:
import scrapy
from gumtree_couches.items import adItem
from urllib.parse import urljoin
class GumtreeSpider(scrapy.Spider):
name = 'GumtreeCouches'
allowed_domains = ['https://someurl']
start_urls = ['https://someurl']
def parse(self, response):
item = adItem()
for ad_links in response.xpath('//div[#class="view"][1]//a'):
relative_url = ad_links.xpath('#href').extract_first()
item['title'] = ad_links.xpath('text()').extract_first()
item['url'] = response.urljoin(relative_url)
yield item
How can I filter all the scraped urls for keywords using the pipeline?
Thanks!
This should fix your problem:
class GumtreeCouchesPipeline(object):
keywords = ['leather', 'couches']
def process_item(self, item, spider):
if any(key in item['url'] for key in self.keywords):
return item
Notice that I'm using self.keywords to refer to the keywords class attribute.
If you look at your spider logs, you should find some errors saying something like: NameError: name 'keywords' is not defined.
Anyway, I'd recommend you to implement this pipeline like this:
from scrapy.exceptions import DropItem
class GumtreeCouchesPipeline(object):
keywords = ['leather', 'couches']
def process_item(self, item, spider):
if not any(key in item['url'] for key in self.keywords):
raise DropItem('missing keyword in URL')
return item
This way, you'll have the information about the dropped items in the job stats once it's finished.
From reading the documentation I think you have to cater for all paths e.g.
from scrapy.exceptions import DropItem
def process_item(self, item, spider):
keywords = ['leather', 'couches']
if item['url']:
if any(key in item['url'] for key in keywords):
return item
else
raise DropItem("Missing specified keywords.")
else
return item
My spider looks like this
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def parse_item(self, response):
item = ProjectName()
1_css = 'css1::text'
item['1'] = response.css(1_css).extract()
item = ProjectName()
2_css = 'css2::text'
item['2'] = response.css(2_css).extract()
return item
and my pipeline like this:
from scrapy.exceptions import DropItem
class RemoveIncompletePipeline(object):
def reminc_item(self, item, spider):
if item['1']:
return item
else:
raise DropItem("Missing content in %s" % item)
Everything works fine, when the value for field 1 is missing then, the coresponding item is taken out from the output.
But, when I change start_urls, in order to do the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
Then the output contains the items with missing value for field 1.
What's going on? And how I can avoid that?
For the record queries.txt looks like that:
webaddress/query1
webaddress/query2
According to the docs you should override start_requests method.
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the make_requests_from_url() is used instead to create the
Requests. This method is also called only once from Scrapy, so it’s
safe to implement it as a generator.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def start_requests(self):
return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
def parse_item(self, response):
item = ProjectName()
1_css = 'css1::text'
item['1'] = response.css(1_css).extract()
item = ProjectName()
2_css = 'css2::text'
item['2'] = response.css(2_css).extract()
return item
UPD:
Just put this code into your spider class
def start_requests(self):
return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
UPD:
Your have a wrong logic in your parse_item method. You need to fix it.
def parse_item(self, response):
for job in response.css('div.card-top')
item = ProjectName()
# just quick example.
item['city'] = job.xpath('string(//span[#class="serp-location"])').extract()[0].replace(' ', '').replace('\n', '')
# TODO: you should fill other item fields
# ...
yeild item
My Hacker News spider outputs all the results on one line, instead of one each line, as it can be seen here.
All on the same line
Here is my code.
import scrapy
import string
import urlparse
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor
class HnItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
score = scrapy.Field()
class HnSpider(scrapy.Spider):
name = 'hackernews'
allowed_domains = ["news.ycombinator.com"]
start_urls = ["https://news.ycombinator.com/"]
def parse(self, response):
sel = response
selector_list = response.xpath('.//table[#class="itemlist"]')
for sel in selector_list:
item = HnItem()
item['title'] = sel.xpath('.//td[#class="title"]/text()').extract()
item['link'] = sel.xpath('.//tr[#class="athing"]/td[3]/a/#href').extract()
item['score'] = sel.xpath('.//td[#class="subtext"]/span/text()').extract()
yield item
and my settings.py file
BOT_NAME = 'hnews'
SPIDER_MODULES = ['hnews.spiders']
NEWSPIDER_MODULE = 'hnews.spiders'
USER_AGENT = 'hnews (+http://www.yourdomain.com)'
FEED_URI = '/used/scrapy/hnews/%(name)s/%(time)s.csv'
FEED_FORMAT = 'csv'
I've tried to implement this among many other solutions but no luck so far. I'm still very new at this, so bear with me if possible.
It is happening because your item pipeline is getting all the lists at once. For expample: The item['title'] is getting a list of all the titles at once which is then transferred to the item pipeline and then written to the csv file directly.
The solution is to iterate over the list and yield it to the item pipeline one at a time. Here's a modified code:
import scrapy
from scrapy.selector import Selector
class HnItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
score = scrapy.Field()
class HnSpider(scrapy.Spider):
name = 'hackernews'
allowed_domains = ["news.ycombinator.com"]
start_urls = ["https://news.ycombinator.com/"]
def parse(self, response):
sel = Selector(response)
item = HnItem()
title_list = sel.xpath('.//td[#class="title"]/a/text()').extract()[:-2]
link_list= sel.xpath('.//tr[#class="athing"]/td[3]/a/#href').extract()
score_list = sel.xpath('.//td[#class="subtext"]/span/text()').extract()
for x in range(0,len(title_list)):
item['title'] = title_list[x]
item['link'] = link_list[x]
item['score'] = score_list[x]
yield item
I made the improvement according to the suggestion from alexce below. What I need is like the picture below. However each row/line should be one review: with date, rating, review text and link.
I need to let item processor process each review of every page.
Currently TakeFirst() only takes the first review of the page. So 10 pages, I only have 10 lines/rows as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = [
'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1,114)
]
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
item = AmazonItem()
item['rating'] = sel.xpath('div/div[2]/span[1]/span/#title').extract()
item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
item['review'] = sel.xpath('div/div[6]/text()').extract()
item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/#href').extract()
yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV-File with a spreadsheet shows for me
Hope this helps :-)
import scrapy
class AmazonItem(scrapy.Item):
rating = scrapy.Field()
date = scrapy.Field()
review = scrapy.Field()
link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/' ]
def parse(self, response):
for sel in response.xpath('//table[#id="productReviews"]//tr/td/div'):
item = AmazonItem()
item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
item['date'] = sel.xpath('./div/span/nobr/text()').extract()
item['review'] = sel.xpath('./div[#class="reviewText"]/text()').extract()
item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/#href').extract()
yield item
xpath_Next_Page = './/table[#id="productReviews"]/following::*//span[#class="paging"]/a[contains(.,"Next")]/#href'
if response.xpath(xpath_Next_Page):
url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
request = scrapy.Request(url_Next_Page, callback=self.parse)
yield request
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use built-in CsvItemExporter directly in the custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
self.file = open('output.csv', 'w+b')
self.exporter = CsvItemExporter(self.file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
default_output_processor = TakeFirst()
review_in = MapCompose(lambda x: x.replace("\n", " "))
review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
loader.add_xpath('rating', './/div/div[2]/span[1]/span/#title')
loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
loader.add_xpath('review', './/div/div[6]/text()')
loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/#href')
yield loader.load_item()