scrapy pipeline exporter object is not getting instantiated - python

I am using Scrapy to parse a table containing links and save it as JSON. The links in the table point to pages with additional detail, which are fetched and stored in another JSON file (following this example: https://docs.scrapy.org/en/latest/topics/exporters.html).
To achieve this I am using a pipeline that checks the item type and stores the result in the appropriate JSON file. However, I am stuck on a weird error. Please refer below:
from scrapy import signals
from scrapy.exporters import JsonItemExporter

from for_icu import items


class ListPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("spider_opened")
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        print("spider_closed")
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
            return item
        elif isinstance(item, items.UniDetail):
            self.exporter22.export_item(item)
            return item
Error:
2017-12-27 11:41:15 [scrapy.core.scraper] ERROR: Error processing {'country': ('Finland',),
 'country_code': ('fi ',),
 'u_link': ('http://www.xxxxxxx.xxx/xxxxxxx/xxxx.htm',),
 'u': (' pisto',)}
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/website_scrapy/for_icu/for_icu/pipelines.py", line 31, in process_item
    self.exporter.export_item(item)
AttributeError: 'ListPipeline' object has no attribute 'exporter1'
Please let me know what I am missing here; I have been stuck on this for the past couple of hours.

I was unable to get the exporter to work, so I used a simple file writer for the task:
import json

from for_icu import items


class ListPipeline(object):
    unilist = []
    unidetail = []

    def close_spider(self, spider):
        print("spider_closed")
        file_table = open('%s_table.json' % spider.name, 'w')
        line = json.dumps(self.unilist)
        file_table.write(line)
        file_table.close()
        file_detail = open('%s_detail.json' % spider.name, 'w')
        line = json.dumps(self.unidetail)
        file_detail.write(line)
        file_detail.close()
        self.unilist.clear()
        self.unidetail.clear()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.unilist.append(dict(item))
            return item
        elif isinstance(item, items.UniDetail):
            self.unidetail.append(dict(item))
            return item
This achieves what I want, but it would be better to use the built-in exporters. If someone knows how to make that work, please update.
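For reference, a sketch of what is likely going wrong and a fix (my diagnosis, not a confirmed answer): self.files starts out as an empty dict, so self.files[spider].append(...) raises a KeyError inside the spider_opened signal handler. Scrapy logs errors from signal handlers without stopping the crawl, so the exporters are never assigned, and process_item later fails with the AttributeError above (note also the self.exporter22 typo). Initializing the list first should make the built-in JsonItemExporter approach work:

from scrapy import signals
from scrapy.exporters import JsonItemExporter

from for_icu import items


class ListPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Create the list before appending; self.files starts out empty,
        # so self.files[spider].append(...) would raise KeyError otherwise.
        self.files[spider] = []
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
        elif isinstance(item, items.UniDetail):
            self.exporter2.export_item(item)  # exporter2, not exporter22
        return item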

Related

writing to file using scrapy pipeline

I am trying to write to a file using Scrapy's pipelines.py; the item is parsed correctly and shows in the terminal when I run the spider.
This is my pipelines.py:
import datetime, csv


class AmazonfullPipeline(object):
    keys = ["Product_Name", "Price", "Amazon_Stock", "rating", "ASIN", "Rank1", "Rank1_category",
            "Rank2", "Rank2_category", "UPC", "Item_Model_Number"]

    def __init__(self):
        now = datetime.datetime.now()
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        infile = open("{}_{}.csv".format(current_date, file_name), "w").close()
        dict_writer = csv.DictWriter(infile, self.keys)
        dict_writer.writeheader()

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
Error Message:
  dict_writer = csv.DictWriter(infile, self.keys)
  File "/usr/lib/python3.6/csv.py", line 140, in __init__
    self.writer = writer(f, dialect, *args, **kwds)
TypeError: argument 1 must have a "write" method
You have two problems:
You close the file before using it: open(...).close() returns None, so infile has no "write" method;
You did not set an instance attribute: use self.dict_writer, not dict_writer, in __init__.
Corrected code:
import datetime, csv


class AmazonfullPipeline(object):
    keys = ["Product_Name", "Price", "Amazon_Stock", "rating", "ASIN", "Rank1", "Rank1_category",
            "Rank2", "Rank2_category", "UPC", "Item_Model_Number"]

    def __init__(self):
        now = datetime.datetime.now()
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        infile = open("{}_{}.csv".format(current_date, file_name), "w")  # <- remove close() here
        self.dict_writer = csv.DictWriter(infile, self.keys)  # <- add self. here
        self.dict_writer.writeheader()  # <- add self. here

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
        return item  # return the item so any later pipelines receive it
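Two further refinements, not part of the original answer but worth noting: on Python 3 a CSV file should be opened with newline='' so the csv module controls line endings itself (otherwise you get blank lines between rows on Windows), and keeping the file handle on self lets you close it when the spider finishes. A sketch:

import csv
import datetime


class AmazonfullPipeline(object):
    keys = ["Product_Name", "Price"]  # field list shortened for the sketch

    def __init__(self):
        current_date = datetime.datetime.now().strftime("%d%b")
        # newline='' lets the csv module control line endings itself
        self.infile = open("{}_TestFile.csv".format(current_date), "w", newline="")
        self.dict_writer = csv.DictWriter(self.infile, self.keys)
        self.dict_writer.writeheader()

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
        return item

    def close_spider(self, spider):
        # flush buffered rows and release the file handle when the spider finishes
        self.infile.close()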

Scrapy CSV column export

I'd like to export data to several columns in CSV, but I always obtain this kind of file:
[screenshot of the resulting CSV]
I'd like to obtain two columns, one "articulo" and another one "precio".
My pipelines:
import scrapy
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
import csv


class MercadoPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['articulo', 'precio']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Can you help me please?
Here is my spider:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from mercado.items import MercadoItem


class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domains = ['www.autodoc.es']
    start_urls = ['https://www.autodoc.es/search?brandNo%5B0%5D=101']

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//span[@class="next"]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="ga-click"]')),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[2]/div[1]/div[1]/div/span[1]/span/text())').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[3]/div[2]/p[2]/text())').extract()
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
There is nothing wrong with the output of your code.
You are getting the two csv columns you want, but the program you are using to view the data is not interpreting it correctly.
By default, CsvItemExporter uses , as the delimiter, and the program seems to expect something else (and possibly even different quoting).
There are two possibilities to solve your problem:
Change the program's settings so it reads the file correctly
Change the way CsvItemExporter exports data (it will pass any additional keyword arguments to the underlying csv.writer object)
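For illustration, a minimal standalone sketch of the second option (the item values here are made up): CsvItemExporter forwards extra keyword arguments to the underlying csv.writer, so a different delimiter can be requested at construction time:

import io

from scrapy.exporters import CsvItemExporter

# Export two made-up items with ';' as the delimiter instead of ','
buffer = io.BytesIO()
exporter = CsvItemExporter(buffer, delimiter=';')  # forwarded to csv.writer
exporter.fields_to_export = ['articulo', 'precio']
exporter.start_exporting()
exporter.export_item({'articulo': 'filtro de aceite', 'precio': '9,95'})
exporter.export_item({'articulo': 'pastillas de freno', 'precio': '24,50'})
exporter.finish_exporting()
print(buffer.getvalue().decode('utf-8'))

A spreadsheet locale that expects semicolons will then split the columns correctly; the same keyword works in the pipeline above, e.g. CsvItemExporter(file, delimiter=';').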

How to order csv items in a multiCSV exporter scrapy?

I have been putting together a piece of code I found on Stack Overflow on how to split the output of different URLs into separate CSV files, and I came up with the code below. However, I can no longer use fields_to_export in the code. How can I set the fields to export so that they come out like fields_to_export = ['itemA', 'itemB', 'itemC']?
from scrapy import signals
from scrapy.exporters import CsvItemExporter
import re


class appPipeline(object):
    urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]
    names = [name.group(1) for l in urls
             for name in [re.search(r'https://www.google.co.uk/', l, re.M | re.I)] if name]

    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.files = dict([(name, open('results/' + name + '.csv', 'w+b')) for name in self.names])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.names])
        # this line is not working: self.exporters.fields_to_export = ['itemA', 'itemB', 'itemC']
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        myItem = item['myItem']
        if myItem in set(self.names):
            self.exporters[myItem].export_item(item)
        return item
So far I have tried overriding the keys in items, serializing the items, and sorting the values in a dictionary by a list of keys. None of them worked.
Thanks for your help.
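No answer is recorded here, but for reference a sketch of one way to do it, as a drop-in replacement for the spider_opened above: fields_to_export belongs to each exporter instance, not to the dict that holds them, so set it per exporter or pass it as a constructor keyword:

    def spider_opened(self, spider):
        self.files = {name: open('results/' + name + '.csv', 'w+b') for name in self.names}
        # fields_to_export is per-exporter state; passing it as a keyword
        # also fixes the column order: ['itemA', 'itemB', 'itemC']
        self.exporters = {
            name: CsvItemExporter(self.files[name],
                                  fields_to_export=['itemA', 'itemB', 'itemC'])
            for name in self.names
        }
        for exporter in self.exporters.values():
            exporter.start_exporting()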

How to implement a counter in a scrapy pipeline?

pipelines.py:
class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        if ID in self.match_dict:
            self.match_dict[ID] = self.match_dict[ID] + 1
            if self.match_dict[ID] == 5:
                return item
        else:
            self.match_dict[ID] = 1
firstspider.py:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
import json
from dotabuff.items import DotabuffItem


class DotaSpider(CrawlSpider):
    name = "dotaspider"
    allowed_domains = ['www.dotabuff.com']
    start_urls = []
    with open('spiders/Steam.json', 'r') as f:
        steam_data = json.load(f)
    steam_members = steam_data['members']
    for member in steam_members:
        url = 'http://www.dotabuff.com/players/%s/matches?page=1' % str(member - 76561197960265728)
        start_urls.append(url)

    rules = (Rule(LinkExtractor(allow=(r'http://www.dotabuff.com/players/\d+/matches\?page=\d+')),
                  callback="parse_item", follow=True),)

    def parse_item(self, response):
        sel = Selector(response)
        matches = sel.xpath('//td[@class="cell-large"]/a/@href').extract()
        for match in matches:
            item = DotabuffItem()
            match = match.split('/')[-1]
            item['matchID'] = match
            yield item
I scrape some match numbers from www.dotabuff.com, and I have five Steam IDs in a JSON file. I want to find the matches the five of us played together, so I define a dict used as a counter to count the number of appearances. But it doesn't work.
Traceback (most recent call last):
  File "e:\anaconda2\lib\site-packages\twisted\internet\defer.py", line 150, in maybeDeferred
    result = f(*args, **kw)
  File "e:\anaconda2\lib\site-packages\scrapy\xlib\pydispatch\robustapply.py", line 57, in robustApply
    return receiver(*arguments, **named)
  File "e:\anaconda2\lib\site-packages\scrapy\extensions\feedexport.py", line 193, in item_scraped
    slot.exporter.export_item(item)
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 111, in export_item
    itemdict = dict(self._get_serialized_fields(item))
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 63, in _get_serialized_fields
    field_iter = six.iterkeys(item)
  File "e:\anaconda2\lib\site-packages\six.py", line 593, in iterkeys
    return d.iterkeys(**kw)
AttributeError: 'NoneType' object has no attribute 'iterkeys'
Looking at the docs for pipelines in scrapy here, it says:
"This method is called for every item pipeline component and must either return a dict with data, Item (or any descendant class) object or raise a DropItem exception."
Your process_item method doesn't obey this rule and can return None, which is not iterable.
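A sketch of a version that obeys that contract, keeping the question's threshold of 5 and raising DropItem for matches not (yet) seen five times:

from scrapy.exceptions import DropItem


class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        self.match_dict[ID] = self.match_dict.get(ID, 0) + 1
        if self.match_dict[ID] == 5:
            # all five players appeared in this match: keep the item
            return item
        # otherwise drop it explicitly instead of implicitly returning None
        raise DropItem('match %s seen only %d time(s)' % (ID, self.match_dict[ID]))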

Exporting to CSV format incorrect in scrapy

I'm trying to print out a CSV file after scraping using pipelines, but the formatting is a bit weird: instead of writing rows top to bottom, it writes everything from page 1 at once, and then all of page 2, in one column. I have attached pipelines.py and one line from the CSV output (quite large). How do I make it print row by row instead of all at once per page?
pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter


class CSVPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['names', 'stars', 'subjects', 'reviews']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
and output.csv
names stars subjects
Vivek0388,NikhilVashisth,DocSharad,Abhimanyu_swarup,Suresh N,kaushalhkapadia,JyotiMallick,Nitin T,mhdMumbai,SunilTukrel(COLUMN 2) 5 of 5 stars,4 of 5 stars,1 of 5 stars,5 of 5 stars,3 of 5 stars,4 of 5 stars,5 of 5 stars,5 of 5 stars,4 of 5 stars,4 of 5 stars(COLUMN 3) Best Stay,Awesome View... Nice Experience!,Highly mismanaged and dishonest.,A Wonderful Experience,Good place with average front office,Honeymoon,Awesome Resort,Amazing,ooty's beauty!!,Good stay and food
It should look something like this
Vivek0388 5 of 5
NikhilVashisth 5 of 5
DocSharad 5 of 5
...so on
EDIT:
items = [{'reviews': "", 'subjects': "", 'names': "", 'stars': ""} for k in range(1000)]
if sites and len(sites) > 0:
    for site in sites:
        i += 1
        items[i]['names'] = item['names']
        items[i]['stars'] = item['stars']
        items[i]['subjects'] = item['subjects']
        items[i]['reviews'] = item['reviews']
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
for k in range(1000):
    yield items[k]
Figured it out: zip the lists in the item and then loop through the result, writing one row at a time. This was much less complicated once I read the docs.
import csv


class CSVPipeline(object):
    def __init__(self):
        # 'wb' is Python 2 style; on Python 3, open with 'w' and newline='' instead
        self.csvwriter = csv.writer(open('items.csv', 'wb'), delimiter=',')
        self.csvwriter.writerow(['names', 'stars', 'subjects', 'reviews'])

    def process_item(self, item, ampa):
        # each field holds a parallel list of values; zip transposes them into rows
        rows = zip(item['names'], item['stars'], item['subjects'], item['reviews'])
        for row in rows:
            self.csvwriter.writerow(row)
        return item
