writing to file using scrapy pipeline

writing to file using scrapy pipeline - python

I am trying to write to a file using Scrapy's pipelines.py. The item is being parsed correctly and it shows in the terminal when I run the spider.
This is my pipelines.py:
import datetime,csv
class AmazonfullPipeline(object):
    # Pipeline from the question: opens a dated CSV file in __init__ and
    # tries to write one row per item in process_item.

    # Field names passed to csv.DictWriter as the CSV columns.
    keys = ["Product_Name","Price","Amazon_Stock","rating","ASIN","Rank1","Rank1_category","Rank2","Rank2_category",
    "UPC","Item_Model_Number"]
    def __init__(self):
        now = datetime.datetime.now()
        # Day and abbreviated month name, used as the file-name prefix.
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        # NOTE(review): open(...).close() evaluates to None, so infile is None
        # and the DictWriter call below fails with the TypeError
        # 'argument 1 must have a "write" method' quoted in the question.
        infile = open("{}_{}.csv".format(current_date,file_name),"w").close()
        # NOTE(review): dict_writer is only a local name here, while
        # process_item reads self.dict_writer, which is never assigned.
        dict_writer = csv.DictWriter(infile, self.keys)
        dict_writer.writeheader()
    def process_item(self, item, spider):
        # Writes the item as one CSV row through the writer stored on self.
        self.dict_writer.writerow(item)
Error Message:
dict_writer = csv.DictWriter(infile, self.keys)
File "/usr/lib/python3.6/csv.py", line 140, in __init__
self.writer = writer(f, dialect, *args, **kwds)
TypeError: argument 1 must have a "write" method

You have several problems:
You close the file immediately after opening it, before the writer can use it;
You did not set an instance attribute. Use self.dict_writer, not dict_writer, in __init__.
Check code:
import datetime,csv
class AmazonfullPipeline(object):
    """Scrapy item pipeline that writes every scraped item to a dated CSV file.

    The output file is created in the current working directory when the
    pipeline is instantiated and is named ``<day><month>_TestFile.csv``
    (for example ``05Mar_TestFile.csv``). The header row is written
    immediately; one row is appended per processed item.
    """

    # Column order of the CSV file; these are the item fields that get written.
    keys = ["Product_Name","Price","Amazon_Stock","rating","ASIN","Rank1","Rank1_category","Rank2","Rank2_category",
            "UPC","Item_Model_Number"]

    def __init__(self):
        now = datetime.datetime.now()
        current_date = now.strftime("%d%b")
        file_name = "TestFile"
        # Do NOT call .close() here: the writer needs an open file object.
        # newline="" lets the csv module control line endings itself, which
        # avoids blank lines between rows on Windows.
        self.outfile = open("{}_{}.csv".format(current_date, file_name), "w", newline="")
        # Stored on self (not a local) so process_item can reach the writer.
        self.dict_writer = csv.DictWriter(self.outfile, self.keys)
        self.dict_writer.writeheader()

    def process_item(self, item, spider):
        """Append ``item`` as one CSV row and pass it on to later pipelines.

        Raises:
            ValueError: if ``item`` has a key that is not listed in ``keys``
                (default ``csv.DictWriter`` behaviour).
        """
        self.dict_writer.writerow(item)
        # Scrapy hands the return value to the next pipeline component.
        return item

    def close_spider(self, spider):
        """Flush and close the CSV file when the spider finishes."""
        self.outfile.close()

Related

valueError: I/O operation on closed file while working on json file

import json
class file:
    # Loads the JSON document stored in `filename` into self.file_data.
    def __init__(self,filename='c.json'):
        with open(filename,'r') as f:
            self.file_data = json.load(f)
    def __repr__(self):
        return f"{self.file_data}"
class crud(file):
    def __init__(self,filename='c.json'):
        # NOTE(review): the with block closes f when it exits, so self.file is
        # already a closed file by the time add() uses it.
        # NOTE(review): file.__init__ is not called here, so self.file_data is
        # never set on a crud instance.
        with open(filename, 'w') as f:
            self.file = f
    #adding data
    def add(self,data):
        # Dumps `data` to the stored file object, then appends it to the
        # "contacts" list of the loaded document.
        json.dump(data,self.file,indent=4)
        temp = self.file_data["contacts"]
        temp.append(data)
# Driver from the question: creates a crud object and adds one empty contact.
d = {"name": None, "phone": None, "email": None}
f =crud()
f.add(d)

with is a context manager statement in Python. It opens the file and, if that succeeds, runs the indented block of code. Once that block has completed, the context manager closes the file.
In your case, change your code to the following:
import os
class crud(file):
    """JSON-backed store that keeps its file open for writing.

    The existing document is loaded by the parent class before the file is
    reopened for writing, because opening in 'w' mode truncates it.
    """

    def __init__(self,filename='c.json'):
        # Populate self.file_data first; this raises FileNotFoundError when
        # the file does not exist instead of silently leaving the object
        # half-initialised.
        super().__init__(filename)
        # Deliberately not wrapped in `with`: the handle has to stay open
        # after __init__ returns so later methods can write to self.file.
        # The owner of the object is responsible for closing self.file.
        self.file = open(filename, 'w')
Otherwise, in the crud.add method, the json.dump(data, self.file, indent=4) call raises the error, because the file that was opened is closed as soon as the with block completes.

How to convert a normal function into a function inside a class?

I'm trying to organize my code I already have by implementing classes and execute methods on classes instantiations. I have put some hours into figuring out how to use classes, but still haven't figured it out. Could someone help me?
This is the original code:
def readSignalAcquisitionData(fileName):
    """Read a two-column CSV file of time stamps and signal samples.

    The first line is treated as a header and skipped. Every following line
    must consist of exactly two comma-separated numeric fields.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        A ``(timeStamps, dataInput)`` tuple of two lists of floats, in file
        order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a line does not hold exactly two comma-separated
            values that parse as floats.
    """
    timeStamps = []
    dataInput = []
    # The with block closes the file even when a malformed line raises.
    with open(fileName, 'r') as f:
        # dummy read: discard the header line
        f.readline()
        for ln in f:
            # parse info
            timeStr, dataStr = ln.split(',')
            timeStamps.append(float(timeStr))
            dataInput.append(float(dataStr))
    return timeStamps, dataInput
And this is what I currently have:
class SignalDataIOUnit:
    # Class version from the question: remembers a file name and reads
    # comma-separated time stamp / sample pairs from that file.
    def __init__(self, fileName):
        self.fileName = fileName
    def readSignalAcquisitionData(self):
        # Returns (timeStamps, dataInput) and also stores both lists on self.
        f = open(self.fileName, 'r')
        # NOTE(review): unlike the stand-alone function above, no header line
        # is skipped before the loop.
        self.timeStamps = []
        self.dataInput = []
        for ln in f:
            # parse info
            self.timeStr, self.dataStr = ln.split(',')
            self.timeStamps.append(float(self.timeStr))
            self.dataInput.append(float(self.dataStr))
        f.close()
        return self.timeStamps, self.dataInput
    def writeFilteredData(self, fileName, timeStamps, dataOut):
        # Placeholder, not implemented.
        pass
fileName="LabsWeek03_inputData.csv"
# NOTE(review): the method is called on the class, not on an instance, so the
# string fileName is bound to `self`; this is what raises the AttributeError
# "'str' object has no attribute 'fileName'" shown below.
timeStamps, dataInput = SignalDataIOUnit.readSignalAcquisitionData(fileName)
print(timeStamps)
When I try running it through the terminal I get these error messages:
Traceback (most recent call last):
File "SignalDataEvaluationUnit_OOP.py", line 26, in <module>
timeStamps, dataInput = SignalDataIOUnit.readSignalAcquisitionData(fileName)
File "SignalDataEvaluationUnit_OOP.py", line 7, in readSignalAcquisitionData
f = open(self.fileName, 'r')
AttributeError: 'str' object has no attribute 'fileName'

As @deceze says in the comments, you haven't instantiated the class SignalDataIOUnit; that's why it doesn't work.
To make it work, you have 2 choices:
Instantiating SignalDataIOUnit object and call the method readSignalAcquisitionData:
timeStamps, dataInput = SignalDataIOUnit(fileName).readSignalAcquisitionData()
Use Python's @staticmethod decorator:
class SignalDataIOUnit:
    """Sketch of the class with the reader exposed as a static method."""

    def __init__(self, fileName):
        self.fileName = fileName

    @staticmethod
    def readSignalAcquisitionData(fileName):
        """Read the signal file named ``fileName``.

        Being a static method, it receives no ``self`` and can be called on
        the class or on an instance with the file name as its only argument.
        """
        # Body elided in this sketch; it is the same as the stand-alone function.
        ...
then just call it as usual
timeStamps, dataInput = SignalDataIOUnit.readSignalAcquisitionData(fileName)

Yes, you should use it like this:
fileName="LabsWeek03_inputData.csv"
timeStamps, dataInput = SignalDataIOUnit(fileName).readSignalAcquisitionData()
print(timeStamps)

scrapy pipeline exporter object is not getting instantiated

I am using scrapy to parse a table containing links and save it in json. The links from table contain additional detail and they will be fetched and stored into another json. (following this example: https://docs.scrapy.org/en/latest/topics/exporters.html)
To achieve this I am using a pipeline to check item type and store result in appropriate json. However, I am stuck in some weird error. Please refer below:
from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items
class ListPipeline(object):
    # Pipeline from the question: routes two item types to two JSON exporters,
    # one file per item type and spider.
    def __init__(self):
        # Meant to map a spider to the list of files opened for it.
        self.files = {}
    # NOTE(review): '#classmethod' looks like a mangled '@classmethod'
    # decorator (from_crawler takes cls) - confirm against the original post.
    #classmethod
    def from_crawler(cls, crawler):
        # Builds the pipeline and subscribes it to the spider open/close signals.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
    def spider_opened(self, spider):
        print("spider_opened")
        file_table = open('%s_table.json' % spider.name, 'w+b')
        # NOTE(review): self.files starts as an empty dict and no list is ever
        # stored under `spider`, so this lookup raises KeyError before
        # self.exporter1 is assigned - presumably the cause of the
        # "no attribute 'exporter1'" error reported below.
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()
    def spider_closed(self, spider):
        # Finishes both exports and closes every file recorded for the spider.
        print("spider_closed")
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()
    def process_item(self, item, spider):
        # Sends the item to the exporter that matches its type.
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
            return item
        elif isinstance(item, items.UniDetail):
            # NOTE(review): exporter22 is never assigned anywhere in this
            # class; the attribute created in spider_opened is exporter2.
            self.exporter22.export_item(item)
            return item
Error:
2017-12-27 11:41:15 [scrapy.core.scraper] ERROR: Error processing {'country': ('Finland',),
'country_code': ('fi ',),
'u_link': ('http://www.xxxxxxx.xxx/xxxxxxx/xxxx.htm',),
'u': (' pisto',)}
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/website_scrapy/for_icu/for_icu/pipelines.py", line 31, in process_item
self.exporter.export_item(item)
AttributeError: 'ListPipeline' object has no attribute 'exporter1'
Please let me know what I am missing here... being stuck at this from past couple of hours ...

I was unable to get the exporter to work, so I used a simple file writer for the task:
class ListPipeline(object):
    """Collect items in memory and dump them to two JSON files on close.

    ``UniListItem`` items end up in ``<spider.name>_table.json`` and
    ``UniDetail`` items in ``<spider.name>_detail.json``. Both files are
    written in one go when the spider closes.
    """

    def __init__(self):
        # Per-instance buffers: class-level lists would be shared by every
        # pipeline instance in the process.
        self.unilist = []
        self.unidetail = []

    def close_spider(self, spider):
        """Write both buffers to their JSON files and empty them."""
        print("spider_closed")
        # `with` guarantees the files are closed even if serialisation fails.
        with open('%s_table.json' % spider.name, 'w') as file_table:
            file_table.write(json.dumps(self.unilist))
        with open('%s_detail.json' % spider.name, 'w') as file_detail:
            file_detail.write(json.dumps(self.unidetail))
        self.unilist.clear()
        self.unidetail.clear()

    def process_item(self, item, spider):
        """Buffer ``item`` according to its type and return it unchanged."""
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.unilist.append(dict(item))
        elif isinstance(item, items.UniDetail):
            self.unidetail.append(dict(item))
        # Always hand the item on so later pipelines never receive None,
        # including for item types this pipeline does not store.
        return item
This makes me achieve what I want but it would be better if one can use builtin exporters. If someone knows how to make it work, please update.

python class file context manager

I am trying to open a file in a class and close it on exit in this manner.
class PlanetaryImage(object):
    # Class from the question: parses image data from a stream, optionally
    # from a second file named by self.data_filename.
    # NOTE(review): '#classmethod' looks like a mangled '@classmethod'
    # decorator (open takes cls) - confirm against the original post.
    #classmethod
    def open(cls, filename):
        # Opens `filename` in binary mode and builds an instance from it.
        with open(filename, 'rb') as fp:
            return cls(fp, filename)
    def __init__(self, stream, filename=None, memory_layout='DISK'):
        self.filename = filename
        self._parse_data(stream)
    def _parse_data(self, stream):
        # NOTE(review): data_filename, start_byte, format and
        # BAND_STORAGE_TYPE are not defined in this snippet; presumably they
        # are set elsewhere in the real class.
        data_stream = stream
        try:
            # When a separate data file is named, it is resolved relative to
            # the directory of self.filename and read instead of `stream`.
            if self.data_filename is not None:
                dirpath = os.path.dirname(self.filename)
                data_file = os.path.abspath(
                    os.path.join(dirpath, self.data_filename))
                data_stream = open(data_file, 'rb')
            data_stream.seek(self.start_byte)
            # Dispatch to the reader method named for this storage format.
            if self.format in self.BAND_STORAGE_TYPE:
                return getattr(self, self.BAND_STORAGE_TYPE[self.format])(data_stream)
            raise Exception('Unkown format (%s)' % self.format)
        finally:
            # Closes whichever stream was used: the caller's stream or the
            # separately opened data file.
            data_stream.close()
There are certain cases where I have to open one more file in the _parse_data function. I wanted to use with, but the if statements make it difficult. Any suggestions on how to make the try section more Pythonic?

There's no reason for _parse_data to try to open a file. It should be the caller's responsibility to either use PlanetaryImage.open with a file name or to provide an open file handle to __init__. _parse_data should do just one thing: parse the data from its stream argument.
class PlanetaryImage(object):
    """Image whose data is parsed from an already-open binary stream.

    Use ``PlanetaryImage.open(filename)`` to build an instance from a path,
    or pass an open binary stream straight to the constructor.
    """

    @classmethod
    def open(cls, filename):
        """Alternate constructor: open ``filename`` and parse it.

        Inside this method ``open`` still refers to the builtin, because
        class-level names are not visible from a method body.
        """
        with open(filename, 'rb') as fp:
            # filename goes by keyword so it cannot land in memory_layout.
            return cls(fp, filename=filename)

    def __init__(self, stream, memory_layout='DISK', filename=None):
        """Parse ``stream`` immediately.

        Args:
            stream: Open, seekable binary stream holding the image data.
            memory_layout: Accepted for API compatibility; not used here.
            filename: Optional path the stream came from, kept for reference.
        """
        self.filename = filename
        self._parse_data(stream)

    def _parse_data(self, data_stream):
        """Seek to the data and hand the stream to the format's reader.

        Returns:
            Whatever the reader method selected through
            ``BAND_STORAGE_TYPE[self.format]`` returns.

        Raises:
            Exception: if ``self.format`` has no entry in ``BAND_STORAGE_TYPE``.
        """
        # NOTE(review): start_byte, format and BAND_STORAGE_TYPE are not
        # defined in this snippet; presumably the real class provides them.
        try:
            data_stream.seek(self.start_byte)
            if self.format in self.BAND_STORAGE_TYPE:
                return getattr(self, self.BAND_STORAGE_TYPE[self.format])(data_stream)
            raise Exception('Unkown format (%s)' % self.format)
        finally:
            # The stream is closed here as well as by any enclosing `with`;
            # closing an already-closed file is harmless.
            data_stream.close()
Now, there are simply two options for using the class:
with open(filename, 'rb') as fp:
x = PlanetaryImage(fp)
...
or
# A file name must go through the open() constructor; __init__ expects an open stream.
x = PlanetaryImage.open(filename)
....

how to open a csv in universal new line mode through django upload?

I am trying to upload a csv file in a django form:
class CSVUploadForm(forms.Form):
    # Form from the question with a single file field for the CSV upload.
    csv_file = forms.FileField(label='Select a CSV file to import:',)
    def clean(self):
        file_csv = self.cleaned_data['csv_file']
        # Reopens a file from disk by the upload's name in 'rU' (universal
        # newline) mode and parses it with the tab-delimited excel_tab dialect.
        # NOTE(review): the question states this path does not work for an
        # upload that is held in memory.
        records = csv.reader(open('/mypath/'+file_csv.name, 'rU'), dialect=csv.excel_tab)
I need to open the file in universal new line mode. I can do that with "open" method above, but that will not work for this form because the file I am dealing with is an in memory uploaded version of the csv.
How do I pass the universal new line mode flag rU to something like this:
records = csv.reader(file_csv, dialect=csv.excel_tab)
?

You can use str.splitlines() -- which automatically splits on universal line breaks -- in the following manner:
def clean(self):
    """Build a tab-delimited CSV reader over the uploaded file's lines."""
    uploaded = self.cleaned_data['csv_file']
    # splitlines() splits on every line-ending convention, so the reader
    # receives one list entry per row whatever the file used.
    records = csv.reader(uploaded.read().splitlines(), dialect=csv.excel_tab)
If you are worried about the memory cost of creating the lines variable, you can force Django to save the file to a local file on disk changing the FILE_UPLOAD_MAX_MEMORY_SIZE variable in settings.py (more on this variable here):
# add to your settings.py
FILE_UPLOAD_MAX_MEMORY_SIZE = 0
FILE_UPLOAD_TEMP_DIR = '/tmp'
Then process the file from its temporary folder using universal newline mode:
def clean(self):
    """Build a tab-delimited CSV reader over the upload's temporary file."""
    # temporary_file_path is a method of the uploaded-file object; it has to
    # be called to obtain the path string that open() needs.
    tmp_path = self.cleaned_data['csv_file'].temporary_file_path()
    # newline='' passes line endings through untouched so the csv module can
    # recognise \r, \n and \r\n itself; this replaces the 'rU' mode flag,
    # which Python 3.11 removed.
    # The file is left open because the reader pulls rows from it lazily.
    file_csv = open(tmp_path, 'r', newline='')
    records = csv.reader(file_csv, dialect=csv.excel_tab)

The problem with the solution above is that it reads the whole file at once, which makes it unacceptable for processing large CSV files. Also, small CSV files will be saved to disk instead of being kept in memory, which is not great either.
I've created a class to handle new lines
class FileWithUniversalNewLine(object):
    """Iterate over a text file's lines, accepting \\n, \\r\\n and \\r endings.

    The wrapped file is read in 2048-character chunks, so the whole file is
    never held in memory. Iterating the object yields each line without its
    line terminator, which makes it usable as the input of ``csv.reader``.
    """

    def __init__(self, file_obj):
        self.file = file_obj

    def lines(self):
        """Yield the file's lines one by one, without line terminators.

        A final line is yielded even when it has no terminator; a terminator
        at the very end of the file does not produce an extra empty line.
        """
        buff = ""  # incomplete line carried over from the previous chunk
        # True when the previous chunk ended in "\r": it may be the first half
        # of a "\r\n" pair, so it is kept back until the next chunk is read.
        held_cr = False
        while True:
            chunk = self.file.read(2048)
            if not chunk:
                # End of file: a held "\r" terminates the buffered line.
                if held_cr or buff:
                    yield buff
                # A plain return ends the generator; raising StopIteration
                # here would surface as RuntimeError on Python 3.7+.
                return
            if held_cr:
                chunk = "\r" + chunk
                held_cr = False
            if chunk.endswith("\r"):
                chunk = chunk[:-1]
                held_cr = True
            # Convert all new lines into linux new line
            text = buff + chunk.replace("\r\n", "\n").replace("\r", "\n")
            parts = text.split("\n")
            buff = parts.pop()
            for sline in parts:
                yield sline

    def close(self):
        """Close the wrapped file."""
        self.file.close()

    def __exit__(self, *args, **kwargs):
        # Delegates clean-up to the wrapped file's own context manager exit.
        return self.file.__exit__(*args, **kwargs)

    def __enter__(self, *args, **kwargs):
        return self

    def __iter__(self):
        return self.lines()
Usage:
csvfile = FileWithUniversalNewLine(file_csv)
records = csv.reader(csvfile, dialect=csv.excel_tab)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

writing to file using scrapy pipeline - python

Related

valueError: I/O operation on closed file while working on json file

How to convert a normal function into a function inside a class?

scrapy pipeline exporter object is not getting instantiated

python class file context manager

how to open a csv in universal new line mode through django upload?

Categories

Resources