Using read_csv with a home-made object as a 'file' - python

The read_csv doc says that its first parameter can be 'any object with a read() method (such as a file handle or StringIO)'. My question is about how to construct an object that will work in this capacity.
import pandas as pd
file_name = 'plain.txt'
class FileWrap:
def __init__(self, path):
self.file = open(path)
def read(self):
return self.file.readline().rstrip()
filewrap = FileWrap(file_name)
while True:
line = filewrap.read()
if not line:
break
print (line)
df = pd.read_csv(FileWrap(file_name), header=None)
print (df)
The output from this script is this.
The first three lines are simply intended to show that the FileWrap object's read method seems to function as would be expected. The remaining lines serve to show that there's something I don't understand about constructing an object with a read method that pandas can use to receive its input a line at a time. What does a read have to do to make pandas happy?
1,2,3
4,5,6
7,8,9
Traceback (most recent call last):
File "temp.py", line 20, in <module>
df = pd.read_csv(FileWrap(file_name), header=None)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 645, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 388, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 729, in __init__
self._make_engine(self.engine)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 922, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Python34\lib\site-packages\pandas\io\parsers.py", line 1389, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas\parser.pyx", line 535, in pandas.parser.TextReader.__cinit__ (pandas\parser.c:6077)
File "pandas\parser.pyx", line 797, in pandas.parser.TextReader._get_header (pandas\parser.c:9878)
File "pandas\parser.pyx", line 909, in pandas.parser.TextReader._tokenize_rows (pandas\parser.c:11257)
File "pandas\parser.pyx", line 2008, in pandas.parser.raise_parser_error (pandas\parser.c:26804)
TypeError: raise: exception class must be a subclass of BaseException

when pandas call check is_file_like it invalidate does object has read and __iter__ methods, is_file_like, so you can try:
import pandas as pd
file_name = 'plain.txt'
class FileWrap:
def __init__(self, path):
self.file = open(path)
def __iter__(self):
self.file.readline().rstrip()
def read(self, *args, **kwargs):
return self.file.read()
df = pd.read_csv(FileWrap(file_name), header=None)
print (df)

Related

How to set many columns on Pandas Python?

I want to insert more than a hundred columns into a CSV file. But seems pandas library has limited columns.
Here is the error message:
Traceback (most recent call last):
File "metric.py", line 91, in <module>
finalFile(sys.argv[1])
File "metric.py", line 80, in finalFile
data = pd.read_csv(f, header=None, dtype=str)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py", line 688, in read_csv
return _read(filepath_or_buffer, kwds)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py", line 454, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py", line 948, in __init__
self._make_engine(self.engine)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py", line 1180, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py", line 2010, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 540, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
And below is my function:
def finalFile(fname):
output = pd.DataFrame()
for file_name in os.listdir('test/sciprt-temp/'):
if file_name.startswith(fname):
with open(os.path.join('test/sciprt-temp/', file_name)) as f:
data = pd.read_csv(f, header=None, dtype=str)
output[file_name.rsplit('.', 4)[2]] = data[1]
output.insert(0, 'timestamp', dt.datetime.now().timestamp())
output.insert(0, 'hostname', fname.rsplit('-', 3)[0])
output.set_index(output.columns[0], inplace=True)
output.to_csv(fname.rsplit('.', 2)[2] + ".csv")
finalFile(sys.argv[1])
It seems to work fine when inserting few columns but not working with more columns.
hostname,timestamp,-diskstats_latency-sda-avgrdwait-g,-diskstats_latency-sda-avgwait-g,-diskstats_latency-sda-avgwrwait-g,-diskstats_latency-sda-svctm-g,-diskstats_latency-sda_avgwait-g
test.test.com,1617779170.62498,2.7979746835e-03,6.6681051841e-03,7.1533659185e-03,2.5977601795e-04,6.6681051841e-03

django-storage with S3: TypeError: a bytes-like object is required, not 'str'

I have a function that exports a CSV to S3:
def export_csv(report_id):
# MyModel has a JsonField called `json_data`
items = MyModel.objects.all()
data = [x.json_data for x in items]
df = pd.DataFrame(data)
file_buffer = io.StringIO()
# QUOTE_ALL to avoid separators from throwing off columns
df.to_csv(
file_buffer,
index=False,
encoding='utf-8',
quotechar='"',
quoting=csv.QUOTE_ALL,
)
report.report_data_file.save(
"data.csv", ContentFile(file_buffer.getvalue())
)
I'm using the following FieldField:
class PrivateMediaStorage(S3Boto3Storage):
location = settings.AWS_PRIVATE_MEDIA_LOCATION
default_acl = 'private'
file_overwrite = False
custom_domain = False
class Report(BaseModel):
...
report_data_file = models.FileField(
storage=PrivateMediaStorage(),
upload_to=export_location,
null=True,
blank=True,
)
This is working fine in my local environment, where PrivateMediaStorage is defined as
class PrivateMediaStorage(FileSystemStorage):
pass
In production, where I am using django-storages, I am seeing the following error when running this function:
TypeError: a bytes-like object is required, not 'str'
Here is the full traceback:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/site-packages/celery/app/trace.py", line 385, in trace_task
R = retval = fun(*args, **kwargs)
File "/usr/local/lib/python3.8/site-packages/celery/app/trace.py", line 650, in __protected_call__
return self.run(*args, **kwargs)
File "/code/apps/benchmark/tasks.py", line 45, in export_report_data_task
report.export_json_data()
File "/code/apps/benchmark/models.py", line 470, in export_json_data
export_csv(self.id)
File "/code/apps/benchmark/utils/export_report_data.py", line 46, in export_csv
report.report_data_file.save(
File "/usr/local/lib/python3.8/site-packages/django/db/models/fields/files.py", line 87, in save
self.name = self.storage.save(name, content, max_length=self.field.max_length)
File "/usr/local/lib/python3.8/site-packages/django/core/files/storage.py", line 52, in save
return self._save(name, content)
File "/usr/local/lib/python3.8/site-packages/storages/backends/s3boto3.py", line 495, in _save
self._save_content(obj, content, parameters=parameters)
File "/usr/local/lib/python3.8/site-packages/storages/backends/s3boto3.py", line 510, in _save_content
obj.upload_fileobj(content, ExtraArgs=put_parameters)
File "/usr/local/lib/python3.8/site-packages/boto3/s3/inject.py", line 619, in object_upload_fileobj
return self.meta.client.upload_fileobj(
File "/usr/local/lib/python3.8/site-packages/boto3/s3/inject.py", line 539, in upload_fileobj
return future.result()
File "/usr/local/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result
return self._coordinator.result()
File "/usr/local/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result
raise self._exception
File "/usr/local/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main
self._submit(transfer_future=transfer_future, **kwargs)
File "/usr/local/lib/python3.8/site-packages/s3transfer/upload.py", line 558, in _submit
self._submit_multipart_request(
File "/usr/local/lib/python3.8/site-packages/s3transfer/upload.py", line 622, in _submit_multipart_request
for part_number, fileobj in part_iterator:
File "/usr/local/lib/python3.8/site-packages/s3transfer/upload.py", line 270, in yield_upload_part_bodies
fileobj, full_size = self._get_upload_part_fileobj_with_full_size(
File "/usr/local/lib/python3.8/site-packages/s3transfer/upload.py", line 343, in _get_upload_part_fileobj_with_full_size
return six.BytesIO(data), len(data)
TypeError: a bytes-like object is required, not 'str'

how to explode dict (or list of dict) object in multiple column in dask.dataframe

When I try to convert some xml to dataframe using xmltodict it happens that a particular column contains all the info I need as dict or list of dict. I'm able to convert this column in multiple ones with pandas but I'm not able to perform the similar operation in dask.
Is not possible to use meta because I've no idea of all the possible fields that are available in the xml and dask is necessary because the true xml files are bigger than 1Gb each.
example.xml:
<?xml version="1.0" encoding="UTF-8"?>
<itemList>
<eventItem uid="1">
<timestamp>2019-07-04T09:57:35.044Z</timestamp>
<eventType>generic</eventType>
<details>
<detail>
<name>columnA</name>
<value>AAA</value>
</detail>
<detail>
<name>columnB</name>
<value>BBB</value>
</detail>
</details>
</eventItem>
<eventItem uid="2">
<timestamp>2019-07-04T09:57:52.188Z</timestamp>
<eventType>generic</eventType>
<details>
<detail>
<name>columnC</name>
<value>CCC</value>
</detail>
</details>
</eventItem>
</itemList>
Working pandas code:
import xmltodict
import collections
import pandas as pd
def pd_output_dict(details):
detail = details.get("detail", [])
ret_value = {}
if type(detail) in (collections.OrderedDict, dict):
ret_value[detail["name"]] = detail["value"]
elif type(detail) == list:
for i in detail:
ret_value[i["name"]] = i["value"]
return pd.Series(ret_value)
with open("example.xml", "r", encoding="utf8") as f:
df_dict_list = xmltodict.parse(f.read()).get("itemList", {}).get("eventItem", [])
df = pd.DataFrame(df_dict_list)
df = pd.concat([df, df.apply(lambda row: pd_output_dict(row.details), axis=1, result_type="expand")], axis=1)
print(df.head())
Not working dask code:
import xmltodict
import collections
import dask
import dask.bag as db
import dask.dataframe as dd
def dd_output_dict(row):
detail = row.get("details", {}).get("detail", [])
ret_value = {}
if type(detail) in (collections.OrderedDict, dict):
row[detail["name"]] = detail["value"]
elif type(detail) == list:
for i in detail:
row[i["name"]] = i["value"]
return row
with open("example.xml", "r", encoding="utf8") as f:
df_dict_list = xmltodict.parse(f.read()).get("itemList", {}).get("eventItem", [])
df_bag = db.from_sequence(df_dict_list)
df = df_bag.to_dataframe()
df = df.apply(lambda row: dd_output_dict(row), axis=1)
The idea is to have in dask similar result I've in pandas but a the moment I'm receiving errors:
>>> df = df.apply(lambda row: output_dict(row), axis=1)
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 169, in raise_on_meta_error
yield
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\site-packages\dask\utils.py", line 854, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6487, in apply
return op.get_result()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 151, in get_result
return self.apply_standard()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 257, in apply_standard
self.apply_series_generator()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 286, in apply_series_generator
results[i] = self.f(v)
File "<stdin>", line 1, in <lambda>
File "<stdin>", line 4, in output_dict
AttributeError: ("'str' object has no attribute 'get'", 'occurred at index 0')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 3964, in apply
M.apply, self._meta_nonempty, func, args=args, udf=True, **kwds
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 190, in raise_on_meta_error
raise ValueError(msg)
ValueError: Metadata inference failed in `apply`.
You have supplied a custom function and Dask is unable to
determine the type of output that that function returns.
To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.
Original error is below:
------------------------
AttributeError("'str' object has no attribute 'get'", 'occurred at index 0')
Traceback:
---------
File "C:\Anaconda3\lib\site-packages\dask\dataframe\utils.py", line 169, in raise_on_meta_error
yield
File "C:\Anaconda3\lib\site-packages\dask\dataframe\core.py", line 4711, in _emulate
return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File "C:\Anaconda3\lib\site-packages\dask\utils.py", line 854, in __call__
return getattr(obj, self.method)(*args, **kwargs)
File "C:\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6487, in apply
return op.get_result()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 151, in get_result
return self.apply_standard()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 257, in apply_standard
self.apply_series_generator()
File "C:\Anaconda3\lib\site-packages\pandas\core\apply.py", line 286, in apply_series_generator
results[i] = self.f(v)
File "<stdin>", line 1, in <lambda>
File "<stdin>", line 4, in output_dict
Right, so operations like map_partitions will need to know the column names and data types. As you've mentioned, you can specify this with the meta= keyword.
Perhaps you can run through your data once to compute what these will be, and then construct a proper meta object, and pass that in? This is inefficient, and requires reading through all of your data, but I'm not sure that there is another way.

Read excel file got TypeError: int() argument must be a string, not '_NoValueType' in Pandas

I read an excel file with df = pd.read_csv(encoding = 'utf-8', engine = 'python') while print(df) I got the following TypeError:
Traceback (most recent call last):
File "C:\Users\User\Anaconda3\lib\site-packages\IPython\core\formatters.py", line 702, in __call__
printer.pretty(obj)
File "C:\Users\User\Anaconda3\lib\site-packages\IPython\lib\pretty.py", line 400, in pretty
return _repr_pprint(obj, self, cycle)
File "C:\Users\User\Anaconda3\lib\site-packages\IPython\lib\pretty.py", line 695, in _repr_pprint
output = repr(obj)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\base.py", line 82, in __repr__
return str(self)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\base.py", line 61, in __str__
return self.__unicode__()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\frame.py", line 663, in __unicode__
line_width=width, show_dimensions=show_dimensions)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\frame.py", line 1971, in to_string
formatter.to_string()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\io\formats\format.py", line 620, in to_string
max_len = Series(text).str.len().max()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\generic.py", line 9611, in stat_func
numeric_only=numeric_only)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\series.py", line 3221, in _reduce
return op(delegate, skipna=skipna, **kwds)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\nanops.py", line 131, in f
result = alt(values, axis=axis, skipna=skipna, **kwds)
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\nanops.py", line 507, in reduction
result = getattr(values, meth)(axis)
File "C:\Users\User\Anaconda3\lib\site-packages\numpy\core\_methods.py", line 28, in _amax
return umr_maximum(a, axis, None, out, keepdims, initial)
TypeError: int() argument must be a string, a bytes-like object or a number, not '_NoValueType'
When read column names I get Index(['names\t', 'matched_names', 'ratio'], dtype='object').
If someone knows what caused this typeerror, please help me. Thanks at advance.
Update: for your better understanding, I added the code how can I generate this csv file:
with open('result.csv', 'w', encoding = 'utf_8_sig') as f1:
writer = csv.writer(f1, delimiter='\t', lineterminator='\n', )
writer.writerow(('names', 'matched_names', 'ratio'))
for dish1, dish2 in itertools.combinations(enumerate(processedDishes), 2):
matcher = matchers[dish1[0]]
matcher.set_seq2(dish2[1])
ratio = int(round(100 * matcher.ratio()))
if ratio >= threshold_ratio:
#print(dishes[dish1[0]], dishes[dish2[0]])
my_list = (dishes[dish1[0]], dishes[dish2[0]], ratio)
print(my_list)
writer.writerow([my_list])
my_list has rows with format as follows: ('steve', 'john', 0)
With df.info()
Traceback (most recent call last):
File "<ipython-input-142-83941e9879da>", line 1, in <module>
df.info()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2252, in info
_verbose_repr()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2210, in _verbose_repr
counts = self.count()
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\frame.py", line 6800, in count
result = Series(counts, index=frame._get_agg_axis(axis))
File "C:\Users\User\Anaconda3\lib\site-packages\pandas\core\series.py", line 262, in __init__
.format(val=len(data), ind=len(index)))
ValueError: Length of passed values is 0, index implies 3

"pandas.io.common.EmptyDataError: No columns to parse from file" after moving to mac

In Windows 8, the script works fine. After I moved script and data.csv to work in my mac, I keep getting error: "pandas.io.common.EmptyDataError: No columns to parse from file."
The script and data are in the same folder as
"/Users/myname/Downloads/test/testimport.py"
"/Users/myname/Downloads/test/test2.csv"
I've tried many file locations to read the csv but nothing works.
file_loc = "../test/test2.csv"
# as well as "../test2.csv", "/test2.csv", "/Users/myname/Downloads/test/test2.csv"
import pandas as pd
df = pd.read_csv(file_loc)
exp_mat = df.as_matrix()
print exp_mat
How can I read the csv here? Is it wrong location problem or is csv filetype in mac not compatible?
Here is OS X El Capitan. Full error is
h143% python testimport.py
Traceback (most recent call last):
File "test_importexcel.py", line 24, in <module>
df = pd.read_csv(file_loc)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 389, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 730, in __init__
self._make_engine(self.engine)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 923, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 1390, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas/parser.pyx", line 538, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:6171)
pandas.io.common.EmptyDataError: No columns to parse from file
Data (copying from Number is like)
x time value
445.1207 0.003626 21935450
445.1203 0.011099 36700932
445.1203 0.017235 35722172
445.1203 0.022958 33623668
445.1203 0.028689 33500360
352.3396 37.180567 307886720
352.3396 37.185836 303264100
352.3396 37.191101 292523810

Categories