How to delete unwanted data from a CSV file in Python

import pandas as pd
sea_level_df = pd.read_csv(r"C:\Users\slaye\OneDrive\Desktop\SeaLevel.csv")
display(sea_level_df)
I'm trying to delete the first 3 rows of this file without literally highlighting the unwanted text in the actual file and pressing backspace. Is there a way I can do this in python?
this is the top of the csv file:
#title = mean sea level anomaly global ocean (66S to 66N) (Annual signals retained)
#institution = NOAA/Laboratory for Satellite Altimetry
#references = NOAA Sea Level Rise
year,TOPEX/Poseidon,Jason-1,Jason-2,Jason-3
1992.9614,-16.27000,
1992.9865,-17.97000,
1993.0123,-14.87000,
1993.0407,-19.87000,
1993.0660,-25.27000,
1993.0974,-29.37000,
I want to delete the first 3 hashed rows of text so I can parse this into a table in pandas. I'm getting the following error:
ParserError Traceback (most recent call last)
Input In [14], in <cell line: 2>()
1 import pandas as pd
----> 2 sea_level_df = pd.read_csv(r"C:\Users\slaye\OneDrive\Desktop\SeaLevel.csv")
3 display(sea_level_df)
File ~\anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File ~\anaconda3\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
665 kwds_defaults = _refine_defaults_read(
666 dialect,
667 delimiter,
(...)
676 defaults={"delimiter": ","},
677 )
678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)
File ~\anaconda3\lib\site-packages\pandas\io\parsers\readers.py:581, in _read(filepath_or_buffer, kwds)
578 return parser
580 with parser:
--> 581 return parser.read(nrows)
File ~\anaconda3\lib\site-packages\pandas\io\parsers\readers.py:1254, in TextFileReader.read(self, nrows)
1252 nrows = validate_integer("nrows", nrows)
1253 try:
-> 1254 index, columns, col_dict = self._engine.read(nrows)
1255 except Exception:
1256 self.close()
File ~\anaconda3\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py:225, in CParserWrapper.read(self, nrows)
223 try:
224 if self.low_memory:
--> 225 chunks = self._reader.read_low_memory(nrows)
226 # destructive to chunks
227 data = _concatenate_chunks(chunks)
File ~\anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:805, in pandas._libs.parsers.TextReader.read_low_memory()
File ~\anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:861, in pandas._libs.parsers.TextReader._read_rows()
File ~\anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:847, in pandas._libs.parsers.TextReader._tokenize_rows()
File ~\anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:1960, in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 5

From the read_csv documentation, you can use skiprows = 3 to ignore the first 3 rows of the file.
Otherwise, pandas just reads your csv from the top down and assumes that all rows will follow the pattern of the first row. It doesn't see any delimiters (comma, tab, etc.) in the first row, so it assumes your data only has one column. The next few rows follow this same pattern (no delimiters = 1 column), then all of a sudden, there's a comma in the 4th row! Pandas sees this as a delimiter (which would indicate more than one column), but since there weren't any in the first rows, it thinks there should only be one column for the whole csv, so it throws the error.
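For example, with the file and path from the question:

import pandas as pd

# skip the three "#" metadata lines so the fourth line (year,TOPEX/Poseidon,...) becomes the header
sea_level_df = pd.read_csv(r"C:\Users\slaye\OneDrive\Desktop\SeaLevel.csv", skiprows=3)
display(sea_level_df)

read_csv also has a comment='#' option that treats lines starting with # as comments, which should work here as well.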

Related

How to save a large pandas dataframe with complex arrays and load it up again?

I have a large pandas DataFrame with individual elements that are complex numpy arrays. Please see below a minimal code example to reproduce the scenario:
import numpy as np
import pandas as pd

d = {f'x{i}': [] for i in range(4)}
df = pd.DataFrame(data=d).astype(object)
for K in range(4):
    for i in range(4):
        df.loc[f'{K}', f'x{i}'] = np.random.random(size=(2, 2)) + np.random.random(size=(2, 2)) * 1j
df
What is the best way to save these and load them up again for use later?
The problem I am having is that when I increase the size of the matrices stored and the number of elements, I get an OverflowError when I try to save it as .h5 file as shown below:
import numpy as np
import pandas as pd

size = (300, 300)
xs = 1500
d = {f'x{i}': [] for i in range(xs)}
df = pd.DataFrame(data=d).astype(object)
for K in range(10):
    for i in range(xs):
        df.loc[f'{K}', f'x{i}'] = np.random.random(size=size) + np.random.random(size=size) * 1j
df.to_hdf('test.h5', key="df", mode="w")
load_test = pd.read_hdf("test.h5", "df")
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
<ipython-input-124-8cb8df1a0653> in <module>
12 df.loc[f'{K}', f'x{i}'] = np.random.random(size=size) + np.random.random(size=size) * 1j
13
---> 14 df.to_hdf('test.h5', key="df", mode="w")
15
16
~/PQKs/pqks/lib/python3.6/site-packages/pandas/core/generic.py in to_hdf(self, path_or_buf, key, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
2447 data_columns=data_columns,
2448 errors=errors,
-> 2449 encoding=encoding,
2450 )
2451
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
268 path_or_buf, mode=mode, complevel=complevel, complib=complib
269 ) as store:
--> 270 f(store)
271 else:
272 f(path_or_buf)
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in <lambda>(store)
260 data_columns=data_columns,
261 errors=errors,
--> 262 encoding=encoding,
263 )
264
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in put(self, key, value, format, index, append, complib, complevel, min_itemsize, nan_rep, data_columns, encoding, errors, track_times)
1127 encoding=encoding,
1128 errors=errors,
-> 1129 track_times=track_times,
1130 )
1131
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, axes, index, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, nan_rep, data_columns, encoding, errors, track_times)
1799 nan_rep=nan_rep,
1800 data_columns=data_columns,
-> 1801 track_times=track_times,
1802 )
1803
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in write(self, obj, **kwargs)
3189 # I have no idea why, but writing values before items fixed #2299
3190 blk_items = data.items.take(blk.mgr_locs)
-> 3191 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3192 self.write_index(f"block{i}_items", blk_items)
3193
~/PQKs/pqks/lib/python3.6/site-packages/pandas/io/pytables.py in write_array(self, key, value, items)
3047
3048 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
-> 3049 vlarr.append(value)
3050
3051 elif empty_array:
~/PQKs/pqks/lib/python3.6/site-packages/tables/vlarray.py in append(self, sequence)
526 nparr = None
527
--> 528 self._append(nparr, nobjects)
529 self.nrows += 1
530
~/PQKs/pqks/lib/python3.6/site-packages/tables/hdf5extension.pyx in tables.hdf5extension.VLArray._append()
OverflowError: value too large to convert to int
As noted in the similar issue https://stackoverflow.com/a/57133759/8896855, HDF5 (.h5) files carry more overhead and are intended for storing many dataframes in a single file, rather than for optimizing one large frame. Feather and parquet will likely provide a better solution for saving and reloading a single large dataframe as an in-memory object. As for the specific OverflowError, it most likely comes from storing large mixed-type values (the numpy arrays) in "object"-dtype columns. One (more complicated) option would be to split the arrays in your dataframe into separate columns, but that's probably unnecessary.
A general quick fix would be to use df.to_pickle(r'path_to/filename.pkl'), but to_feather or to_parquet likely present more optimized solutions.
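For instance, a minimal pickle round trip (the file name here is just a placeholder):

# pickle keeps the object-dtype cells (complex numpy arrays) intact
df.to_pickle('complex_frame.pkl')           # placeholder file name
df_loaded = pd.read_pickle('complex_frame.pkl')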

Reading .txt file using pandas and slicing it based on some range in one column [duplicate]

This question already has answers here:
Trouble opening files in python with VS Code
(3 answers)
Python Pandas: Boolean indexing on multiple columns [duplicate]
(2 answers)
Closed 2 years ago.
I have multiple .txt files that are full of rubbish data and only need a portion of it based on some range that changes between files. I'm still learning Python and not very experienced.
I am using VS code 1.50 and Python 3.8.1
Sample of my data: https://pastebin.com/kZm1spnz
My first issue is with reading the .txt file; here is what I did at first:
import pandas as pd
import os
#Reading my data
Data = pd.read_csv('Data_01.txt')
I don't understand why it gives an error even though the python script is in the same folder as the .txt file.
Error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-28-436477220532> in <module>
3
4 #Reading my data
----> 5 Data = pd.read_csv("Data_01.txt", sep="\t", names=["Depth", "Porosity"])
~\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer, kwds)
687
688
~\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
450
451 # Create the parser.
--> 452 parser = TextFileReader(fp_or_buf, **kwds)
453
454 if chunksize or iterator:
~\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
934 self.options["has_index_names"] = kwds["has_index_names"]
935
--> 936 self._make_engine(self.engine)
937
938 def close(self):
~\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1166 def _make_engine(self, engine="c"):
1167 if engine == "c":
-> 1168 self._engine = CParserWrapper(self.f, **self.options)
1169 else:
1170 if engine == "python":
~\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
1996 kwds["usecols"] = self.usecols
1997
-> 1998 self._reader = parsers.TextReader(src, **kwds)
1999 self.unnamed_cols = self._reader.unnamed_cols
2000
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] No such file or directory: 'Data_01.txt'
I fixed it by using the full path of my data file, as follows, but I don't understand why the full path is needed:
import pandas as pd
import os
#Reading my data
Data = pd.read_csv(r"C:\Users\User\Desktop\Projects\SDP\Data_01.txt", sep="\t", names=["Depth", "Porosity"])
Now, when slicing my data, I did not want to use indices (i.e., iloc and loc), so that my code stays readable, easy to manipulate, and easy to reapply to the other files, maybe with a for loop to sweep through them all in one run. So I tested first by using the following:
Data_result_1 = Data[Data['Depth'] >= 7711]
This works. However, I wish to add a second condition on the same line so that it stops at Depth = 7786, i.e., the end of my range. That does not work; here is the code I wrote that failed:
Data_result_1 = Data[Data['Depth'] >= 7711 and Data['Depth'] <= 7786]
Is there a way to use the nested conditions without creating a new line of code? I was able to reach my desired result, but it feels unnecessary and, to be frank, ugly. Here is what works:
Data_result_1 = Data[Data['Depth'] >= 7711 ]
Data_result_1 = Data_result_1[Data_result_1['Depth'] <= 7786]
You should use & instead of and. Python's and tries to reduce each Series to a single boolean, which is ambiguous for a whole column, while & compares element-wise; the parentheses are needed because & binds more tightly than the comparison operators:
Data_result_1 = Data[(Data['Depth'] >= 7711) & (Data['Depth'] <= 7786)]
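An equivalent one-liner, if you prefer, is Series.between, which is inclusive on both ends by default:

Data_result_1 = Data[Data['Depth'].between(7711, 7786)]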

Python - Save dataframe to CSV "too many indices for array" error

I am trying to save a dataframe as CSV and get a "too many indices for array" error. The code used for the save is-
df.to_csv('CCS_Matrix.csv')
The dataframe looks like this:
   Var10  Var100  Var101
0      0       1       1
1      0       0       1
2      0       1       0
There are 250 columns and about 10 million rows in the dataset.
The dtypes for the dataframe are
Var10 int64
Var100 int64
Var101 int64
etc.
All the dtypes are the same for the 250 columns.
Here is the full output of the error message
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-16-37cbe55e6c0d> in <module>()
----> 1 df.to_csv('CCS_Matrix.csv', encoding='utf-8')
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)
1401 doublequote=doublequote,
1402 escapechar=escapechar, decimal=decimal)
-> 1403 formatter.save()
1404
1405 if path_or_buf is None:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in save(self)
1590 self.writer = csv.writer(f, **writer_kwargs)
1591
-> 1592 self._save()
1593
1594 finally:
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save(self)
1691 break
1692
-> 1693 self._save_chunk(start_i, end_i)
1694
1695 def _save_chunk(self, start_i, end_i):
~/anaconda3/lib/python3.6/site-packages/pandas/io/formats/format.py in _save_chunk(self, start_i, end_i)
1705 decimal=self.decimal,
1706 date_format=self.date_format,
-> 1707 quoting=self.quoting)
1708
1709 for col_loc, col in zip(b.mgr_locs, d):
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in to_native_types(self, slicer, na_rep, quoting, **kwargs)
611 values = self.values
612 if slicer is not None:
--> 613 values = values[:, slicer]
614 mask = isnull(values)
615
~/anaconda3/lib/python3.6/site-packages/pandas/core/sparse/array.py in __getitem__(self, key)
417 return self._get_val_at(key)
418 elif isinstance(key, tuple):
--> 419 data_slice = self.values[key]
420 else:
421 if isinstance(key, SparseArray):
IndexError: too many indices for array
Could you print out type(df)?
I've noted this problem in SparseDataFrames here.
I was able to solve the problem by calling .to_dense() on the SparseDataFrame, yielding a traditional DataFrame. Worked fine after that. Clearly that's not ideal for memory reasons, but at least it works in the short term.
The pandas team has responded that it is indeed a bug.
You can also try saving as csv with another option, .toCSV('name.csv'); that gives a different error message ('SparseDataFrame' object has no attribute 'toCSV'), which shows that the dataframe is actually a SparseDataFrame. So the problem was solved by turning the dataframe into a dense dataframe first:
df.to_dense().to_csv("submission.csv", index = False, sep=',', encoding='utf-8')
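In newer pandas versions (1.0 and later), where SparseDataFrame no longer exists and sparse data lives in sparse-dtype columns instead, the equivalent conversion would presumably go through the sparse accessor, roughly:

# assumes every column of df has a sparse dtype (pandas >= 1.0)
dense_df = df.sparse.to_dense()
dense_df.to_csv("submission.csv", index=False, sep=',', encoding='utf-8')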

Pandas returns "Passed header names mismatches usecols" error

The following works as expected. There are 190 columns that are all read in perfectly.
pd.read_csv("data.csv",
header=None,
names=columns,
# usecols=columns[:10],
nrows=10
)
I have used the usecols argument before, so I am perplexed as to why this is no longer working for me. I would guess that simply slicing the first 10 column names would trivially work, but I continue to get the "Passed header names mismatches usecols" error.
I am using pandas 0.16.2.
pd.read_csv("data.csv",
header=None,
names=columns,
usecols=columns[:10],
nrows=10
)
Traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-44> in <module>()
3 nrows=10,
4 header=None,
----> 5 names=columns,
6 )
/.../lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
472 skip_blank_lines=skip_blank_lines)
473
--> 474 return _read(filepath_or_buffer, kwds)
475
476 parser_f.__name__ = name
/.../lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
248
249 # Create the parser.
--> 250 parser = TextFileReader(filepath_or_buffer, **kwds)
251
252 if (nrows is not None) and (chunksize is not None):
/.../lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
564 self.options['has_index_names'] = kwds['has_index_names']
565
--> 566 self._make_engine(self.engine)
567
568 def _get_options_with_defaults(self, engine):
/.../m9tn/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
703 def _make_engine(self, engine='c'):
704 if engine == 'c':
--> 705 self._engine = CParserWrapper(self.f, **self.options)
706 else:
707 if engine == 'python':
/.../lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
1070 kwds['allow_leading_cols'] = self.index_col is not False
1071
-> 1072 self._reader = _parser.TextReader(src, **kwds)
1073
1074 # XXX
pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4732)()
pandas/parser.pyx in pandas.parser.TextReader._get_header (pandas/parser.c:7330)()
ValueError: Passed header names mismatches usecols
It turns out there were 191 columns in the dataset (not 190). Pandas automatically set my first column of data as the index. I don't quite know why that caused it to error out, since all of the columns in usecols were in fact present in the parsed dataset.
So, the solution is to confirm that the number of columns in names exactly corresponds to the number of columns in your dataset.
Also, I found this discussion on GitHub.
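A quick sanity check along those lines (a sketch; "data.csv" and columns are the names from the question) is to let pandas count the columns itself before passing names/usecols:

import pandas as pd

# read a single row with no names to see how many columns the file really has
ncols = pd.read_csv("data.csv", header=None, nrows=1).shape[1]
print(ncols, len(columns))  # these two numbers should match before using names=/usecols=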
For anyone out there debugging this error, it can also be caused if you forget a trailing comma in your list of column names. e.g.:
columns = [
'industry',
'amount'
'date',
...
]
Python will concatenate the adjacent string literals 'amount' and 'date' into a single 'amountdate', and of course the number of column names will then be one lower than you expect.
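A minimal demonstration of that implicit string-literal concatenation:

columns = ['industry', 'amount' 'date']  # missing comma between 'amount' and 'date'
print(columns)  # ['industry', 'amountdate'] -- only two names instead of three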
I had the same problem. After finding this discussion, I verified that the delimiter in my file is ';'. Using sep=';' in the read_csv call solved the issue.
In addition to @Raluar's answer about sep=';': pass sep='<your delimiter>' right after the file path argument in pd.read_csv(). Otherwise pandas reads the whole dataframe as a single column and cannot match usecols=columns[:10] against the names given in names=columns, for the reason mentioned above.
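Putting that together with the call from the question (';' below is just an example; use whatever delimiter your file actually contains):

pd.read_csv("data.csv",
            sep=";",
            header=None,
            names=columns,
            usecols=columns[:10],
            nrows=10
            )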

possible inconsistency in text handling of pandas read_table() function

In a previous post, I found out that pandas' read_table() function can handle variable-length whitespace as a delimiter if you use the read_table('datafile', sep=r'\s*') construction. While this works great for many of my files, it does not work for others despite their being highly similar.
EDIT:
I had posted examples that could not replicate the problem when others tried them. So I am posting links to the original files for AY907538 and AY942707, as well as leaving the error message that I cannot manage to solve.
from pandas import read_table

## filename: AY942707
# this will load with no problem
data = read_table('AY942707.hmmdomtblout', header=None, skiprows=3, sep=r'\s*')

## filename: AY907538
data = read_table('AY907538.hmmdomtblout', header=None, skiprows=3, sep=r'\s*')
which will generate the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-7-131d10d1fb1d> in <module>()
2
3 #temp = get_dataset('AY907538.hmmdomtblout')
----> 4 data = read_table('AY907538.hmmdomtblout', header=None, skiprows=3, sep=r'\s*')
5 #data = read_table('AY942707.hmmdomtblout', header=None, skiprows=3, sep=r'\s*')
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/parsers.pyc in read_table(filepath_or_buffer, sep, dialect, header, index_col, names, skiprows, na_values, thousands, comment, parse_dates, keep_date_col, dayfirst, date_parser, nrows, iterator, chunksize, skip_footer, converters, verbose, delimiter, encoding, squeeze)
282 kwds['encoding'] = None
283
--> 284 return _read(TextParser, filepath_or_buffer, kwds)
285
286 #Appender(_read_fwf_doc)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(cls, filepath_or_buffer, kwds)
189 return parser
190
--> 191 return parser.get_chunk()
192
193 #Appender(_read_csv_doc)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/parsers.pyc in get_chunk(self, rows)
779 msg = ('Expecting %d columns, got %d in row %d' %
780 (col_len, zip_len, row_num))
--> 781 raise ValueError(msg)
782
783 data = dict((k, v) for k, v in izip(self.columns, zipped_content))
ValueError: Expecting 26 columns, got 28 in row 6
The last field, description of target, holds multiple words in both files. Since whitespace is used as the separator, description of target is not treated as a single column by read_table; each word in this field ends up in its own column. In AY942707 the first description of target happens to hold more words than on any of the other lines, which is not the case in AY907538. read_table determines the number of columns from the first line, and all following lines must have that many columns or fewer.
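One way to work around this, sketched below, is to split each line manually with a bounded number of splits so the free-text description of target stays in one field. The count of 22 fixed columns is an assumption based on the usual hmmer domtblout layout; adjust it if your files differ:

import pandas as pd

N_FIXED = 22  # assumed number of fixed whitespace-separated fields before the description

rows = []
with open('AY907538.hmmdomtblout') as fh:
    for line in fh:
        if line.startswith('#'):  # skip comment/header lines
            continue
        # split at most N_FIXED times so everything after stays in the last field
        rows.append(line.rstrip('\n').split(None, N_FIXED))

data = pd.DataFrame(rows)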
