load text file with separate columns in python pandas - python

I have a text file that looks like this:
# Pearson correlation [n=344 #col=2]
# Name Name Value BiasCorr 2.50% 97.50% N: 2.50% N:97.50%
# --------------- --------------- -------- -------- -------- -------- -------- --------
101_DGCA3.1D[0] 101_LEC.1D[0] +0.85189 +0.85071 +0.81783 +0.87777 +0.82001 +0.87849
I have loaded it into python pandas using the following code:
import pandas as pd
data = pd.read_table('test.txt')
print data
However, I can't seem to access the different columns separately. I have tried using sep=' ' and even copying the exact run of spaces between the columns in the text file, but I still don't get any column names, and trying to print data[0] gives me an error:
Traceback (most recent call last):
File "cut_afni_output.py", line 3, in <module>
print data[0]
File "/home/user/anaconda2/lib/python2.7/site-packages/pandas/core/frame.py", line 1969, in __getitem__
return self._getitem_column(key)
File "/home/user/anaconda2/lib/python2.7/site-packages/pandas/core/frame.py", line 1976, in _getitem_column
return self._get_item_cache(key)
File "/home/user/anaconda2/lib/python2.7/site-packages/pandas/core/generic.py", line 1091, in _get_item_cache
values = self._data.get(item)
File "/home/user/anaconda2/lib/python2.7/site-packages/pandas/core/internals.py", line 3211, in get
loc = self.items.get_loc(item)
File "/home/user/anaconda2/lib/python2.7/site-packages/pandas/core/index.py", line 1759, in get_loc
return self._engine.get_loc(key)
File "pandas/index.pyx", line 137, in pandas.index.IndexEngine.get_loc (pandas/index.c:3979)
File "pandas/index.pyx", line 157, in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)
File "pandas/hashtable.pyx", line 668, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)
File "pandas/hashtable.pyx", line 676, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)
KeyError: 0
I haven't been able to set the header row manually because it seems like python views the whole thing as one column. How do I make the text file be read in as separate columns that I can call?

Try this:
In [33]: df = pd.read_csv(filename, comment='#', header=None, delim_whitespace=True)
In [34]: df
Out[34]:
0 1 2 3 4 5 6 7
0 101_DGCA3.1D[0] 101_LEC.1D[0] 0.85189 0.85071 0.81783 0.87777 0.82001 0.87849
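If you also want columns you can call by name, one option (just a sketch; the column names below are made up from the commented header line, so adjust them to taste) is to pass names= to read_csv:
import pandas as pd

# hypothetical names based on the commented header line in test.txt
cols = ['name_a', 'name_b', 'value', 'bias_corr', 'ci_lo', 'ci_hi', 'n_ci_lo', 'n_ci_hi']
df = pd.read_csv('test.txt', comment='#', header=None, delim_whitespace=True, names=cols)
print(df['value'])  # columns are now addressable by name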

Related

Set single value on pandas multiindex dataframe

With a single-index dataframe, we can use loc to get, set, and change values:
>>> df=pd.DataFrame()
>>> df.loc['A',1]=1
>>> df
1
A 1.0
>>> df.loc['A',1]=2
>>> df.loc['A',1]
2.0
However, with a multiindex dataframe, loc can get and change existing values:
>>> df=pd.DataFrame([['A','B',1]])
>>> df=df.set_index([0,1])
>>> df.loc[('A','B'),2]
1
>>> df.loc[('A','B'),2]=3
>>> df.loc[('A','B'),2]
3
but setting a value on an empty dataframe seems to fail:
>>> df=pd.DataFrame()
>>> df.loc[('A','B'),2]=3
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 688, in __setitem__
indexer = self._get_setitem_indexer(key)
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 630, in _get_setitem_indexer
return self._convert_tuple(key, is_setter=True)
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 754, in _convert_tuple
idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 1212, in _convert_to_indexer
return self._get_listlike_indexer(key, axis, raise_missing=True)[1]
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 1266, in _get_listlike_indexer
self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
File "C:\Program Files\Python39\lib\site-packages\pandas\core\indexing.py", line 1308, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index(['A', 'B'], dtype='object')] are in the [index]"
Why is this, and what is the "right" way to use loc to set a single value in a multiindex dataframe?
This fails because the index of the empty DataFrame doesn't have the number of levels the MultiIndex key expects.
You need to initialize an empty DataFrame with the correct number of levels, for example using pandas.MultiIndex.from_arrays:
idx = pd.MultiIndex.from_arrays([[],[]])
df = pd.DataFrame(index=idx)
df.loc[('A','B'), 2] = 3
Output:
2
A B 3.0
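A slightly fuller sketch of the same idea (the level names here are invented for illustration): once the empty index has two levels, loc can keep enlarging the frame:
import pandas as pd

idx = pd.MultiIndex.from_arrays([[], []], names=['letter', 'word'])  # two empty levels
df = pd.DataFrame(index=idx)
df.loc[('A', 'B'), 2] = 3  # setting with enlargement now works
df.loc[('C', 'D'), 2] = 5
print(df)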

pandas filter dataframe based on chained splits

I have a pandas dataframe which contains a column (column name filenames) with filenames. The filenames look something like:
long_file1_name_0.jpg
long_file2_name_1.jpg
long_file3_name_0.jpg
...
To filter, I do this (let's say `select_string = "0"`):
df_fp = df_fp[~df_fp["filenames"].str.split(".jpg")[0].split("_")[-1]==select_string]
but I get thrown this:
Traceback (most recent call last):
File "/file/location/dir/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2889, in get_loc
return self._engine.get_loc(casted_key)
File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 97, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1032, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1039, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "python_file.py", line 118, in <module>
main()
File "inference.py", line 57, in main
_=some_function(config_dict=config_dict, logger=logger, select_string=config_dict['global']['select_string'])
File "/file/location/dir/etc/fprint/dataloaders.py", line 31, in some_function2
logger=logger, select_string=select_string)
File "/file/location/dir/etc/fprint/preprocess.py", line 25, in df_preprocess
df_fp = df_fp[~df_fp["filenames"].str.split(".jpg")[0].split("_")[-1]==select_string]
File "/file/location/dir/lib/python3.7/site-packages/pandas/core/series.py", line 882, in __getitem__
return self._get_value(key)
File "/file/location/dir/lib/python3.7/site-packages/pandas/core/series.py", line 991, in _get_value
loc = self.index.get_loc(label)
File "/file/location/dir/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2891, in get_loc
raise KeyError(key) from err
KeyError: 0
I think it does not like me chaining the splits, but I vaguely remember doing this sometime ago and it did work.. so, I am perplexed why it throws this error.
PS: I do know how to solve this using .contains, but I would like to use this approach of comparing strings.
Any pointers would be great!
Here is another way, with .str.extract():
import pandas as pd
df = pd.DataFrame({'filename': ['long_file1_name_0.jpg',
'long_file2_name_1.jpg',
'long_file3_name_0.jpg',
'long_file3_name_33.jpg',]
})
Now, create a boolean mask. The squeeze() method ensures we end up with a Series, so the mask will work:
mask = (df['filename'].str.extract(r'\w+_(\d+)\.jpg')
                      .astype(int)
                      .eq(0)
                      .squeeze())
print(df.loc[mask])
filename
0 long_file1_name_0.jpg
2 long_file3_name_0.jpg
Assuming all rows contain .jpg; if not, split on . instead:
select_string = str(0)  # select_string should be of type str
df_fp=df_fp[df_fp["filenames"].apply(lambda x: x.split(".jpg")[0].split("_")[-1]).astype(str)==select_string]
This part:
df_fp["filenames"].str.split(".jpg")[0]
returns the first row of the resulting Series (indexing by label 0), not the first element of each split list.
What you are looking for is the expand parameter (it creates a new column for every element in the list after the split); use rsplit with n=1 so the last piece always lands in column 1:
df[df['filenames'].str.split('.jpg', expand=True)[0].str.rsplit('_', n=1, expand=True)[1] == '0']
Alternatively you could do that via apply:
df[df['filenames'].apply(lambda x: x.split('.jpg')[0].split('_')[-1]) == '0']
but contains is definitely more appropriate here.
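For completeness, a rough sketch of the contains/endswith route the last answer alludes to (variable names are taken from the question; whether to negate the mask with ~ depends on which rows you want to keep):
select_string = '0'
mask = df_fp['filenames'].str.endswith('_' + select_string + '.jpg')
df_fp = df_fp[~mask]  # drop rows whose trailing number matches; omit ~ to keep them instead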

ERROR (glitch in pandas?): Why can't python retrieve the values of a pandas.core.series.Series when its indexes are numbers rather than strings?

I'll get straight to the point. Everyone knows that a column, say col = df['field'], is a 'pandas.core.series.Series', and that counts = df['field'].value_counts() is also a 'pandas.core.series.Series' data type.
You can also extract the value of the first row of a 'pandas.core.series.Series' with brackets: col[0] or counts[0].
Nonetheless, the indexes of col and counts are different, and this is what I think is the root of the problem I'm about to present.
I have the following 'pandas.core.series.Series' objects, generated by the following code:
We read the data frame as df
df = pd.read_csv('file.csv')
df has 'year' and 'product' columns; I get their unique values and transform them into strings:
vals_year = df['year'].astype('str').unique()
vals_product = df['product'].astype('str').unique()
This is the content in each variable:
>>>vals_year
>>>['16' '18' '17']
>>> vals_product
>>>['card' 'cash']
Then I use the value_counts() method to count values and create 'pandas.core.series.Series' objects:
cy = df['year'].value_counts()
cp = df['product'].value_counts()
This is the output:
>>>cy
>>>16 65
17 40
18 12
Name: year, dtype: int64
>>>cp
>>>card 123
cash 106
Name: product, dtype: int64
Here is the first value of cp:
>>>cp[0]
>>>123
But when I try to see the first value from cy this happens:
>>>cy[0]
Traceback (most recent call last):
File "C:.../Test3.py", line 44, in <module>
print(cr[0])
File "C:\...\venv\lib\site-packages\pandas\core\series.py", line 1064, in __getitem__
result = self.index.get_value(self, key)
File "C:\...\venv\lib\site-packages\pandas\core\indexes\base.py", line 4723, in get_value
return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 88, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 992, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 998, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
(I just copy-pasted the message.)
Why does this happen? It makes no sense!! Is this a glitch in pandas? I believe the problem resides, as I said before, in the fact that the original values in the 'year' column were ints.
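To illustrate that point about the int index (this is not from the original post, just a minimal sketch): value_counts() on an int column yields an int-labelled index, so cy[0] is interpreted as a label lookup; positional access needs iloc. In the older pandas used here, cp[0] only works because a non-integer index falls back to positional access, a fallback newer versions deprecate.
import pandas as pd

cy = pd.Series([16, 16, 17]).value_counts()      # index holds ints (16, 17)
cp = pd.Series(['card', 'cash']).value_counts()  # index holds strings

print(cy.iloc[0])  # positional access works regardless of index type
print(cy[16])      # label access also works
# cy[0] raises KeyError: 0 because 0 is treated as a label, not a position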

How to attach a row header to a data frame in python. Row names should be taken from a dictionary

I have a dataframe like this:
1 2 3
0.950389665 0.29695614 0.250323227
0.228821863 0.544082251 0.809445825
0.595764836 0.726256844 0.301979059
0.12775065 0.307534453 0.7791458
0.538780306 0.651055165 0.8450824
0.674011952 0.03239639 0.650357821
0.257926954 0.828308299 0.526425688
I want to attach row names to the above dataframe by reading the values of a dictionary named dictionary.
dict:
[dict_values(['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth', 'Seventh'])]
This is the code I used:
Result_matrix = df.set_index([dictionary.values()])
However, I get the following error:
Traceback (most recent call last):
File "<ipython-input-79-dc4a38b0a2d5>", line 1, in <module>
enrond_matrix = enrond_dataframe.set_index([ reverse_dictionary.values()])
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\frame.py", line 2830, in set_index
level = frame[col]._values
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\frame.py", line 1964, in __getitem__
return self._getitem_column(key)
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\frame.py", line 1971, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\generic.py", line 1645, in _get_item_cache
values = self._data.get(item)
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\internals.py", line 3590, in get
loc = self.items.get_loc(item)
File "C:\Users\30295\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\pandas\core\indexes\base.py", line 2444, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)
File "pandas\_libs\index.pyx", line 156, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5210)
Kindly help me with this.
If possible, I want to have the end result as a numpy array (matrix form).
You need to convert the values to a list first:
Result_matrix = df.set_index([list(d.values())])
print (Result_matrix)
1 2 3
First 0.950390 0.296956 0.250323
Second 0.228822 0.544082 0.809446
Third 0.595765 0.726257 0.301979
Fourth 0.127751 0.307534 0.779146
Fifth 0.538780 0.651055 0.845082
Sixth 0.674012 0.032396 0.650358
Seventh 0.257927 0.828308 0.526426
Or assign output:
df.index = d.values()
print (df)
1 2 3
First 0.950390 0.296956 0.250323
Second 0.228822 0.544082 0.809446
Third 0.595765 0.726257 0.301979
Fourth 0.127751 0.307534 0.779146
Fifth 0.538780 0.651055 0.845082
Sixth 0.674012 0.032396 0.650358
Seventh 0.257927 0.828308 0.526426
Last, for a numpy array add .values, but the information about row names is lost:
print (Result_matrix.values)
[[ 0.95038966 0.29695614 0.25032323]
[ 0.22882186 0.54408225 0.80944582]
[ 0.59576484 0.72625684 0.30197906]
[ 0.12775065 0.30753445 0.7791458 ]
[ 0.53878031 0.65105517 0.8450824 ]
[ 0.67401195 0.03239639 0.65035782]
[ 0.25792695 0.8283083 0.52642569]]
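If you would rather not lose the row names when going to numpy, one possible alternative (not part of the original answer) is to_records(), which returns a structured array that includes the index:
rec = Result_matrix.to_records()  # the index becomes the first field of the record array
print(rec.dtype.names)
print(rec)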

Not able to access a column in pandas data frame

I have data frame df.
df.columns gives this output
Index([u'Talk Time\t', u'Hold Time\t', u'Work Time\t', u'Call Type'], dtype='object')
Here, column 'Talk Time' has "\t" character with it, so if I do the following, I get an error
df['Talk Time']
Traceback (most recent call last):
File "<ipython-input-78-f2b7b9f43f59>", line 1, in <module>
old['Talk Time']
File "C:\Users\Admin\Anaconda\lib\site-packages\pandas\core\frame.py", line 1780, in __getitem__
return self._getitem_column(key)
File "C:\Users\Admin\Anaconda\lib\site-packages\pandas\core\frame.py", line 1787, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\Admin\Anaconda\lib\site-packages\pandas\core\generic.py", line 1068, in _get_item_cache
values = self._data.get(item)
File "C:\Users\Admin\Anaconda\lib\site-packages\pandas\core\internals.py", line 2849, in get
loc = self.items.get_loc(item)
File "C:\Users\Admin\Anaconda\lib\site-packages\pandas\core\index.py", line 1402, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "pandas\index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas\index.c:3820)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:3700)
File "pandas\hashtable.pyx", line 696, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12323)
File "pandas\hashtable.pyx", line 704, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12274)
KeyError: 'Talk Time'
So I modify columns to remove tab characters as follows:
for n in range(len(df.columns)):
    df.columns.values[n] = df.columns.values[n].rstrip()
Tab characters get removed, df.columns give the following output
Index([u'Talk Time', u'Hold Time', u'Work Time', u'Call Type'], dtype='object')
But, still when I am trying to access a column as
df['Talk Time']
, I am seeing the same error. Why is it happening?
The main issue is that what you changed is the values array of the columns, and that is all you actually managed to change. That array is just an alias, so the actual name pandas uses for lookups stayed as it was before. df['Talk Time\t'] would still have worked if you had tried it, but obviously that isn't the result you wanted.
So the solution is that you have to assign to df.columns instead of df.columns.values:
df.columns = [c.rstrip() for c in df.columns]
This works for your needs.
I can't reproduce your second error, however, you could do:
df.columns = [i.rstrip() for i in df.columns]
Maybe this will help!
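As a side note (my own addition, not from either answer), the column Index also exposes vectorized string methods, so the cleanup can be a one-liner:
df.columns = df.columns.str.strip()  # strips tabs/whitespace from every column name at once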
