I want to replace the empty values in the dataframe using random already existing values, while maintaining the weights so that the correlation does not suffer and the data is not lost.
def nan_fill_random(column_name, nan):
for i in range(len(column_name)):
if column_name[i] == nan:
column_name[i] = random.choice(column_name[column_name != nan])
else:
continue
I wrote a function, but it periodically throws a KeyError: and the value has different numbers, I assume indexes. Also, when you restart the cell, it can either disappear or be updated.
nan_fill_random(data['education'], 'unknown')
Here is the error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 14563
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_4720/2723938638.py in <module>
----> 1 nan_fill_random(data['education'], 'unknown')
~\AppData\Local\Temp/ipykernel_4720/1980306790.py in nan_fill_random(column_name, nan)
2 for i in range(len(column_name)):
3 if column_name[i] == nan:
----> 4 column_name[i] = random.choice(column_name[column_name != nan])
5 else:
6 continue
W:\ProgramData\Anaconda3\lib\random.py in choice(self, seq)
344 """Choose a random element from a non-empty sequence."""
345 # raises IndexError if seq is empty
--> 346 return seq[self._randbelow(len(seq))]
347
348 def shuffle(self, x, random=None):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
940
941 elif key_is_scalar:
--> 942 return self._get_value(key)
943
944 if is_hashable(key):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_value(self, label, takeable)
1049
1050 # Similar to Index.get_value, but we do not fall back to positional
-> 1051 loc = self.index.get_loc(label)
1052 return self.index._get_values_for_loc(self, loc, label)
1053
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 14563
def nan_fill_random(column_name, nan):
list_values = set(column_name)
try :
list_values.remove(nan)
except :
return(column_name)
column_name = column_name.apply(lambda x: x if x != nan else random.choice(list(list_values)))
return(column_name)
Related
I think my issue is that not every file i am analysing contains every item in my sleep_stages list but I'm not sure how to fix it. For instance some files will not contain any mention of sleep stage N1 or another item from list. I would like to be able to input na just for that value but capture other values from items on list.
see code:
def get_sleep_times(hypno):
sleep_stages=['Sleep stage N1','Sleep stage R','Sleep stage N2', 'Sleep
stage N3', 'Sleep stage ?']
sleep_times = {}
totsleep_time = 0
tmp = hypno.groupby('description')['duration'].sum()
for stage in sleep_stages:
sleep_times['Duration of ' +stage]=tmp.loc[stage]
totsleep_time += tmp.loc[stage]
sleep_times['Total Sleep Duration'] = totsleep_time
return sleep_times
see error message:
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Sleep stage ?'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_21360/487801242.py in <module>
----> 1 hypno_report(dfpmnospcl)
~\AppData\Local\Temp/ipykernel_21360/1936770717.py in hypno_report(df)
16 print(fnames, matches)
17 hypno = pd.read_csv(matches[0], delimiter='\t',encoding='utf-8')
---> 18 result.update(get_sleep_times(hypno))
19 result.update(get_hypno_counts(hypno, events))
20 results.append(result)
~\AppData\Local\Temp/ipykernel_21360/2297364264.py in get_sleep_times(hypno)
12
13 else:
---> 14 sleep_times['Duration of ' +stage]=tmp.loc[stage]
15 totsleep_time += tmp.loc[stage]
16
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
929
930 maybe_callable = com.apply_if_callable(key, self.obj)
--> 931 return self._getitem_axis(maybe_callable, axis=axis)
932
933 def _is_scalar_access(self, key: tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1162 # fall thru to straight lookup
1163 self._validate_key(key, axis)
-> 1164 return self._get_label(key, axis=axis)
1165
1166 def _get_slice_axis(self, slice_obj: slice, axis: int):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
1111 def _get_label(self, label, axis: int):
1112 # GH#5667 this will fail if the label is not present in the axis.
-> 1113 return self.obj.xs(label, axis=axis)
1114
1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
3775 else:
-> 3776 loc = index.get_loc(key)
3777
3778 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 'Sleep stage ?'
This might not work properly since I don't know how hypno looks like. Would be helpful if you could describe its structure next time.
def get_sleep_times(hypno):
sleep_stages=['Sleep stage N1', 'Sleep stage R', 'Sleep stage N2', 'Sleep stage N3', 'Sleep stage ?']
tmp = hypno.groupby('description')['duration'].sum()
sleep_times = {'Duration of ' + stage: tmp.loc[stage] if stage in tmp.columns else None for stage in sleep_stages}
sleep_times['Total sleep duration: '] = sum(sleep_times.values())
return sleep_times
The sleep_times dictionary will contain the duration for each stage (if the stage name is present within the column names of tmp) or None otherwise.
The values from the dictionary will then be summed up and added as a new entry (total sleep duration).
I have a sql query to pull in contact records. I am looping through each record and assigning the values to variables in python. It's working for all the values variible pairs except HMid. I can't for the life of me figure out why. When I print Updatedata HMid shows up fine as a number like this '106594451'.
Any guidance is much appreciated.
updateSQL = """ Select c.mobilephone
,l.HMid
,[firstname]
,[lastname]
,fullname
,[emailaddress1]
from contact c where and \ksl_communityid = '%s'
""" % (CommunityID)
Updatedata = pd.read_sql(updateSQL, cnxnCS1)
print(Updatedata)
for i in range(len(Updatedata)):
mobilephone = data.loc[i,'mobilephone']
firstname = data.loc[i,'firstname']
lastname = data.loc[i,'lastname']
fullname = data.loc[i,'fullname']
emailaddress1 = data.loc[i,'emailaddress1']
HMid= data.loc[i,'HMid']
Here is the error I am getting:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)get_loc(self, key, method, tolerance) 2645 try:
-> 2646 return self._engine.get_loc(key) 2647 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'HMid'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last) <ipython-input-143-13351626f5cd> in <module>
53 fullname = data.loc[i,'fullname']
54 emailaddress1 = data.loc[i,'emailaddress1']
---> 55 HMid = data.loc[i,'HMid']
56
57 print(HMid)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
__getitem__(self, key) 1759 except (KeyError, IndexError, AttributeError): 1760 pass
-> 1761 return self._getitem_tuple(key) 1762 else: 1763 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_tuple(self, tup) 1269 def _getitem_tuple(self, tup: Tuple): 1270 try:
-> 1271 return self._getitem_lowerdim(tup) 1272 except IndexingError: 1273 pass
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_lowerdim(self, tup) 1418 return section 1419 # This is an elided recursive call to iloc/loc/etc'
-> 1420 return getattr(section, self.name)[new_key] 1421 1422 raise IndexingError("not applicable")
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
__getitem__(self, key) 1765 1766 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767 return self._getitem_axis(maybe_callable, axis=axis) 1768 1769 def _is_scalar_access(self, key: Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_axis(self, key, axis) 1962 # fall thru to straight lookup 1963 self._validate_key(key, axis)
-> 1964 return self._get_label(key, axis=axis) 1965 1966
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_get_label(self, label, axis)
618 # but will fail when the index is not present
619 # see GH5667
--> 620 return self.obj._xs(label, axis=axis)
621 elif isinstance(label, tuple) and isinstance(label[axis], slice):
622 raise IndexingError("no slices here, handle elsewhere")
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level) 3535 loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) 3536 else:
-> 3537 loc = self.index.get_loc(key) 3538 3539 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2646 return self._engine.get_loc(key) 2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance) 2650 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'HMid'
I am trying to run an enrichment analysis with gseapy enrichr on a list of gene names that look like the following:
0 RAB4B
1 TIGAR
2 RNF44
3 DNAH3
4 RPL23A
5 ARL8B
6 CALB2
7 MFSD3
8 PIGV
9 ZNF708
Name: 0, dtype: object
I am using the following code:
# run enrichr
# if you are only intrested in dataframe that enrichr returned, please set no_plot=True
# list, dataframe, series inputs are supported
enr = gseapy.enrichr(gene_list = glist2,
gene_sets=['ARCHS4_Cell-lines', 'KEGG_2016','KEGG_2013', 'GO_Cellular_Component_2018', 'GO_Cellular_Component_AutoRIF', 'GO_Cellular_Component_AutoRIF_Predicted_zscore', 'GO_Molecular_Function_2018', 'GO_Molecular_Function_AutoRIF', 'GO_Molecular_Function_AutoRIF_Predicted_zscore'],
organism='Human', # don't forget to set organism to the one you desired! e.g. Yeast
description='test_name',
outdir='test/enrichr_kegg',
# no_plot=True,
cutoff=1 # test dataset, use lower value from range(0,1)
)
However, I am receiving the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Adjusted P-value'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-78-dad3e0840d86> in <module>
9 outdir='test/enrichr_kegg',
10 # no_plot=True,
---> 11 cutoff=1 # test dataset, use lower value from range(0,1)
12 )
~/venv/lib/python3.7/site-packages/gseapy/enrichr.py in enrichr(gene_list, gene_sets, organism, description, outdir, background, cutoff, format, figsize, top_term, no_plot, verbose)
500 # set organism
501 enr.set_organism()
--> 502 enr.run()
503
504 return enr
~/venv/lib/python3.7/site-packages/gseapy/enrichr.py in run(self)
418 top_term=self.__top_term, color='salmon',
419 title=self._gs,
--> 420 ofname=outfile.replace("txt", self.format))
421 if msg is not None : self._logger.warning(msg)
422 self._logger.info('Done.\n')
~/venv/lib/python3.7/site-packages/gseapy/plot.py in barplot(df, column, title, cutoff, top_term, figsize, color, ofname, **kwargs)
498 if colname in ['Adjusted P-value', 'P-value']:
499 # check if any values in `df[colname]` can't be coerced to floats
--> 500 can_be_coerced = df[colname].map(isfloat)
501 if np.sum(~can_be_coerced) > 0:
502 raise ValueError('some value in %s could not be typecast to `float`'%colname)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
3084 if tolerance is not None:
KeyError: 'Adjusted P-value'
It seems that everything is running fine before calculating the adjusted p values. Also, when I insert my gene names into sites like Biomart, I get returns on the values that I input, but I don't know where I'm going wrong with the Adjusted P - Values in my code. Can anyone point me in the right direction? Thanks
How many genes do you have in your gene list? I had same issue. My gene list has about 22000 genes. I only picked top 5000 genes. Then the problem solved. Of course you can change it as you wish.
Here is my code:
import gseapy
enr_res = gseapy.enrichr(gene_list=glist[:5000],
organism='human',
gene_sets=['GO_Biological_Process_2018','KEGG_2019_Human','WikiPathways_2019_Human','GO_Biological_Process_2017b'],
description='pathway',
cutoff = 0.5)
I have two columns - text and title for news articles.
Data looks fine, apologize for a printscreen, just to show the structure.
But it gives me a weird error when I try to calculate the polarity.
# Create
polarity = []
# Creare for loop for Text column only
for i in range(len(jordan_df['text'])):
polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
# Put data together
polarity_data = {'article_text':jordan_df['text'], 'article_polarity': polarity}
The weird thing that this code works, when I change jordan_df to some_df with the same structure.
Error:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method,
tolerance)
2897 try:
-> 2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
**KeyError: 0**
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
3 frames
<ipython-input-186-edab50678cab> in <module>()
9 # Creare for loop for Text column only
10 for i in range(len(jordan_df['text'])):
---> 11 polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
12
13 # Put data together
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in __getitem__(self, key)
880
881 elif key_is_scalar:
--> 882 return self._get_value(key)
883
884 if is_hashable(key):
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in _get_value(self, label, takeable)
988
989 # Similar to Index.get_value, but we do not fall back to positional
--> 990 loc = self.index.get_loc(label)
991 return self.index._get_values_for_loc(self, loc, label)
992
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method,
tolerance)
2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
-> 2900 raise KeyError(key) from err
2901
2902 if tolerance is not None:
Add this line in your code:
polarity = []
jordan_df.reset_index(drop=True,inplace = True) #add this line
# Creare for loop for Text column only
for i in range(len(jordan_df['text'])):
polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
# Put data together
polarity_data = {'article_text':jordan_df['text'], 'article_polarity': polarity}
You have probably filtered out result, which have changed the index in your jordan_df. You can see in head() of your jordan_df that the index starts with 7.
And that's why you get KeyError on Key 0
i.e. when i=0 in jordan_df['text'][i]
I have a series of SKUs in a DataFrame: [35641, 265689494123, 36492, 56526246546, 26412...].
The problem is that the long barcodes (like 56526246546) in the DataFrame need to be truncated at certain points. The length over 5 should trigger the deletion process, which truncates like [7:12] in a list.
I tried using the following code without any prevail:
if df.loc[len(df['SKU']) > 5]:
df.loc[df['SKU'].df.slice(start=7,stop=12)]
I get following error messages:
KeyError Traceback (most recent call last)
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: True
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-64-cea7b4ca2640> in <module>
1 #g[:] = (elem[:12] for elem in g)
----> 2 if df.loc[len(df['SKU']) > 5]:
3 df.loc[df['SKU'].df.slice(start=7,stop=12)]
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1498
1499 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1500 return self._getitem_axis(maybe_callable, axis=axis)
1501
1502 def _is_scalar_access(self, key):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3583 drop_level=drop_level)
3584 else:
-> 3585 loc = self.index.get_loc(key)
3586
3587 if isinstance(loc, np.ndarray):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2657 return self._engine.get_loc(key)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2661 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: True
How do I fix this code?
P.S Some of the error messages seem to be popping up due to the fact that I've added the code BEFORE converting the dict into a DataFrame.
According to the output you want, I think you can use:
df['SKU'] = df['SKU'].apply(lambda x: int(str(x)[6:11]) if len(str(x)) > 5 else x)
Output:
SKU
0 35641
1 49412
2 36492
3 46546
4 26412
Here is my suggestion:
df.loc[:, 'SKU'] = df.loc[:, 'SKU'].astype(str).apply(lambda x: x[7:12] if len(x) > 5 else x)