How to shorten certain substrings in a DataFrame - python

I have a series of SKUs in a DataFrame: [35641, 265689494123, 36492, 56526246546, 26412...].
The problem is that the long barcodes (like 56526246546) in the DataFrame need to be truncated. Any SKU longer than 5 digits should be shortened by keeping only a slice of it, the way [7:12] slices a list.
I tried the following code, but to no avail:
if df.loc[len(df['SKU']) > 5]:
df.loc[df['SKU'].df.slice(start=7,stop=12)]
I get the following error messages:
KeyError Traceback (most recent call last)
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: True
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-64-cea7b4ca2640> in <module>
1 #g[:] = (elem[:12] for elem in g)
----> 2 if df.loc[len(df['SKU']) > 5]:
3 df.loc[df['SKU'].df.slice(start=7,stop=12)]
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1498
1499 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1500 return self._getitem_axis(maybe_callable, axis=axis)
1501
1502 def _is_scalar_access(self, key):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3583 drop_level=drop_level)
3584 else:
-> 3585 loc = self.index.get_loc(key)
3586
3587 if isinstance(loc, np.ndarray):
c:\users\User\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2657 return self._engine.get_loc(key)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2661 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: True
How do I fix this code?
P.S. Some of the error messages seem to appear because I added the code BEFORE converting the dict into a DataFrame.

According to the output you want, I think you can use:
df['SKU'] = df['SKU'].apply(lambda x: int(str(x)[6:11]) if len(str(x)) > 5 else x)
Output:
SKU
0 35641
1 49412
2 36492
3 46546
4 26412

Here is my suggestion:
df.loc[:, 'SKU'] = df.loc[:, 'SKU'].astype(str).apply(lambda x: x[7:12] if len(x) > 5 else x)

Related

Periodic KeyError in Pandas

I want to replace the empty values in the dataframe with randomly chosen existing values, while maintaining their weights so that the correlation does not suffer and no data is lost.
def nan_fill_random(column_name, nan):
for i in range(len(column_name)):
if column_name[i] == nan:
column_name[i] = random.choice(column_name[column_name != nan])
else:
continue
I wrote a function, but it periodically throws a KeyError, and the reported key is a different number each time; I assume these numbers are index values. Also, when I re-run the cell, the error can either disappear or show a different number.
nan_fill_random(data['education'], 'unknown')
Here is the error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
W:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 14563
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_4720/2723938638.py in <module>
----> 1 nan_fill_random(data['education'], 'unknown')
~\AppData\Local\Temp/ipykernel_4720/1980306790.py in nan_fill_random(column_name, nan)
2 for i in range(len(column_name)):
3 if column_name[i] == nan:
----> 4 column_name[i] = random.choice(column_name[column_name != nan])
5 else:
6 continue
W:\ProgramData\Anaconda3\lib\random.py in choice(self, seq)
344 """Choose a random element from a non-empty sequence."""
345 # raises IndexError if seq is empty
--> 346 return seq[self._randbelow(len(seq))]
347
348 def shuffle(self, x, random=None):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
940
941 elif key_is_scalar:
--> 942 return self._get_value(key)
943
944 if is_hashable(key):
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_value(self, label, takeable)
1049
1050 # Similar to Index.get_value, but we do not fall back to positional
-> 1051 loc = self.index.get_loc(label)
1052 return self.index._get_values_for_loc(self, loc, label)
1053
W:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 14563
def nan_fill_random(column_name, nan):
list_values = set(column_name)
try :
list_values.remove(nan)
except :
return(column_name)
column_name = column_name.apply(lambda x: x if x != nan else random.choice(list(list_values)))
return(column_name)

Python pd.read_sql dataframe KeyError

I have a SQL query to pull in contact records. I am looping through each record and assigning the values to variables in Python. It works for all the value/variable pairs except HMid, and I can't for the life of me figure out why. When I print Updatedata, HMid shows up fine as a number like this: '106594451'.
Any guidance is much appreciated.
updateSQL = """ Select c.mobilephone
,l.HMid
,[firstname]
,[lastname]
,fullname
,[emailaddress1]
from contact c where and \ksl_communityid = '%s'
""" % (CommunityID)
Updatedata = pd.read_sql(updateSQL, cnxnCS1)
print(Updatedata)
for i in range(len(Updatedata)):
mobilephone = data.loc[i,'mobilephone']
firstname = data.loc[i,'firstname']
lastname = data.loc[i,'lastname']
fullname = data.loc[i,'fullname']
emailaddress1 = data.loc[i,'emailaddress1']
HMid= data.loc[i,'HMid']
Here is the error I am getting:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)get_loc(self, key, method, tolerance) 2645 try:
-> 2646 return self._engine.get_loc(key) 2647 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'HMid'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last) <ipython-input-143-13351626f5cd> in <module>
53 fullname = data.loc[i,'fullname']
54 emailaddress1 = data.loc[i,'emailaddress1']
---> 55 HMid = data.loc[i,'HMid']
56
57 print(HMid)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
__getitem__(self, key) 1759 except (KeyError, IndexError, AttributeError): 1760 pass
-> 1761 return self._getitem_tuple(key) 1762 else: 1763 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_tuple(self, tup) 1269 def _getitem_tuple(self, tup: Tuple): 1270 try:
-> 1271 return self._getitem_lowerdim(tup) 1272 except IndexingError: 1273 pass
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_lowerdim(self, tup) 1418 return section 1419 # This is an elided recursive call to iloc/loc/etc'
-> 1420 return getattr(section, self.name)[new_key] 1421 1422 raise IndexingError("not applicable")
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
__getitem__(self, key) 1765 1766 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767 return self._getitem_axis(maybe_callable, axis=axis) 1768 1769 def _is_scalar_access(self, key: Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_getitem_axis(self, key, axis) 1962 # fall thru to straight lookup 1963 self._validate_key(key, axis)
-> 1964 return self._get_label(key, axis=axis) 1965 1966
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in
_get_label(self, label, axis)
618 # but will fail when the index is not present
619 # see GH5667
--> 620 return self.obj._xs(label, axis=axis)
621 elif isinstance(label, tuple) and isinstance(label[axis], slice):
622 raise IndexingError("no slices here, handle elsewhere")
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level) 3535 loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) 3536 else:
-> 3537 loc = self.index.get_loc(key) 3538 3539 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 2646 return self._engine.get_loc(key) 2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key)) 2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance) 2650 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'HMid'

Receiving Key Error = 0 while calculating the polarity in Python

I have two columns - text and title for news articles.
The data looks fine (apologies for the screenshot; it is only there to show the structure).
But it gives me a weird error when I try to calculate the polarity.
# Create
polarity = []
# Creare for loop for Text column only
for i in range(len(jordan_df['text'])):
polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
# Put data together
polarity_data = {'article_text':jordan_df['text'], 'article_polarity': polarity}
The weird thing is that this code works when I change jordan_df to some_df, which has the same structure.
Error:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method,
tolerance)
2897 try:
-> 2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
**KeyError: 0**
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
3 frames
<ipython-input-186-edab50678cab> in <module>()
9 # Creare for loop for Text column only
10 for i in range(len(jordan_df['text'])):
---> 11 polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
12
13 # Put data together
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in __getitem__(self, key)
880
881 elif key_is_scalar:
--> 882 return self._get_value(key)
883
884 if is_hashable(key):
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in _get_value(self, label, takeable)
988
989 # Similar to Index.get_value, but we do not fall back to positional
--> 990 loc = self.index.get_loc(label)
991 return self.index._get_values_for_loc(self, loc, label)
992
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method,
tolerance)
2898 return self._engine.get_loc(casted_key)
2899 except KeyError as err:
-> 2900 raise KeyError(key) from err
2901
2902 if tolerance is not None:
Add this line in your code:
polarity = []
jordan_df.reset_index(drop=True,inplace = True) #add this line
# Creare for loop for Text column only
for i in range(len(jordan_df['text'])):
polarity.append(TextBlob(jordan_df['text'][i]).sentiment.polarity)
# Put data together
polarity_data = {'article_text':jordan_df['text'], 'article_polarity': polarity}
You have probably filtered the DataFrame, which changed the index of your jordan_df. You can see in the head() of your jordan_df that the index starts at 7.
That's why you get a KeyError on key 0,
i.e. when i=0 in jordan_df['text'][i]

data.get_data_yahoo throws error on some ticker symbols

Running the pandas_datareader code below throws an error on some stock symbols.
Running the following code:
import pandas as pd
import pandas_datareader as dr
%matplotlib inline
df = dr.data.get_data_yahoo('FRE.DE',start='2018-10-1', end='2018-11-30')
throws the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Date'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-d55e44b68e87> in <module>
----> 1 df = dr.data.get_data_yahoo('FRE.DE',start='2018-10-1', end='2018-11-30')
2 df
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas_datareader/data.py in get_data_yahoo(*args, **kwargs)
68
69 def get_data_yahoo(*args, **kwargs):
---> 70 return YahooDailyReader(*args, **kwargs).read()
71
72
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas_datareader/base.py in read(self)
208 if isinstance(self.symbols, (compat.string_types, int)):
209 df = self._read_one_data(self.url,
--> 210 params=self._get_params(self.symbols))
211 # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
212 elif isinstance(self.symbols, DataFrame):
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas_datareader/yahoo/daily.py in _read_one_data(self, url, params)
140 prices.columns = [col.capitalize() for col in prices.columns]
141 prices['Date'] = to_datetime(
--> 142 to_datetime(prices['Date'], unit='s').dt.date)
143
144 if 'Data' in prices.columns:
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]
~/miniconda2/envs/py37/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2657 return self._engine.get_loc(key)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2661 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Date'
When I replace 'FRE.DE' with 'FRM.DE', which is a different ticker symbol, it works perfectly.
Thinking that maybe the data for the symbol does not exist, I went to the Yahoo Finance page:
https://de.finance.yahoo.com/quote/FRE.DE/history?p=FRE.DE
and the historical data is displayed there.
For me your code actually works with FRE.DE, but doesn't work with FRM.DE.
which is consistent with the fact that
this page:
https://de.finance.yahoo.com/quote/FRE.DE
is found
and this page:
https://de.finance.yahoo.com/quote/FRM.DE
is not found

Add new column in a data frame which contains number of rows [duplicate]

This question already has answers here:
Pandas create new column with count from groupby
(5 answers)
Closed 3 years ago.
I have a dataframe called : oppty_oppline_sacc with this format :
opp_id$ sacc_id$
001 AAB
002 AAB
003 BBC
.. ..
I would like to add a new column named nb_opportunity which contains the number of opp_id$ values for each sacc_id$. So I did this:
oppty_oppline_sacc['nb_oppline'] = oppty_oppline_sacc.groupby(['sacc_id$'],as_index=False)['opp_id$'].count()
But I get this error:
KeyError Traceback (most recent call last)
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'nb_oppline'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
3967 try:
-> 3968 loc = self.items.get_loc(item)
3969 except KeyError:
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'nb_oppline'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-96-7b260546d446> in <module>()
----> 1 oppty_oppline_sacc['nb_oppline'] = oppty_oppline_sacc.groupby(['sacc_id$'],as_index=False)['opp_line_id$'].count()
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2517 else:
2518 # set column
-> 2519 self._set_item(key, value)
2520
2521 def _setitem_slice(self, key, value):
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2584 self._ensure_valid_index(value)
2585 value = self._sanitize_column(key, value)
-> 2586 NDFrame._set_item(self, key, value)
2587
2588 # check if we are modifying a copy
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/generic.py in _set_item(self, key, value)
1952
1953 def _set_item(self, key, value):
-> 1954 self._data.set(key, value)
1955 self._clear_item_cache()
1956
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in set(self, item, value, check)
3969 except KeyError:
3970 # This item wasn't present, just insert at end
-> 3971 self.insert(len(self.items), item, value)
3972 return
3973
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
4070
4071 block = make_block(values=value, ndim=self.ndim,
-> 4072 placement=slice(loc, loc + 1))
4073
4074 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2955 placement=placement, dtype=dtype)
2956
-> 2957 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2958
2959 # TODO: flexible with index=None and/or items=None
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
2080
2081 super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 2082 placement=placement, **kwargs)
2083
2084 @property
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
118 raise ValueError('Wrong number of items passed %d, placement '
119 'implies %d' % (len(self.values),
--> 120 len(self.mgr_locs)))
121
122 @property
ValueError: Wrong number of items passed 2, placement implies 1
Can you please help me resolve this problem?
Thank you.
What @jezrael said. The transform method keeps the original index, allowing you to make a new column on the dataframe.
df['nb_oppline'] = df.groupby('sacc_id$')['opp_id$'].transform('count')
Output
opp_id$ sacc_id$ nb_oppline
0 1 AAB 2
1 2 AAB 2
2 3 BBC 1

Categories