preprocessing class error, "AttributeError: 'function' object has no attribute 'str'" - python

I did an NLP project earlier; now I have pickled the model and am trying to apply it to a new data set, which I scraped from Twitter. Of course the new dataframe doesn't have the same columns as the old dataset, so I am writing a class to preprocess the data and bring it closer to the old dataframe that was used for the NLP project. This is what I did:
# NOTE(review): the enclosing class statement (presumably "class Preprocesser:", the name
# instantiated at the end of this snippet) and all indentation are missing from this paste.
# Constructor does nothing: the object keeps no state.
def __init__(self):
pass
# Build a DataFrame from the text column and attach one derived feature per helper below.
def fit(self, text_column):
df = pd.DataFrame(text_column)
# NOTE(review): these are attribute assignments (df.name = ...) rather than df["name"] = ...;
# confirm that they create DataFrame columns as intended.
df.text_length = self.text_length(text_column)
df.num_capital_letters = self.num_capital_letters(text_column)
df.percentage_of_capital_letters = self.percentage_of_capital_letters(text_column)
df.greater_than_50_percent = self.greater_than_50_percent(text_column)
df.reading_level = self.reading_level(text_column)
#df =pd.DataFrame(Text.df_user_tweets
return df
# len() of each entry in the column.
def text_length(self,column):
return column.apply(lambda x: len(x))
# Intended: number of [A-Z] matches per entry.
# `column.apply` is referenced without being called, so `.str` is looked up on the method
# object itself; this is the line the traceback below points at.
def num_capital_letters(self,column):
return column.apply.str.findall(r"[A-Z]").str.len()
# Intended: [A-Z] match count divided by entry length; uses the same `column.apply.str` chain.
def percentage_of_capital_letters(self,column):
return column.apply.str.findall(r"[A-Z]").str.len()/column.apply(lambda x: len(x))
# Compares each entry of the column it receives against 0.5.
# NOTE(review): fit() passes the raw text column here, not the percentage - confirm intent.
def greater_than_50_percent(self,column):
return column.apply(lambda x: x>= .5 )
# Applies textstat.flesch_reading_ease to each entry.
def reading_level(self,column):
return column.apply(lambda x :textstat.flesch_reading_ease(x))
# Usage: run the preprocessing on the Text column of the scraped tweets.
pre = Preprocesser()
pre.fit(text_column = df_user_tweets.Text)
This is the error that I got
<ipython-input-136-3b74ba5d2425> in num_capital_letters(self, column)
17 return column.apply(lambda x: len(x))
18 def num_capital_letters(self,column):
---> 19 return column.apply.str.findall(r"[A-Z]").len()
20 def percentage_of_capital_letters(self,column):
21 return column.apply.str.findall(r"[A-Z]").str.len()/column.apply(lambda x: len(x))
AttributeError: 'function' object has no attribute 'str'
It looks like my error is on line 19, but I'm not sure what I need to do to fix it. I'd appreciate any help.

df_user_tweets.Text is of type pd.Series, and it has a method apply. This method takes a function (for example a lambda) and runs it on each value of that Series (which is a column). The method itself does not have a str attribute; the .str accessor belongs to the Series.
So instead of column.apply.str.findall do column.str.findall.
You can find the pandas documentation here: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html?highlight=str#pandas.Series.str

Related

How do I solve the "AttributeError: 'Series' object has no attribute '_check_fillna'" error when using the technical-analysis-library-in-python

I am using the following library to fill a Dataframe with modified data (indicator of financial data) https://technical-analysis-library-in-python.readthedocs.io/en/latest/
However, the library has a number of classes that seem to miss certain attributes; or lack the inheritance from another class.
I have created a pandas.Series filled with ones to demonstrate. I call the method aroon_up() from class AroonIndicator with the aforementioned series as input, but I get a 'Series' object has no attribute '_check_fillna'" error. I see that there is no attribute _check_fillna in the AroonIndicator class, but there is in the IndicatorMixin. I have tried to run the Series through the IndicatorMixin class, but it states that this class takes no arguments.
Can someone explain to me what I am doing wrong?
Library
class IndicatorMixin:
    """Util mixin indicator class.

    Provides the gap-filling helper used by indicator classes and a
    true-range helper. Subclasses override ``_fillna`` to enable filling.
    """

    # Class-level default: indicators do not fill gaps unless asked to.
    _fillna = False

    def _check_fillna(self, series: pd.Series, value: int = 0) -> pd.Series:
        """Fill gaps in *series* when the ``_fillna`` flag is True.

        Args:
            series(pandas.Series): calculated indicator series.
            value(int): value to fill gaps; if -1 fill values using 'backfill' mode.

        Returns:
            pandas.Series: the input object itself when the flag is off;
            otherwise a new series in which +/-inf are treated as NaN, gaps
            are forward-filled, and any remaining leading gaps are either
            back-filled (``value == -1``) or replaced by ``value``.
        """
        if self._fillna:
            series_output = series.copy(deep=False)
            series_output = series_output.replace([np.inf, -np.inf], np.nan)
            # ffill()/bfill() are the supported spellings of the deprecated
            # fillna(method="ffill") / fillna(method="bfill").
            forward_filled = series_output.ffill()
            if isinstance(value, int) and value == -1:
                series = forward_filled.bfill()
            else:
                series = forward_filled.fillna(value)
        return series

    # Must be a staticmethod: the function takes no ``self``, so calling it
    # on an instance would otherwise shift every argument by one.
    @staticmethod
    def _true_range(
        high: pd.Series, low: pd.Series, prev_close: pd.Series
    ) -> pd.Series:
        """Return the row-wise maximum of the three true-range candidates.

        The candidates are ``high - low``, ``|high - prev_close|`` and
        ``|low - prev_close|``.
        """
        tr1 = high - low
        tr2 = (high - prev_close).abs()
        tr3 = (low - prev_close).abs()
        true_range = pd.DataFrame(data={"tr1": tr1, "tr2": tr2, "tr3": tr3}).max(axis=1)
        return true_range
class AroonIndicator(IndicatorMixin):
    """Aroon Indicator

    Identify when trends are likely to change direction.

    Aroon Up = ((N - Days Since N-day High) / N) x 100
    Aroon Down = ((N - Days Since N-day Low) / N) x 100
    Aroon Indicator = Aroon Up - Aroon Down

    https://www.investopedia.com/terms/a/aroon.asp

    Only the Aroon Up channel is computed by the code in this excerpt.

    Args:
        close(pandas.Series): dataset 'Close' column.
        window(int): n period.
        fillna(bool): if True, fill nan values.
    """

    def __init__(self, close: pd.Series, window: int = 25, fillna: bool = False):
        self._close = close
        self._window = window
        self._fillna = fillna
        # _check_fillna is inherited from IndicatorMixin and is applied lazily
        # in aroon_up(); it must not be invoked here without a series.
        self._run()

    def _run(self):
        """Compute the rolling Aroon Up values and cache them on the instance."""
        # With fillna enabled, partial windows are allowed from the first row;
        # otherwise a full window is required before a value is produced.
        min_periods = 0 if self._fillna else self._window
        rolling_close = self._close.rolling(
            self._window, min_periods=min_periods)
        self._aroon_up = rolling_close.apply(
            lambda x: float(np.argmax(x) + 1) / self._window * 100, raw=True
        )

    def aroon_up(self) -> pd.Series:
        """Aroon Up Channel

        Returns:
            pandas.Series: New feature generated, named ``aroon_up_<window>``.
        """
        aroon_up_series = self._check_fillna(self._aroon_up, value=0)
        return pd.Series(aroon_up_series, name=f"aroon_up_{self._window}")
My program
# Create an empty DataFrame
table = pd.DataFrame()
# Create a Series of ones
# NOTE(review): the name `list` shadows the built-in list type.
list = np.ones((100))
sr = pd.Series(list)
# fill the empty Dataframe with the indicator of the Series
'try 1:'
# aroon_up is invoked on the class itself, so `sr` is bound to `self`; aroon_up then
# calls self._check_fillna, which a Series does not have.
table['numbers'] = AroonIndicator.aroon_up(sr)
'try 2:'
# IndicatorMixin defines no __init__, so it cannot be constructed with an argument.
table['numbers'] = AroonIndicator.aroon_up(IndicatorMixin(sr))
# print the table
print(table)
The Aroon functions return their values as a pandas Series; however, you are trying to assign the results to the 'table' variable, which you have initialized as a DataFrame.
Also, when the only parameter a method takes is 'self', you call the method on an instance and do not pass an argument yourself.
Lastly, don't use built-in names like 'list' for variable names, because doing so shadows the built-in.
Try:
import pandas as pd
import numpy as np
# Sample input: one hundred ones standing in for closing prices.
ones = pd.Series(np.ones(100))
# Build the indicator instance first, then ask that instance for its Aroon Up values.
indicator = AroonIndicator(ones)
aroon_up_values = indicator.aroon_up()
print(aroon_up_values)

Python return statement failing to return a list to be written into a pandas DF

For the life of me I cannot figure out why this function is not returning anything. Any insight will be greatly appreciated!
Basically I create a list of string variables that I am preserving in a Pandas DF. I am using the DF to pull the variable to plug into the function via a .apply() method. But my function yields None results in my DF.
# For every entry of comb_tuples: normalize the name, run doublemetaphone on it, collect the
# resulting codes in meta_list, and record person_id under those codes in the externally
# defined __lookup_dict. Prints and returns meta_list.
# NOTE(review): indentation was lost in this paste, so the exact nesting of the if/else
# statements relative to the for loop cannot be confirmed from here.
def add_combinations_to_directory(comb_tuples, person_id):
meta_list = []
for comb in comb_tuples:
concat_name = generate_normalized_name(comb)
metaphone_tuple = doublemetaphone(concat_name)
# The first code is always collected; the second only when it is not the empty string.
meta_list.append(metaphone_tuple[0])
if metaphone_tuple[1] != '':
meta_list.append(metaphone_tuple[1])
# __lookup_dict[0] is keyed by the first code: append to an existing list or start a new one.
if metaphone_tuple[0] in __lookup_dict[0]:
__lookup_dict[0][metaphone_tuple[0]].append(person_id)
else:
__lookup_dict[0][metaphone_tuple[0]] = [person_id]
# __lookup_dict[1] is keyed by the second code, handled the same way.
if metaphone_tuple[1] in __lookup_dict[1]:
__lookup_dict[1][metaphone_tuple[1]].append(person_id)
else:
__lookup_dict[1][metaphone_tuple[1]] = [person_id]
print(meta_list)
return meta_list
# Forwards to add_combinations_to_directory with the arguments swapped into its order.
# There is no return statement, so the list produced by the callee is discarded and
# this function returns None.
def add_person_to_lookup_directory(person_id, name_tuple):
add_combinations_to_directory(name_tuple, person_id)
# Forwards to add_person_to_lookup_directory, again without returning anything.
def create_meta_names(x, id):
add_person_to_lookup_directory(id, x)
# Row-wise apply: each row's 'Meta_names' value is whatever create_meta_names returns.
other['Meta_names'] = other.apply(lambda x: create_meta_names(x['Owners'], x['place_id']), axis=1)
Figured it out! It was a problem of nested functions. The return value from add_combinations_to_directory was being returned to add_person_to_lookup_directory, which did not return it in turn, so it never passed through create_meta_names to the dataframe.

'Series' object has no attribute 'values_counts'

When I try to apply the values_count() method to series within a function, I am told that 'Series' object has no attribute 'values_counts'.
# NOTE(review): apparent intent, judging by the inline comment below - for each listed column,
# replace values that occur exactly once with the value whose mean SalePrice is closest.
# Indentation was lost in this paste.
def replace_1_occ_feat(col_list, df):
for col in col_list:
# `values_counts` is the misspelling that raises the AttributeError; the pandas method
# is named value_counts().
feat_1_occ = df[col].values_counts()[df[col].values_counts() == 1].index
feat_means = df[col].groupby(col)['SalePrice'].mean()
feat_means_no_1_occ = feat_means.iloc[feat_means.difference(feat_1_occ),:]
for feat in feat_1_occ:
# Find the closest mean SalePrice
replacement = (feat_means_no_1_occ - feat_means.iloc[feat,:]).idxmin()
# NOTE(review): `df.col` accesses an attribute literally named "col", not the column
# held in the loop variable - confirm whether df[col] was intended.
df.col.replace(feat, replacement, inplace = True)
However when running df.column.values_count() outside a function it works.
The problem occurs on the first line when the values_counts() methods is used.
I checked the pandas version it's 0.23.0.
The function is value_counts(). Note only count is plural.

AttributeError: 'Series' object has no attribute 'label'

I'm trying to follow a tutorial on sound classification in neural networks, and I've found 3 different versions of the same tutorial, all of which work, but they all reach a snag at this point in the code, where I get the "AttributeError: 'Series' object has no attribute 'label'" issue. I'm not particularly au fait with either NNs or Python, so apologies if this is something trivial like a deprecation error, but I can't seem to figure it out myself.
# Per-row callback: builds the .wav path from row.ID, loads the audio with librosa, and
# returns the time-averaged MFCC vector together with row.Class.
def parser(row):
# function to load files and extract features
file_name = os.path.join(os.path.abspath(data_dir), 'Train/train', str(row.ID) + '.wav')
# handle exception to check if there isn't a file which is corrupted
try:
# here kaiser_fast is a technique used for faster extraction
X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
# we extract mfcc feature from data
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
except Exception as e:
# NOTE(review): `file` is not defined in this function; `file_name` was presumably meant.
print("Error encountered while parsing file: ", file)
return None, None
feature = mfccs
label = row.Class
# Returns a plain two-element list on success (a tuple of Nones on failure).
return [feature, label]
# Run parser once per row of `train`. Per the traceback below, `temp` ends up being a
# Series, so the attribute lookups temp.feature / temp.label fail.
temp = train.apply(parser, axis=1)
temp.columns = ['feature', 'label']
from sklearn.preprocessing import LabelEncoder
# Stack the per-row features and labels into arrays.
X = np.array(temp.feature.tolist())
y = np.array(temp.label.tolist())
# Encode the class labels as integers, then pass them to np_utils.to_categorical.
lb = LabelEncoder()
y = np_utils.to_categorical(lb.fit_transform(y))
As mentioned, I've seen three different tutorials on the same subject, all of which end with the same "temp = train.apply(parser, axis=1) temp.columns = ['feature', 'label']" fragment, so I'm assuming this is assigning correctly, but I don't know where it's going wrong otherwise. Help appreciated!
Edit: Traceback as requested, turns out I'd added the wrong traceback. Also I've since found out that this is a case of converting the series object to a dataframe, so any help with that would be great.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-17-1613f53e2d98> in <module>()
1 from sklearn.preprocessing import LabelEncoder
2
----> 3 X = np.array(temp.feature.tolist())
4 y = np.array(temp.label.tolist())
5
/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
4370 if self._info_axis._can_hold_identifiers_and_holds_name(name):
4371 return self[name]
-> 4372 return object.__getattribute__(self, name)
4373
4374 def __setattr__(self, name, value):
AttributeError: 'Series' object has no attribute 'feature'
Your current implementation of parser(row) method returns a list for each row of data from train DataFrame. But this is then collected as a pandas.Series object.
So your temp is actually a Series object. The following line therefore doesn't have the intended effect:
temp.columns = ['feature', 'label']
Since temp is a Series, it does not have any columns, so temp.feature and temp.label don't exist; hence the error.
Change your parser() method as following:
# Sketch of the corrected parser: the elided lines stand for the unchanged body from the
# question; only the final return statement differs.
def parser(row):
...
...
...
# Return pandas.Series instead of List
return pd.Series([feature, label])
By doing this, the apply method from temp = train.apply(parser, axis=1) will return a DataFrame, so your other code will work.
I cannot say about the tutorials you are following. Maybe they followed an older version of pandas which allowed a list to be automatically converted to DataFrame.

Some operations on DataFrame

I am working on parsing a *.csv file. Therefore I am trying to create a class which helps me simplify some operations on a DataFrame.
I've created two methods in order to parse a column 'z' that contains values for the 'Price' column.
# If any row has z == 1.0, take the Price of the first such row and write it into the
# 'Benchmark' column of every row whose z lies between 0.8 and 2.5. Returns self.df.
# NOTE(review): indentation was lost in this paste, so whether the assignment and the
# return sit inside the `if` cannot be confirmed from here.
def subr(self):
# Boolean mask of rows where z equals 1.0.
isone = self.df.z == 1.0
if isone.any():
atone = self.df.Price[isone].iloc[0]
self.df.loc[self.df.z.between(0.8, 2.5), 'Benchmark'] = atone
# df.loc[(df.r >= .8) & (df.r <= 1.4), 'value'] = atone
return self.df
# Groups self.df by a running counter that increments whenever z is smaller than the
# previous row's z, calls apply on the groups, stores the result in self.dfnew and returns it.
def obtain_z(self):
"Return a column with z for E_ref"
# subr() is called here, so z_col holds its return value (self.df), not a function.
self.z_col = self.subr()
# apply() receives that stored value as its argument.
self.dfnew = self.df.groupby((self.df.z < self.df.z.shift()).cumsum()).apply(self.z_col)
return self.dfnew
# Entry point: constructs ParseDataBase for 'data.csv', calls read_file(), then obtain_z().
# Neither result is used further in the visible code.
def main():
x = ParseDataBase('data.csv')
file_content = x.read_file()
new_df = x.obtain_z()
I'm getting the following error:
'DataFrame' objects are mutable, thus they cannot be hashed
'DataFrame' objects are mutable means that we can change elements of that Frame. I'm not sure when I'm hashing.
I noticed the use of apply(self.z_col) is going wrong.
I also have no clue how to fix it.
You are passing the DataFrame self.df returned by self.subr() to apply, but actually apply only takes functions as parameters (see examples here).

Categories