Apply my function: if...else statement condition doesn't pass - Python

My input (as an example):
dd = pd.DataFrame({'frame': [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
                   'sum_result_ICV': [0,1,1,1,2,2,2,2,1,1,1,1,1,1,1,0],
                   'sum_result_AO':  [0,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0]})
dd['result_ICV'] = 0
dd['result_AO'] = 0
My code and my function:
for z in range(0, len(cv_details)):
    def result_func(row):
        for i in range(0, len(dd)):
            if row == 2:
                return cv_details[z]
            elif row == 1:
                if dd.loc[dd['sum_result_' + cv_details[z]] == 2, 'frame'].empty:
                    return 'ReviewNG-' + cv_details[z]
                elif (dd['frame'][i] - dd.loc[dd.iloc[:, z+1] == 2, 'frame'].iloc[0]) <= 3:
                    return 'NG-' + cv_details[z]
                elif (dd['frame'][i] - dd.loc[dd.iloc[:, z+1] == 2, 'frame'].iloc[-1]) <= 3:
                    return 'NG-' + cv_details[z]
                else:
                    return 'ReviewNG-' + cv_details[z]
            elif row == 0:
                return 'Other'
            else:
                return ""
    dd.iloc[:, z+3] = dd.iloc[:, z+1].apply(result_func)
I expect: (expected output shown in the attached picture)
But my output: (actual output shown in the attached picture)
So as you can see, I need conditions like: if sum_result_ICV equals 0, put "Other"; if sum_result_ICV equals 1 AND the difference between the frame number and the first/last frame number where sum_result_ICV equals 2 is less than or equal to 3, put "NG-ICV"; otherwise put "ReviewNG-ICV". For example, frame 11 has sum_result_ICV equal to 1, and the last frame where sum_result_ICV equals 2 is frame 7; since 11 - 7 > 3, put "ReviewNG-ICV". In my example, frames 1 to 3 must be "NG-ICV", and also frames 8 to 10, but frames 11 to 14 must be "ReviewNG-ICV". Also, please see the picture of what I expect from my function. So what am I doing wrong?
UPDATE based on the answer from @woblob
The new code with the loop:
for z in range(0, len(cv_details)):
    dd.iloc[dd.iloc[:, z+1].to_numpy() == 0, z+3] = 'Other'
    mask2 = dd.iloc[:, z+1] == 2
    mask1 = dd.iloc[:, z+1] == 1
    # .to_numpy() because .iloc does not accept a boolean Series as a row indexer
    dd.iloc[mask2.to_numpy(), z+3] = cv_details[z]
    if dd.loc[mask2, 'frame'].empty:
        dd.iloc[mask1.to_numpy(), z+3] = 'ReviewNG-' + cv_details[z]
    else:
        dd_frame_first = dd.loc[mask2, 'frame'].iloc[0]
        dd_frame_last = dd.loc[mask2, 'frame'].iloc[-1]
        mask_lt_3 = ((dd.frame - dd_frame_first) <= 3) | ((dd.frame - dd_frame_last) <= 3)
        ones_lt_3 = mask1 & mask_lt_3
        ones_not_lt_3 = mask1 & (~mask_lt_3)
        dd.iloc[ones_lt_3.to_numpy(), z+3] = 'NG-' + cv_details[z]
        dd.iloc[ones_not_lt_3.to_numpy(), z+3] = 'ReviewNG-' + cv_details[z]

As I was trying to untangle the logic, I reworked it completely.
dd.loc[dd.result == 0, "sum_result"] = 'Other'
mask2 = dd.result == 2
mask1 = dd.result == 1
dd.loc[mask2, "sum_result"] = 'ICV'
if dd.loc[mask2, 'frame'].empty:
    dd.loc[mask1, "sum_result"] = 'No sum_result==2'
else:
    dd_frame_first = dd.loc[mask2, 'frame'].iloc[0]
    dd_frame_last = dd.loc[mask2, 'frame'].iloc[-1]
    mask_lt_3 = ((dd.frame - dd_frame_first) <= 3) | ((dd.frame - dd_frame_last) <= 3)
    ones_lt_3 = mask1 & mask_lt_3
    ones_not_lt_3 = mask1 & (~mask_lt_3)
    dd.loc[ones_lt_3, "sum_result"] = 'NG-ICV'
    dd.loc[ones_not_lt_3, "sum_result"] = 'ReviewNG-ICV'
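For the question's sample data, this rework produces exactly the classification the asker described; a quick check, using the reworked single-column names ('result' as the input column, 'sum_result' as the output):

import pandas as pd

dd = pd.DataFrame({'frame': range(16),
                   'result': [0,1,1,1,2,2,2,2,1,1,1,1,1,1,1,0]})
# ...run the reworked snippet above, then:
print(dd['sum_result'].tolist())
# frame 0      -> 'Other'
# frames 1-3   -> 'NG-ICV'        (within 3 of the first frame where result == 2)
# frames 4-7   -> 'ICV'
# frames 8-10  -> 'NG-ICV'        (within 3 of the last frame where result == 2)
# frames 11-14 -> 'ReviewNG-ICV'  (more than 3 frames away)
# frame 15     -> 'Other'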

Related

Optimizing pandas operations

I'm cleaning a data set with 6 columns and just under 9k rows. As part of the clean-up I have to find zero/negative, repetitive, interpolated, and outlier values, defined as:
repetitive values - 3 subsequent values are equivalent up to 6 decimal places; flag the first one
interpolated values - take a = row1_val - row2_val, b = row2_val - row3_val, c = row3_val - row4_val, etc. If a == b or b == c, etc., flag it
outlier values - MW above 1.1 × peak or below 0.1 × peak
Right now I am using for loops on the data frame to do the row comparisons, flag the values, put them into a new data frame, and replace them with 999999, but it takes FOREVER. I used the following code to find and replace the zero/negative values, but I can't seem to make it work for the multi-row checks in the for loop. Can anyone show me how this works?
zero/negative values:
df = (df.drop(data_columns, axis=1)
        .join(df[data_columns].apply(pd.to_numeric, errors='coerce')))
Missing_Vals_df = df.loc[df['A KW'].isnull() | df['A KVAR'].isnull()
                         | df['B KW'].isnull() | df['B KVAR'].isnull()
                         | df['C KW'].isnull() | df['C KVAR'].isnull()]
df = df.fillna(999999)
Loops:
for x in range(len(df) - 2):
    for thing in data_columns:
        if df.loc[x][thing] <= 0:
            df = df.replace(to_replace=df.loc[x][thing], value=999999)
        elif (round(df.loc[x][thing], 6) == round(df.loc[x+1][thing], 6) == round(df.loc[x+2][thing], 6)) & (df.loc[x][thing] != 999999):
            if x not in duplicate_loc:
                duplicate_loc.append(x)
                duplicate_df = duplicate_df.append(df.loc[x])
            df = df.replace(to_replace=df.iloc[x][thing], value=999999)
        elif (round(df.loc[x+1][thing] - df.loc[x][thing], 3) == round(df.loc[x+2][thing] - df.loc[x+1][thing], 3)) & (df.loc[x][thing] != 999999):
            if x not in interpolated_loc:
                interpolated_loc.append(x)
                interpolated_df = interpolated_df.append(df.loc[x])
            df = df.replace(to_replace=df.iloc[x][thing], value=999999)
        elif ((df.loc[x][thing] > 1.1 * df_peak.loc[0]['Value']) | (df.loc[x][thing] < 0.1 * df_peak.loc[0]['Value'])) & (df.loc[x][thing] != 999999):
            if x not in outlier_loc:
                outlier_loc.append(x)
                outlier_df = outlier_df.append(df.loc[x])
            df = df.replace(to_replace=df.iloc[x][thing], value=999999)
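Since every check only compares a row with the next one or two rows, the loop can be replaced by shifted comparisons. A minimal vectorized sketch, assuming the df, data_columns, and df_peak objects from the question; all masks are computed first, then the replacement happens once, so overwriting with 999999 doesn't disturb the other checks:

peak = df_peak.loc[0]['Value']
for col in data_columns:
    s = df[col]
    neg = s <= 0                                    # zero/negative values
    r = s.round(6)
    rep = (r == r.shift(-1)) & (r == r.shift(-2))   # this value and the next two agree to 6 decimals
    d = s.diff(-1).round(3)                         # row_i - row_{i+1}
    interp = d == d.shift(-1)                       # consecutive differences are equal to 3 decimals
    out = (s > 1.1 * peak) | (s < 0.1 * peak)       # outside 0.1*peak .. 1.1*peak
    # collect the flagged rows (e.g. duplicate_df = df[rep]) before overwriting
    df.loc[neg | rep | interp | out, col] = 999999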

“For loop” user-defined function

I am trying to create a user-defined function with one input ("Port"), and it should give me a list of 371 floats; however, I am only getting one element in the list. Would really appreciate any help. Thanks.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

def function(port):
    for year in range(1990, 2021, 1):
        for month in range(1, 13, 1):
            if year == 1990 and month == 1:
                pass
            else:
                y = []
                a = df
                a = a.set_index('date')
                a = a[a.index.year == year]
                a = a[a.index.month == month - 1]
                a.loc['Volatility'] = a.std()
                a = a.T
                a['Portfolio'] = pd.qcut(a['Volatility'], q=[0, .2, .4, .6, .8, 1],
                                         labels=['1', '2', '3', '4', '5']).astype(int)
                port = a[a['Portfolio'] == port]
                port.sort_values("Volatility")
                b = port.index
                #############################################################
                c = df
                c = c.set_index('date')
                c = c[c.index.year == year]
                c = c[c.index.month == month]
                c = c[b]
                c = c.T
                c += 1
                c["returns"] = (c.product(axis=1) - 1)
                C_r = c['returns'].sum()
                y.append(C_r)
    return y

p = function(1)
It's because you initialize y as an empty list inside the loop, so it is reset to [] on every iteration. To solve your problem, you just need to initialize the empty list before the loop.
y = []
for year in range(1990, 2021, 1):
    for month in range(1, 13, 1):
        if year == 1990 and month == 1:
            pass
        else:
            .....
            y.append(...)
You can also simplify the conditional statement by just doing
if not (year == 1990 and month == 1):
so you can get rid of the else and pass, since they aren't necessary.
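A minimal, self-contained illustration of the reset bug, independent of the question's data (toy loop, made-up values):

for i in range(3):
    y = []       # re-created on every pass, so earlier appends are lost
    y.append(i)
print(y)         # [2] -> only one element

y = []           # initialize once, before the loop
for i in range(3):
    y.append(i)
print(y)         # [0, 1, 2]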

No axis named 1 for object type Series

I'm using df.iloc[i] to assign (ori + gap) to each row of the dataframe, but I get a 'No axis named 1 for object type Series' error, and I don't understand why.
df1 = pd.read_csv('异常销量监控_0127_to Xiaolei Eagle send.csv', low_memory=False)
df2 = pd.read_csv('test0412.csv', dtype={'Actual': float})
gap = 0
for i in range(len(df2)):
    ym = df2['YM'].iloc[i]
    kcode = df2['REPKCode'].iloc[i]
    fn = df2['FamilyNameE'].iloc[i]
    ori = float(df2['Actual'].iloc[i])
    filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
    gap = df1[filt]['Actual']
    df2['Actual'].iloc[i] = (ori + gap)
df2.to_csv('after.csv', index=False)
The issue is in the following lines:
filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
gap = df1[filt]['Actual']
The value of filt will be either 1 or 0, because you are checking multiple conditions:
(df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
If the above condition is true, filt is assigned 1, otherwise 0. So your condition was true and filt == 1. Now in the following line
gap = df1[filt]['Actual']
you are actually doing this:
gap = df1[1]['Actual']
It's trying to find the column '1' in the df1 dataframe, and because there is no column named '1', that's why it gives you the error.
EDIT
Reply to your comment - How can I get the 'Actual' value where 'YM', 'REPKCode', and 'FamilyNameE' match in df1?
For that you need to write the line below:
gap = df1[df1['YM'] == ym][df1['REPKCode'] == kcode][df1['FamilyNameE'] == fn]['Actual']
and remove the lines below:
filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
gap = df1[filt]['Actual']
I think the problem here is
df2["YM"].iloc[i]
because when you type df2["YM"], it returns the YM column from the dataframe df2. So you are trying to get a column from the column by typing df2["YM"].iloc[i].
Try df2.iloc[i] instead.
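Since the loop is really just a keyed lookup-and-add, a merge on the three key columns would avoid the row-wise assignment altogether. A hedged sketch, not from the answers above; it assumes the three keys identify a single row in df1, and the '_gap' suffix is a made-up name:

import pandas as pd

merged = df2.merge(
    df1[['YM', 'REPKCode', 'FamilyNameE', 'Actual']],
    on=['YM', 'REPKCode', 'FamilyNameE'],
    how='left',
    suffixes=('', '_gap'),          # df1's 'Actual' becomes 'Actual_gap'
)
merged['Actual'] = merged['Actual'] + merged['Actual_gap'].fillna(0)
merged.drop(columns='Actual_gap').to_csv('after.csv', index=False)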

Merge & Filter Multiple Columns of One Dataframe with Boolean Logic

Objective: output buy/sell/neutral/error indicators to a single df column while filtering out the "False" values. The indicators are based on the dataframe column below and then formulated with boolean statements:
df['sma_10'] = pd.DataFrame(ta.SMA(df['close'], timeperiod=10), dtype=float, columns=['close'])
df['buy'] = pd.DataFrame(df['close'] > df['sma_10'], columns=['buy'])
df['buy'] = df['buy'].replace({True: 'BUY'})
df['sell'] = pd.DataFrame(df['close'] < df['sma_10'], columns=['sell'])
df['sell'] = df['sell'].replace({True: 'SELL'})
df['neutral'] = pd.DataFrame(df['close'] == df['sma_10'], columns=['neutral'])
df['neutral'] = df['neutral'].replace({True: 'NEUTRAL'})
df['error'] = pd.DataFrame((df['buy'] == False) & (df['sell'] == False) & (df['neutral'] == False), columns=['Error'])
df['error'] = df['error'].replace({True: 'ERROR'})
Current output of df:
buy    sell   neutral  error
False  False  False    ERROR
BUY    False  False    False
False  SELL   False    False
False  False  NEUTRAL  False
Desired output of df:
Indicator
ERROR
BUY
SELL
NEUTRAL
Attempts & Methods:
1st Method: Merging all the buy/sell/neutral/error columns and attempting to drop "False" values. Dataframe only iterates once before erroring out.
df['sma_10_indic']=[df['buy'].astype(str)+df['sell'].astype(str)+df['neutral'].astype(str)+df['error'].astype(str)].drop("False")
I have tried a subroutine of if & elif's such as:
This method also errors out before the first index
df['buy'] = pd.DataFrame(df['close'] > df['sma_10'])
df['sell'] = pd.DataFrame(df['close'] < df['sma_10'])
df['neutral'] = pd.DataFrame(df['close'] == df['sma_10'])
error = ((buy == False) and (sell == False) and (neutral == False))
if (df['buy'] == "True"):
    df['sma_10_indic'] = pd.DataFrame("BUY", columns=['indicator'])
elif (df['sell'] == "True"):
    df['sma_10_indic'] = pd.DataFrame("SELL", columns=['indicator'])
elif (df['neutral'] == "True"):
    df['sma_10_indic'] = pd.DataFrame("NEUTRAL", columns=['indicator'])
elif (error == True):
    df['sma_10_indic'] = pd.DataFrame("ERROR", columns=['indicator'])
I am unsure of the path ahead; I have been beating my head against the wall for about 14 hours on this one with no clear way forward. I have also tried creating a separate dataframe and merging them via concat, with no luck due to the booleans. I am relatively new to Python and pandas dataframes, so please be patient with me. Thank you in advance!
Use numpy.select:
m1 = df['close'] > df['sma_10']
m2 = df['close'] < df['sma_10']
m3 = df['close'] == df['sma_10']
df['Indicator'] = np.select([m1, m2, m3], ['BUY','SELL','NEUTRAL'], 'ERROR')
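A quick self-contained check of this np.select approach on toy values (the close/sma_10 numbers are made up; the NaN row shows where 'ERROR' comes from, since NaN comparisons are all False):

import numpy as np
import pandas as pd

df = pd.DataFrame({'close': [10.0, 8.0, 9.0, np.nan],
                   'sma_10': [9.0, 9.0, 9.0, 9.0]})
m1 = df['close'] > df['sma_10']
m2 = df['close'] < df['sma_10']
m3 = df['close'] == df['sma_10']
df['Indicator'] = np.select([m1, m2, m3], ['BUY', 'SELL', 'NEUTRAL'], 'ERROR')
print(df['Indicator'].tolist())   # ['BUY', 'SELL', 'NEUTRAL', 'ERROR']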

Python, operation on a big pandas DataFrame

I have a pandas DataFrame named joined with 5 fields:
product | price | percentil_25 | percentil_50 | percentil_75
For each row I want to classify the price like this:
if the price is at or below percentil_25, I give the product class 1, and so on.
So what I did is:
from collections import OrderedDict

classe_final = OrderedDict()
classe_final['sku'] = []
classe_final['class'] = []
for index in range(len(joined)):
    classe_final['sku'].append(joined.values[index][0])
    if float(joined.values[index][1]) <= float(joined.values[index][2]):
        classe_final['class'].append(1)
    elif float(joined.values[index][2]) < float(joined.values[index][1]) and float(joined.values[index][1]) <= float(joined.values[index][3]):
        classe_final['class'].append(2)
    elif float(joined.values[index][3]) < float(joined.values[index][1]) and float(joined.values[index][1]) <= float(joined.values[index][4]):
        classe_final['class'].append(3)
    else:
        classe_final['class'].append(4)
But as my DataFrame is quite big, it's taking forever.
Do you have any idea how I could do this quicker?
# build an empty df
df = pd.DataFrame()
# get a list of the unique products, could skip this perhaps
df['Product'] = other_df['Sku'].unique()
2 ways. First, define a func and call apply:
def classify(x):   # 'class' is a reserved word in Python, so the function needs another name
    if x.price < x.percentil_25:
        return 1
    elif x.price >= x.percentil_25 and x.price < x.percentil_50:
        return 2
    elif x.price >= x.percentil_50 and x.price < x.percentil_75:
        return 3
    elif x.price >= x.percentil_75:
        return 4

df['class'] = other_df.apply(lambda row: classify(row), axis=1)
Another way, which I think is better and will be much faster, is to add the 'class' column to your existing df using loc and then just take a view of the 2 columns of interest:
joined.loc[joined['price'] < joined['percentil_25'], 'class'] = 1
joined.loc[(joined['price'] >= joined['percentil_25']) & (joined['price'] < joined['percentil_50']), 'class'] = 2
joined.loc[(joined['price'] >= joined['percentil_50']) & (joined['price'] < joined['percentil_75']), 'class'] = 3
joined.loc[joined['price'] >= joined['percentil_75'], 'class'] = 4
classe_final = joined[['sku', 'class']]
Just for kicks you could use a load of np.where conditions:
classe_final['class'] = np.where(joined['price'] > joined['percentil_75'], 4,
                        np.where(joined['price'] > joined['percentil_50'], 3,
                        np.where(joined['price'] > joined['percentil_25'], 2, 1)))
This evaluates whether the price is greater than percentil_75; if so, class 4, otherwise it evaluates the next condition, and so on. It may be worth timing this against loc, but it is a lot less readable.
Another solution; if someone asked me to bet which one is the fastest, I'd go for this:
joined.set_index("product").eval(
    "1 + (price > percentil_25)"
    " + (price > percentil_50)"
    " + (price > percentil_75)"
)
Each satisfied threshold bumps the result by one, and the leading 1 makes the classes run 1-4 with the same <=/< boundaries as the original loop.
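The same boolean-sum idea works in plain pandas without eval, in case numexpr isn't available; a minimal sketch using the question's column names ('product' assumed to be the sku column):

joined['class'] = (
    1
    + (joined['price'] > joined['percentil_25']).astype(int)
    + (joined['price'] > joined['percentil_50']).astype(int)
    + (joined['price'] > joined['percentil_75']).astype(int)
)
classe_final = joined[['product', 'class']]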
