Getting previous row values from within pandas apply() function - python

import pandas as pd


def greater_or_less(d):
    """Label one row by comparing d['current'] against d['previous'].

    Writes "Greater", "Less" or "Equal" into d['result'] and returns the
    (mutated) row. When either value is NaN — the first row after
    shift() — every comparison is False, so d['result'] keeps its
    default empty string.
    """
    if d['current'] > d['previous']:
        d['result'] = "Greater"
    elif d['current'] < d['previous']:
        d['result'] = "Less"
    elif d['current'] == d['previous']:
        d['result'] = "Equal"
    # No else branch needed: a NaN comparison falls through and the
    # pre-seeded "" is kept.
    return d


df = pd.DataFrame({'current': [1, 2, 2, 8, 7]})
# Duplicate the column with shifted values
df['previous'] = df['current'].shift(1)
df['result'] = ""
df = df.apply(greater_or_less, axis=1)
The result is:
current previous result
1 NaN
2 1 Greater
2 2 Equal
8 2 Greater
7 8 Less
I'd then drop the previous column as it's no longer needed. Ending up with:
current result
1
2 Greater
2 Equal
8 Greater
7 Less
How can I do this without adding the extra column?
What I'd like to do is learn how to reference the previous row's value from within the greater_or_less function.

Use diff() method:
import pandas as pd
import numpy as np

df = pd.DataFrame({'current': [1, 2, 2, 8, 7]})
# The sign of each consecutive difference maps one-to-one onto the
# labels; the first element has no predecessor so it stays NaN.
np.sign(df['current'].diff()).map({-1: "Less", 0: "Equal", 1: "Greater"})

Related

Create new dataframe from another dataframe

I've created a dataframe. I'd like to create a new dataframe depending on the current dataframe's conditions. My Python code is as follows:
# Ten rows: A counts 1..10, B is A scaled by 10.
df = pd.DataFrame({'A': list(range(1, 11)),
                   'B': list(range(10, 101, 10))})
df
A B
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50
5 6 60
6 7 70
7 8 80
8 9 90
9 10 100
import pywt
import numpy as np
import scipy.signal as signal
import matplotlib.pyplot as plt
from skimage.restoration import denoise_wavelet
# Asker's original code, quoted as posted (the scrape stripped the
# indentation). It tries to denoise column 'B' for rows selected on the
# first column.
# NOTE(review): wavelet_type is assigned but never used — 'sym8' is
# passed explicitly below.
wavelet_type='db6'
def new_df(df):
df0 = pd.DataFrame()
# BUG (the reported ValueError): `(df.iloc[:,0]>=1) & (df.iloc[:,0]<=3)`
# is a boolean Series, and `if` cannot reduce a whole Series to a single
# True/False — pandas raises "The truth value of a Series is ambiguous".
if (df.iloc[:,0]>=1) & (df.iloc[:,0]<=3):
df0['B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=1) & (df.iloc[:,0]<=3),'B'], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma='True')
elif (df.iloc[:,0]>=4) & (df.iloc[:,0]<=6):
df0['B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=4) & (df.iloc[:,0]<=6),'B'], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma='True')
else:
df0['B']=df.iloc[:,1]
return df0
I want a new dataframe that will denoise the values in column B that meet the conditions, but leave the remaining values alone and keep them in the new dataframe. My code gives me error message: ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). Could you please help me?
My desired output should look
A B
0 1 15*
1 2 25*
2 3 35*
3 4 45*
4 5 55*
5 6 65*
6 7 70
7 8 80
8 9 90
9 10 100
#* represents new values may be different when you get the result.
#this is just for a demo.
May be my code idea is wrong. Could you please help me?
(df.iloc[:,0]>=1) will return a pandas series of boolean values corresponding to which elements in the first column of df are greater than or equal to 1.
In the line
if (df.iloc[:,0]>=1) & (df.iloc[:,0]<=3):
you are hence trying to do boolean arithmetic with two pandas series which doesn't make sense.
Pandas gives you a hint in the error message as to what might solve the problem:
e.g. if you wanted to check whether any element in df.iloc[:,0] was greater than one, you could use (df.iloc[:,0]>=1).any() which would return a single bool that you could then compare with the result of (df.iloc[:,0]<=3).any().
Without more context to the problem or what you're trying to do, it is hard to help you further.
Note that since you are filtering the data while passing it to denoise_wavelet, you don't really need the if statements, but you should assign the returned value to the same "view" of the DataFrame. Here is my approach. It first copy the original DataFrame and replace the desired rows with the "denoised" data.
import numpy as np
import pandas as pd
import scipy.signal as signal
import matplotlib.pyplot as plt
from skimage.restoration import denoise_wavelet
wavelet_type='db6'  # NOTE(review): defined but unused — 'sym8' is passed below
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'B': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]})


def new_df(df):
    """Return a copy of `df` whose column 'B' is wavelet-denoised on the
    rows where the first column lies in [1, 3] and, separately, in
    [4, 6]; all other rows keep their original values.
    """
    df0 = df.copy()
    in_1_3 = (df.iloc[:, 0] >= 1) & (df.iloc[:, 0] <= 3)
    in_4_6 = (df.iloc[:, 0] >= 4) & (df.iloc[:, 0] <= 6)
    # Assign back through .loc so only the selected rows are replaced;
    # .values hands denoise_wavelet a plain array without the index.
    # NOTE(review): rescale_sigma expects a bool; the string 'True' is
    # merely truthy here — confirm against the skimage API.
    df0.loc[in_1_3, 'B'] = denoise_wavelet(
        df.loc[in_1_3, 'B'].values, method='BayesShrink', mode='soft',
        wavelet_levels=3, wavelet='sym8', rescale_sigma='True')
    df0.loc[in_4_6, 'B'] = denoise_wavelet(
        df.loc[in_4_6, 'B'].values, method='BayesShrink', mode='soft',
        wavelet_levels=3, wavelet='sym8', rescale_sigma='True')
    return df0


new_df(df)
However, I don't really know how denoise_wavelet works, so I can't say whether the result is correct, but the values from index 6 to 9 are left unchanged.
Updated
For applying for 2 or more columns:
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   'B1': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                   'B2': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                   'B3': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]})


def apply_denoise(col):
    """Denoise the slices of one column whose 'A' label is 1-3 or 4-6."""
    # With 'A' as the index, .loc label slices are inclusive at both
    # ends, so these cover A in {1, 2, 3} and {4, 5, 6}.
    col.loc[1:3] = denoise_wavelet(col.loc[1:3], method='BayesShrink',
                                   mode='soft', wavelet_levels=3,
                                   wavelet='sym8', rescale_sigma='True')
    col.loc[4:6] = denoise_wavelet(col.loc[4:6], method='BayesShrink',
                                   mode='soft', wavelet_levels=3,
                                   wavelet='sym8', rescale_sigma='True')
    return col


new_df = df.set_index('A').apply(apply_denoise)
new_df
Note that since you are always conditioning on column 'A' you can convert it to an index and make use of indexing to implement the condition. Then using apply you can call the function apply_denoise on each column, and it will return a new DataFrame with the resulting columns.

how do you divide each value from a pandas series in sequence

Hi I was trying to figure out how to divide values from a DataFrame. But here I made an example for pandas series
# Example series: each element (after the first two) is a multiple of
# its predecessor.
a = pd.Series([1, 2, 16, 64, 128, 360, 720])
a
-----------------
0 1
1 2
2 16
3 64
4 128
5 360
6 720
So is there any way I could divide a number in a given row by the value from the previous row?
0 2
1 8
2 4
3 2
4 2.8
5 2
Furthermore, I would also like to get an output like "if the value has doubled (the ratio equals 2), print the index".
Thank you for your help!
What it seems to me is that you are trying to divide a number in a given row by the one of the previous. This can be achieved using this code
import pandas as pd
import numpy as np

a = pd.Series([1, 2, 16, 64, 128, 360, 720])
# Each element divided by its predecessor (one element shorter than a).
division = pd.Series(np.divide(a.values[1:], a.values[:-1]))
# Positions whose ratio is exactly 2, encoded as the position value
# itself (0 elsewhere). np.arange replaces the original
# `[i for i in range(len(a)-1)]`, which was a needless copy of a range.
index = pd.Series(np.multiply(division == 2, np.arange(len(a) - 1)))
Note: your question is rather ill-posed. You didn't specify exactly what you wanted to achieve; I inferred it from the example. You also included an incorrect snippet of code. Please take care to write a clearer question next time.
Well I don't know what exactly you want to divide your pandas Series by, but you can do it like this :
# If you want to store a new result
b = a / value_to_divive_by
# or if you want to apply it directly to your serie
a /= value_to_divive_by
or using list comprehension
b = [int(nb / your_value_here) for nb in a]
# with a min value to do the division
b = [int(nb / your_value_here) for nb in a if nb > min_value]
There is probably other ways to do what you want, but there is two easy solutions

Keep upper n rows of a pandas dataframe based on condition

how would I delete all rows from a dataframe that come after a certain fulfilled condition? As an example I have the following dataframe:
import pandas as pd
# Target pair: the first row where (x, y) == (xEnd, yEnd) marks the cut.
xEnd = 1
yEnd = 2
df = pd.DataFrame({'x': [1, 1, 1, 2, 2, 2],
                   'y': [1, 2, 3, 3, 4, 3],
                   'id': [0, 1, 2, 3, 4, 5]})
How would i get a dataframe that deletes the last 4 rows and keeps the upper 2 as in row 2 the condition x=xEnd and y=yEnd is fulfilled.
EDITED: should have mentioned that the dataframe is not necessarily ascending. Could also be descending and i still would like to get the upper ones.
To slice your dataframe until the first time a condition across 2 series are satisfied, first calculate the required index and then slice via iloc.
You can calculate the index via set_index, isin and np.ndarray.argmax:
# Position of the first "matching" row, then slice up to and including it.
# NOTE(review): set_index(['x', 'y']) leaves only the 'id' column, so
# isin((xEnd, yEnd)) actually tests *id* values for membership in
# {1, 2}, not the (x, y) pair; argmax returns the first True. That
# happens to give row 1 for this example — verify on other data.
idx = df.set_index(['x', 'y']).isin((xEnd, yEnd)).values.argmax()
res = df.iloc[:idx+1]
print(res)
x y id
0 1 1 0
1 1 2 1
If you need better performance, see Efficiently return the index of the first value satisfying condition in array.
not 100% sure i understand correctly, but you can filter your dataframe like this:
# Value filter, not a positional cut: keeps every row with x <= xEnd and
# y <= yEnd, which matches the desired output only because the example
# data is ascending (see the caveat in the question's edit).
df[(df.x <= xEnd) & (df.y <= yEnd)]
this yields the dataframe:
id x y
0 0 1 1
1 1 1 2
If x and y are not strictly increasing and you want whats above the line that satisfy condition:
# Keep rows at or above the (first) row where x == xEnd and y == yEnd,
# by comparing the index against that row's label.
df[df.index <= (df[(df.x == xEnd) & (df.y == yEnd)]).index.tolist()]
# Positional alternative: the scraped line `df.iloc[[0:yEnd-1],[:]]` is
# not valid Python (a slice cannot appear inside a list literal); plain
# iloc slicing keeps the first yEnd (= 2) rows and every column.
df = df.iloc[0:yEnd, :]
Select just the first two rows, keep all columns, and put the result in a new dataframe.
Or you can reuse the same variable name instead.

GroupBy aggregate count based on specific column

I've been looking for a few hours and can't seem to find a topic related to that exact matter.
So basically, I want to apply on a groupby to find something else than the mean. My groupby returns two columns 'feature_name' and 'target_name', and I want to replace the value in 'target_name' by something else : the number of occurences of 1, of 0, the difference between both, etc.
print(df[[feature_name, target_name]])
When I print my dataframe with the column I use, I get the following : screenshot
I already have the following code to compute the mean of 'target_name' for each value of 'feature_name':
df[[feature_name, target_name]].groupby([feature_name],as_index=False).mean()
Which returns : this.
And I want to compute different things than the mean. Here are the values I want to compute in the end : what I want
In my case, the feature 'target_name' will always be equal to either 1 or 0 (with 1 being 'good' and 0 'bad').
I have seen this example from an answer.:
df.groupby(['catA', 'catB'])['scores'].apply(lambda x: x[x.str.contains('RET')].count())
But I don't know how to apply this to my case as x would be simply an int.
And after solving this issue, I still need to compute more than just the count!
Thanks for reading ☺
import pandas as pd
import numpy as np


def my_func(x):
    """Aggregate one group (or column) into three named metrics.

    Returns a pandas Series so that groupby().apply() spreads the
    metrics over a MultiIndex; .unstack() then turns them into columns.
    """
    # Create your 3 metrics here
    calc1 = x.min()
    calc2 = x.max()
    calc3 = x.sum()
    # A Series (rather than a tuple) keeps the metric names attached.
    return pd.Series(dict(metric1=calc1, metric2=calc2, metric3=calc3))


# Apply the function you created; template kept as a comment because the
# original line called groupby(...) on an undefined `df` placeholder:
# df.groupby(...)['columns needed to calculate formulas'].apply(my_func).unstack()
Optionally, using .unstack() at the end allows you to see all your 3 metrics as column headers
As an example:
df
Out[]:
Names A B
0 In 0.820747 0.370199
1 Out 0.162521 0.921443
2 In 0.534743 0.240836
3 Out 0.910891 0.096016
4 In 0.825876 0.833074
5 Out 0.546043 0.551751
6 In 0.305500 0.091768
7 Out 0.131028 0.043438
8 In 0.656116 0.562967
9 Out 0.351492 0.688008
10 In 0.410132 0.443524
11 Out 0.216372 0.057402
12 In 0.406622 0.754607
13 Out 0.272031 0.721558
14 In 0.162517 0.408080
15 Out 0.006613 0.616339
16 In 0.313313 0.808897
17 Out 0.545608 0.445589
18 In 0.353636 0.465455
19 Out 0.737072 0.306329
df.groupby('Names')['A'].apply(my_func).unstack()
Out[]:
metric1 metric2 metric3
Names
In 0.162517 0.825876 4.789202
Out 0.006613 0.910891 3.879669

Calculating value from a group of cells with Pandas

I am trying to read a csv file of horse track information.
I am attempting to code for the post positions (col 3) in race 1 the max value for the field qpts (col 210). I have spend days on researching this and can find no clear answer on web or youtube.
When I run the code below, I get "The truth value of a Series is ambiguous....."
# Asker's original code, quoted as posted (indentation lost in the scrape).
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',100)
df = pd.read_csv('track.csv', header=None, na_values=['.'])
# NOTE(review): `index` and `columns` are built but never used below.
index = list(range(0,200,1))
columns = list(range(0,1484,1))
# BUG (the reported error): df.ix[2] == 1 yields a whole boolean Series,
# which `if` cannot reduce to a single True/False — hence "The truth
# value of a Series is ambiguous". (`.ix` is also long gone from pandas;
# .loc/.iloc replaced it.)
if df.ix[2]== 1:
qpts = (df.max([210]))
print (qpts)
The problem is with `if df.ix[2] == 1`. The expression df.ix[2] == 1 will return a pd.Series of truth values. By putting an if in front, you are attempting to evaluate a whole series of values as either True or False, which is what is throwing the error.
There are several ways to produce a series where the value is 210 and the indices are those where df.ix[2] == 1
This is one way
pd.Series(210, df.index[df.ix[2] == 1])
Here df.ix[2]== 1 is going to return a Series. You need to use a function such as .any() or .all() to combine the Series into a single value which you can do a truth statement upon. For example,
# Answerer's variant: reduce the boolean Series to one bool before `if`.
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',100)
df = pd.read_csv('track.csv', header=None, na_values=['.'])
index = list(range(0,200,1))
columns = list(range(0,1484,1))
# NOTE(review): Series.any() takes no axis=1 — presumably .any() alone
# was meant; `.ix` is removed in modern pandas (use .loc/.iloc), and
# df.max([210]) passes a list where an axis is expected — verify intent.
if (df.ix[2]== 1).any(axis=1):
qpts = (df.max([210]))
print (qpts)
In the case above we are checking to see if any of the Series elements are equal to 1. If so then the the if statement will be implemented. If we do not do this then we could have a situation as follows:
print(df)
Out[1]:
1 3
2 7
3 1
4 5
5 6
print(df.ix[2]== 1)
Out[2]:
1 False
2 False
3 True
4 False
5 False
Therefore the Series would be both simultaneously True and False.

Categories