Create new dataframe from another dataframe - python

I've created a dataframe. I'd like to create a new dataframe based on conditions applied to the current one. My Python code is as follows:
df = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],'B':[10,20,30,40,50,60,70,80,90,100]})
df
A B
0 1 10
1 2 20
2 3 30
3 4 40
4 5 50
5 6 60
6 7 70
7 8 80
8 9 90
9 10 100
import pywt
import numpy as np
import scipy.signal as signal
import matplotlib.pyplot as plt
from skimage.restoration import denoise_wavelet
wavelet_type='db6'
def new_df(df):
    df0 = pd.DataFrame()
    if (df.iloc[:,0]>=1) & (df.iloc[:,0]<=3):
        df0['B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=1) & (df.iloc[:,0]<=3),'B'], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    elif (df.iloc[:,0]>=4) & (df.iloc[:,0]<=6):
        df0['B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=4) & (df.iloc[:,0]<=6),'B'], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    else:
        df0['B'] = df.iloc[:,1]
    return df0
I want a new dataframe that denoises the values in column B that meet the conditions, but leaves the remaining values alone and keeps them in the new dataframe. My code gives me the error message:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Could you please help me?
My desired output should look like this:
A B
0 1 15*
1 2 25*
2 3 35*
3 4 45*
4 5 55*
5 6 65*
6 7 70
7 8 80
8 9 90
9 10 100
# * represents new values; they may be different when you get the result.
# This is just for a demo.
Maybe my approach is wrong. Could you please help me?

(df.iloc[:,0]>=1) will return a pandas Series of boolean values corresponding to which elements in the first column of df are greater than or equal to 1.
In the line
if (df.iloc[:,0]>=1) & (df.iloc[:,0]<=3):
the & itself is fine (it combines the two boolean Series element-wise), but the result is still a Series, and Python's if statement cannot decide whether an entire Series of True/False values should count as true or false. That is what raises the ValueError.
Pandas gives you a hint in the error message as to what might solve the problem:
For example, if you wanted to check whether any element in df.iloc[:,0] was greater than or equal to one, you could use (df.iloc[:,0]>=1).any(), which returns a single bool that you can then combine with the result of (df.iloc[:,0]<=3).any().
Without more context to the problem or what you're trying to do, it is hard to help you further.
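To make the distinction concrete, here is a minimal sketch (using the df from the question) of the Series a comparison returns versus the scalars that any()/all() reduce it to:
import pandas as pd
df = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],'B':[10,20,30,40,50,60,70,80,90,100]})
mask = (df.iloc[:,0] >= 1) & (df.iloc[:,0] <= 3)  # element-wise AND of two boolean Series
print(mask.any())  # True  -> at least one row satisfies the condition
print(mask.all())  # False -> not every row satisfies it
# 'if mask:' would raise the ambiguity ValueError, because a whole Series has no single truth value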

Note that since you are filtering the data while passing it to denoise_wavelet, you don't really need the if statements, but you should assign the returned values back to the same "view" of the DataFrame. Here is my approach: it first copies the original DataFrame and then replaces the desired rows with the "denoised" data.
import numpy as np
import pandas as pd
import scipy.signal as signal
import matplotlib.pyplot as plt
from skimage.restoration import denoise_wavelet
wavelet_type='db6'
df = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],'B':[10,20,30,40,50,60,70,80,90,100]})
def new_df(df):
    df0 = df.copy()
    df0.loc[(df.iloc[:,0]>=1) & (df.iloc[:,0]<=3),'B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=1) & (df.iloc[:,0]<=3),'B'].values, method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    df0.loc[(df.iloc[:,0]>=4) & (df.iloc[:,0]<=6),'B'] = denoise_wavelet(df.loc[(df.iloc[:,0]>=4) & (df.iloc[:,0]<=6),'B'].values, method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    return df0
new_df(df)
However, I don't really know how denoise_wavelet works, so I can't say whether the result is correct, but the values from index 6 to 9 are left unchanged.
Update
To apply it to 2 or more columns:
df = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],
'B1':[10,20,30,40,50,60,70,80,90,100],
'B2':[10,20,30,40,50,60,70,80,90,100],
'B3':[10,20,30,40,50,60,70,80,90,100]})
def apply_denoise(col):
    col.loc[1:3] = denoise_wavelet(col.loc[1:3], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    col.loc[4:6] = denoise_wavelet(col.loc[4:6], method='BayesShrink', mode='soft', wavelet_levels=3, wavelet='sym8', rescale_sigma=True)
    return col
new_df = df.set_index('A').apply(apply_denoise)
new_df
Note that since you are always conditioning on column 'A', you can convert it to the index and use label-based slicing to implement the conditions. Then, using apply, you can call apply_denoise on each column, and it will return a new DataFrame with the resulting columns.
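If you end up with more than two row ranges, here is a sketch of one way to generalize this (denoise_ranges is a hypothetical helper, not part of any library; it assumes, as above, that the ranges are inclusive bounds on column 'A'):
import pandas as pd
from skimage.restoration import denoise_wavelet
df = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],'B':[10,20,30,40,50,60,70,80,90,100]})
def denoise_ranges(df, ranges):
    # 'ranges' is a list of inclusive (lo, hi) bounds on column 'A'
    df0 = df.copy()
    for lo, hi in ranges:
        mask = (df0['A'] >= lo) & (df0['A'] <= hi)
        # denoise only the masked rows, leaving the rest untouched
        df0.loc[mask, 'B'] = denoise_wavelet(df0.loc[mask, 'B'].values.astype(float),
                                             method='BayesShrink', mode='soft',
                                             wavelet_levels=3, wavelet='sym8',
                                             rescale_sigma=True)
    return df0
denoise_ranges(df, [(1, 3), (4, 6)])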

Related

I want to filter data with &, but got a wrong result

Here is my dataframe. I want to set the values that satisfy busSpeed < 3 and sationUuid == 1 at the same time to NaN, but I got a wrong result. Can anyone help? Thanks.
Below is my code:
import pandas as pd
import numpy as np
df=pd.read_excel('d:gps/9-20-32-36574.xlsx')
df.sationUuid.fillna(method='bfill',inplace=True)
df.dropna(subset=['sationUuid'],inplace=True)
df1=list()
for i in range(len(df)):
    if (df.sationUuid[i]==1) & (df.busSpeed[i]<3):
        df1.append(df.replace(df.busSpeed[i], np.NAN))
(Screenshots of the data before processing and of the result were attached.)
Rows where sationUuid is greater than 1 were also set to NaN; the if statement seems to only apply busSpeed < 3. How did this happen?
You can do this with a Pandas boolean mask rather than a loop and if block, which also has the added benefit of being much more efficient. Note that the real culprit in your loop is df.replace(df.busSpeed[i], np.NAN): replace substitutes every cell in the whole DataFrame that equals that value, which is why rows you never intended to touch were also set to NaN. To replace only the values satisfying your condition, use assignment with .loc.
Here's an example:
import pandas as pd
import numpy as np
df = pd.read_excel('d:gps/9-20-32-36574.xlsx')
df.sationUuid.fillna(method='bfill',inplace=True)
df.dropna(subset=['sationUuid'],inplace=True)
mask = (df.sationUuid == 1) & (df.busSpeed < 3)
df.loc[mask, 'busSpeed'] = np.nan
So if df looks like the following after reading in from excel:
sationUuid busSpeed
1 1.5
2 1
1 100
3 10
Then df will look like this after the rest of the script:
sationUuid busSpeed
1 NaN
2 1.0
1 100.0
3 10.0
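Here is a minimal, self-contained version of that example (with an inline DataFrame standing in for the excel file):
import pandas as pd
import numpy as np
df = pd.DataFrame({'sationUuid': [1, 2, 1, 3],
                   'busSpeed': [1.5, 1, 100, 10]})
mask = (df.sationUuid == 1) & (df.busSpeed < 3)  # boolean Series, True only for row 0
df.loc[mask, 'busSpeed'] = np.nan
print(df)
#    sationUuid  busSpeed
# 0           1       NaN
# 1           2       1.0
# 2           1     100.0
# 3           3      10.0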

GroupBy aggregate count based on specific column

I've been looking for a few hours and can't seem to find a topic related to that exact matter.
So basically, I want to apply something other than the mean on a groupby. My groupby returns two columns, 'feature_name' and 'target_name', and I want to replace the value in 'target_name' with something else: the number of occurrences of 1, of 0, the difference between both, etc.
print(df[[feature_name, target_name]])
When I print my dataframe with the columns I use, I get the following: (screenshot)
I already have the following code to compute the mean of 'target_name' for each value of 'feature_name':
df[[feature_name, target_name]].groupby([feature_name],as_index=False).mean()
Which returns: (screenshot).
And I want to compute different things than the mean. Here are the values I want to compute in the end: (screenshot).
In my case, the feature 'target_name' will always be equal to either 1 or 0 (with 1 being 'good' and 0 'bad').
I have seen this example from an answer:
df.groupby(['catA', 'catB'])['scores'].apply(lambda x: x[x.str.contains('RET')].count())
But I don't know how to apply this to my case, as x would simply be an int.
And after solving this issue, I would still need to compute more than just the count!
Thanks for reading ☺
import pandas as pd
import numpy as np
def my_func(x):
    # Create your 3 metrics here
    calc1 = x.min()
    calc2 = x.max()
    calc3 = x.sum()
    # Return a pandas Series
    return pd.Series(dict(metric1=calc1, metric2=calc2, metric3=calc3))
# Apply the function you created
df.groupby(...)['columns needed to calculate formulas'].apply(my_func).unstack()
Using .unstack() at the end pivots the resulting MultiIndexed Series so that your 3 metrics appear as column headers.
As an example:
df
Out[]:
Names A B
0 In 0.820747 0.370199
1 Out 0.162521 0.921443
2 In 0.534743 0.240836
3 Out 0.910891 0.096016
4 In 0.825876 0.833074
5 Out 0.546043 0.551751
6 In 0.305500 0.091768
7 Out 0.131028 0.043438
8 In 0.656116 0.562967
9 Out 0.351492 0.688008
10 In 0.410132 0.443524
11 Out 0.216372 0.057402
12 In 0.406622 0.754607
13 Out 0.272031 0.721558
14 In 0.162517 0.408080
15 Out 0.006613 0.616339
16 In 0.313313 0.808897
17 Out 0.545608 0.445589
18 In 0.353636 0.465455
19 Out 0.737072 0.306329
df.groupby('Names')['A'].apply(my_func).unstack()
Out[]:
metric1 metric2 metric3
Names
In 0.162517 0.825876 4.789202
Out 0.006613 0.910891 3.879669
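As a side note, if you are on pandas 0.25 or newer, named aggregation can produce the same table without a custom function (a sketch, assuming built-in aggregations like min/max/sum are all you need):
# each keyword names an output column and maps to an aggregation
df.groupby('Names')['A'].agg(metric1='min', metric2='max', metric3='sum')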

Creating a new column depending on the equality of two other columns [duplicate]

This question already has an answer here:
python pandas : compare two columns for equality and result in third dataframe
(1 answer)
Closed last month.
I want to compare the values of two columns and create a new column bin_crnn: I want 1 if they are equal, or 0 if not.
# coding: utf-8
import pandas as pd
df = pd.read_csv('file.csv',sep=',')
if df['crnn_pred']==df['manual_raw_value']:
    df['bin_crnn']=1
else:
    df['bin_crnn']=0
I got the following error:
if df['crnn_pred']==df['manual_raw_value']:
File "/home/ahmed/anaconda3/envs/cv/lib/python2.7/site-packages/pandas/core/generic.py", line 917, in __nonzero__
.format(self.__class__.__name__))
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
You need to cast the boolean mask to int with astype:
df['bin_crnn'] = (df['crnn_pred']==df['manual_raw_value']).astype(int)
Sample:
df = pd.DataFrame({'crnn_pred':[1,2,5], 'manual_raw_value':[1,8,5]})
print (df)
crnn_pred manual_raw_value
0 1 1
1 2 8
2 5 5
print (df['crnn_pred']==df['manual_raw_value'])
0 True
1 False
2 True
dtype: bool
df['bin_crnn'] = (df['crnn_pred']==df['manual_raw_value']).astype(int)
print (df)
crnn_pred manual_raw_value bin_crnn
0 1 1 1
1 2 8 0
2 5 5 1
You get the error because comparing the columns does not output a scalar, but a Series (array) of True and False values.
So you need all() or any() to reduce it to a scalar True or False.
I think this answer explains it better.
One fast approach is to use np.where.
import numpy as np
df['test'] = np.where(df['crnn_pred']==df['manual_raw_value'], 1, 0)
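With the sample df above (crnn_pred = [1, 2, 5], manual_raw_value = [1, 8, 5]), this sets test to [1, 0, 1].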
No need for a loop or an if statement, just set the new column using a boolean mask with .loc (assigning via df['bin_crnn'].loc[...] would be chained assignment, which pandas warns about):
df.loc[df['crnn_pred'] == df['manual_raw_value'], 'bin_crnn'] = 1
df['bin_crnn'] = df['bin_crnn'].fillna(0)
Another quick way, using just Pandas and not NumPy:
df['bin_crnn'] = df.apply(lambda x: int(x['crnn_pred'] == x['manual_raw_value']), axis=1)
You are comparing 2 columns row by row; try this:
bin_crnn = []
for index, row in df.iterrows():
    if row['crnn_pred'] == row['manual_raw_value']:
        bin_crnn.append(1)
    else:
        bin_crnn.append(0)
df['bin_crnn'] = bin_crnn

Calculating value from a group of cells with Pandas

I am trying to read a csv file of horse track information.
I am attempting to find, for the post positions (col 3) in race 1, the max value of the field qpts (col 210). I have spent days researching this and can find no clear answer on the web or YouTube.
When I run the code below, I get "The truth value of a Series is ambiguous....."
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',100)
df = pd.read_csv('track.csv', header=None, na_values=['.'])
index = list(range(0,200,1))
columns = list(range(0,1484,1))
if df.ix[2]== 1:
    qpts = (df.max([210]))
    print (qpts)
The problem is with if df.ix[2] == 1. The expression df.ix[2] == 1 will return a pd.Series of truth values. By putting an if in front, you are attempting to evaluate an entire Series of values as either True or False, which is what throws the error.
There are several ways to produce a Series where the value is 210 and the indices are those where df.ix[2] == 1.
This is one way
pd.Series(210, df.index[df.ix[2] == 1])
Here df.ix[2] == 1 is going to return a Series. You need a function such as .any() or .all() to reduce the Series to a single value on which you can do a truth statement. For example:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',100)
df = pd.read_csv('track.csv', header=None, na_values=['.'])
index = list(range(0,200,1))
columns = list(range(0,1484,1))
if (df.ix[2]== 1).any():
    qpts = (df.max([210]))
    print (qpts)
In the case above we are checking whether any of the Series elements are equal to 1. If so, the if statement body will run. If we did not do this, we could have a situation as follows:
print(df.ix[2])
Out[1]:
1    3
2    7
3    1
4    5
5    6
print(df.ix[2]== 1)
Out[2]:
1    False
2    False
3     True
4    False
5    False
Therefore the comparison is True for some elements and False for others, so the Series as a whole has no single truth value.

Getting previous row values from within pandas apply() function

import pandas as pd
def greater_or_less(d):
    if d['current'] > d['previous']:
        d['result'] = "Greater"
    elif d['current'] < d['previous']:
        d['result'] = "Less"
    elif d['current'] == d['previous']:
        d['result'] = "Equal"
    else:
        pass
    return d
df=pd.DataFrame({'current':[1,2,2,8,7]})
# Duplicate the column with shifted values
df['previous']=df['current'].shift(1)
df['result']=""
df=df.apply(greater_or_less,axis=1)
The result is:
   current  previous   result
0        1       NaN
1        2       1.0  Greater
2        2       2.0    Equal
3        8       2.0  Greater
4        7       8.0     Less
I'd then drop the previous column as it's no longer needed, ending up with:
   current   result
0        1
1        2  Greater
2        2    Equal
3        8  Greater
4        7     Less
How can I do this without adding the extra column?
What I'd like to know is how to reference the previous row's value from within the greater_or_less function.
Use the diff() method:
import pandas as pd
import numpy as np
df=pd.DataFrame({'current':[1,2,2,8,7]})
np.sign(df.current.diff()).map({1:"Greater", 0:"Equal", -1:"Less"})
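To attach the result as a column and reproduce the desired output (the first row maps to NaN because there is no previous value to compare against):
df['result'] = np.sign(df.current.diff()).map({1: "Greater", 0: "Equal", -1: "Less"})
print(df)
#    current   result
# 0        1      NaN
# 1        2  Greater
# 2        2    Equal
# 3        8  Greater
# 4        7     Less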
