I'm searching for a function that returns the position of an element in a DataFrame.
- there are duplicates among the values in the DataFrame
- the DataFrame is about 10 x 2000
- the function will be applied to the DataFrame using applymap()
# initial dataframe
import pandas
df = pandas.DataFrame({"R1": [8,2,3], "R2": [2,3,4], "R3": [-3,4,-1]})
Example:
get_position(2) is not clear, as the value 2 could sit in either "R1" or "R2". I am
wondering if there is another way for Python to know which position the
element holds - possibly during the applymap() operation.
Edit:
df.rank(axis=1,pct=True)
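For reference, on the initial frame this produces the following row-wise percentage ranks:
print (df.rank(axis=1,pct=True))
         R1        R2        R3
0  1.000000  0.666667  0.333333
1  0.333333  0.666667  1.000000
2  0.666667  1.000000  0.333333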
EDIT2:
# initial dataframe
df_initial = pandas.DataFrame({"R1": [8,2,3], "R2": [2,3,4], "R3": [-3,4,-1]})
Step 1)
df_rank = df_initial.rank(axis=1,pct=True)
Step 2)
# Building groups based on the percentage of the respective value
def function103(x):
    if 0.0 <= x <= 0.1:
        P1.append(get_column_name1(x))
        return x
    elif 0.1 < x <= 0.2:
        P2.append(get_column_name1(x))
        return x
    elif 0.2 < x <= 0.3:
        P3.append(get_column_name1(x))
        return x
    elif 0.3 < x <= 0.4:
        P4.append(get_column_name1(x))
        return x
    elif 0.4 < x <= 0.5:
        P5.append(get_column_name1(x))
        return x
    elif 0.5 < x <= 0.6:
        P6.append(get_column_name1(x))
        return x
    elif 0.6 < x <= 0.7:
        P7.append(get_column_name1(x))
        return x
    elif 0.7 < x <= 0.8:
        P8.append(get_column_name1(x))
        return x
    elif 0.8 < x <= 0.9:
        P9.append(get_column_name1(x))
        return x
    elif 0.9 < x <= 1.0:
        P10.append(get_column_name1(x))
        return x
    else:
        return x
Step 3)
# trying to get the column name of the respective value
# my idea was to determine the position of each value and then write a function
def get_column_name1(x):
    # to return the value's column name
    ...
Step 4)
# apply the function
P1=[]
P2=[]
P3=[]
P4=[]
P5=[]
P6=[]
P7=[]
P8=[]
P9=[]
P10=[]
P11=[]
df_rank.applymap(function103).head()
If you need index or column names by value in a DataFrame, use numpy.where to get the positions and then select the corresponding index or column values from the converted numpy arrays:
import numpy as np
import pandas as pd

df = pd.DataFrame({"R1": [8,2,3], "R2": [2,3,4], "R3": [-3,4,-1]})
i, c = np.where(df == 2)
print (i, c)
[0 1] [1 0]
print (df.index.values[i])
[0 1]
print (df.columns.values[c])
['R2' 'R1']
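If you want to wrap this into a small helper similar to the get_position the question asks for (a sketch; the function name and return format are my own choice), it could look like this:
def get_position(df, value):
    # return all (index, column) pairs where df equals value
    i, c = np.where(df == value)
    return list(zip(df.index[i], df.columns[c]))

print (get_position(df, 2))
[(0, 'R2'), (1, 'R1')]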
EDIT:
i, c = np.where(df == 2)
df1 = df.rank(axis=1,pct=True)
print (df1)
R1 R2 R3
0 1.000000 0.666667 0.333333
1 0.333333 0.666667 1.000000
2 0.666667 1.000000 0.333333
print (df1.iloc[i, c])
R2 R1
0 0.666667 1.000000
1 0.666667 0.333333
print (df1.where(df == 2).dropna(how='all').dropna(how='all', axis=1))
R1 R2
0 NaN 0.666667
1 0.333333 NaN
Or:
out = df1.stack()[df.stack() == 2].rename_axis(('idx','cols')).reset_index(name='val')
print (out)
idx cols val
0 0 R2 0.666667
1 1 R1 0.333333
EDIT:
Solution for your function - iterate over a one-column DataFrame created by reshaping, and extract Series.name, which is the same as the column name:
def get_column_name1(x):
    return x.name
P1=[]
P2=[]
P3=[]
P4=[]
P5=[]
P6=[]
P7=[]
P8=[]
P9=[]
P10=[]
P11=[]
def function103(x):
    if 0.0 <= x[0] <= 0.1:
        P1.append(get_column_name1(x))
        return x
    elif 0.1 < x[0] <= 0.2:
        P2.append(get_column_name1(x))
        return x
    elif 0.2 < x[0] <= 0.3:
        P3.append(get_column_name1(x))
        return x
    elif 0.3 < x[0] <= 0.4:
        P4.append(get_column_name1(x))
        return x
    elif 0.4 < x[0] <= 0.5:
        P5.append(get_column_name1(x))
        return x
    elif 0.5 < x[0] <= 0.6:
        P6.append(get_column_name1(x))
        return x
    elif 0.6 < x[0] <= 0.7:
        P7.append(get_column_name1(x))
        return x
    elif 0.7 < x[0] <= 0.8:
        P8.append(get_column_name1(x))
        return x
    elif 0.8 < x[0] <= 0.9:
        P9.append(get_column_name1(x))
        return x
    elif 0.9 < x[0] <= 1.0:
        P10.append(get_column_name1(x))
        return x
    else:
        return x
a = df_rank.stack().reset_index(level=0, drop=True).to_frame().apply(function103, axis=1)
print (P4)
['R3', 'R1', 'R3']
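Why this works (an explanatory note added here): stack() turns the frame into a Series indexed by (row, column); reset_index(level=0, drop=True) drops the row level, so the column labels become the index of the one-column frame, and each row passed to function103 by apply(axis=1) is a Series whose .name is the column the value came from. The intermediate frame looks roughly like this:
print (df_rank.stack().reset_index(level=0, drop=True).to_frame())
           0
R1  1.000000
R2  0.666667
R3  0.333333
R1  0.333333
R2  0.666667
R3  1.000000
R1  0.666667
R2  1.000000
R3  0.333333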
Related
I would like to create the following dataframe:
df = pd.DataFrame({
'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0'],
'Step_ID': ['Step_1','Step_1','Step_1','Step_2','Step_2','Step_2','Step_2','Step_3','Step_3','Step_4','Step_4','Step_5','Step_5','Step_5','Step_6','Step_6','Step_7']})
print (df)
What I have is column A, and based on its values I would like to set the values in column Step_ID.
Step_ID begins with Step_1. Then, once the numbers become bigger than 0, those rows get Step_2 (for all the numbers bigger than 0, until zero values are reached again). The following zero values should then get Step_3, and so on.
# add a Step ID
import pandas as pd

df = pd.DataFrame({
    'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})

step = 0
value = None

def get_step(x):
    global step
    global value
    if x != value:
        value = x
        step += 1
    return f'Step_{step}'

df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('test.csv', index=None)
The code above does something similar, but it starts a new step whenever the value changes at all. Should there be one more "if" - something like if value > 0 - in order to get the desired behaviour?
I can see you implemented an XOR-like gate, but it needs some customisation; I have added a new function, check.
import pandas as pd

df = pd.DataFrame({
    'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})

step = 0
value = None

def check(x, y):
    # a step boundary is where exactly one of the two consecutive values is zero
    try:
        x = float(x)
        y = float(y)
        if x == 0 and y == 0:
            return 0
        elif x == 0 and y > 0:
            return 1
        elif x > 0 and y == 0:
            return 1
        else:
            return 0
    except:
        return 1

def get_step(x):
    global step
    global value
    # if x != value:
    if check(x, value):
        step += 1
    value = x
    return f'Step_{step}'

df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('GSH0211.csv', index=None)
Try this. You can adjust the threshold to the value you want.
df = pd.DataFrame({'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
df['A'] = df['A'].astype(float)
diff = df['A']-df['A'].shift().fillna(0)
threshold = 0.1
df['Step_ID'] = (abs(diff)>threshold).cumsum().add(1)
df['Step_ID'] = 'Step_' + df['Step_ID'].astype(str)
df
A Step_ID
0 0.000000 Step_1
1 0.000000 Step_1
2 0.000000 Step_1
3 8.020833 Step_2
4 8.009259 Step_2
5 8.003472 Step_2
6 8.020833 Step_2
7 0.000000 Step_3
8 0.000000 Step_3
9 5.000000 Step_4
10 4.994213 Step_4
11 0.000000 Step_5
12 0.000000 Step_5
13 0.000000 Step_5
14 8.012153 Step_6
15 8.009259 Step_6
16 0.000000 Step_7
I've got a function f(a, b) that takes two pandas DataFrames and applies different formulas to their values, like this:
def f(a, b):
    if a > 0 and b > 0:
        return a + b
    elif a > 0 and b < 0:
        return a - b
    elif a < 0 and b > 0:
        return a * b
    elif a < 0 and b < 0:
        return a / b
    else:
        print('bad')

dfa = pd.DataFrame({'a':[1, 1]})
dfb = pd.DataFrame({'b':[2, 2]})
f(dfa, dfb)
The particular issue here is that I need the current value being processed inside the function in order to branch; however, using the and operator leads to
"The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()"
and using & leads to a
"cannot compare [type] array with a scalar of type [bool]"
Edit:
Considering the answers, I am starting to realize that my minimal example might not convey my intention very well.
def f(a, b):
    if a > 0 and b > 0:
        X = operationA()
    elif a > 0 and b < 0:
        X = operationB()
    elif a < 0 and b > 0:
        X = operationC()
    elif a < 0 and b < 0:
        X = operationD()
    else:
        print('bad')
    Y = operationY()
    return X, Y

# both dataframes are part of a training example: label example = (a, b)
df_label_partA = pd.DataFrame({'a':[1, 1, -1, -1]})
df_label_partB = pd.DataFrame({'b':[1, -1, 1, -1]})
f(df_label_partA, df_label_partB)
The DataFrames can't be considered separately, as each is part of a list of labels (basically a list of tuples split up into two lists).
IIUC:
pd.concat([dfa,dfb], axis=1).apply(lambda x: f(*x), axis=1)
Outputs:
0 3
1 3
dtype: int64
You can try this
def f(a, b):
    if all(a > 0) and all(b > 0):
        return dfa.a + dfb.b
    elif all(a > 0) and all(b < 0):
        return dfa.a - dfb.b
    elif all(a < 0) and all(b > 0):
        return dfa.a * dfb.b
    elif all(a < 0) and all(b < 0):
        return dfa.a / dfb.b
    else:
        print('bad')

dfa = pd.DataFrame({'a':[1, 1]})
dfb = pd.DataFrame({'b':[2, 2]})
f(dfa, dfb)
output
0 3
1 3
dtype: int64
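As an aside (not part of the original answers), the element-wise branching described in the edit can also be written without apply by using numpy.select, which picks one of several alternatives per element. A sketch, assuming the operations can be expressed as vectorized formulas:
import numpy as np
import pandas as pd

dfa = pd.DataFrame({'a': [1, 1, -1, -1]})
dfb = pd.DataFrame({'b': [1, -1, 1, -1]})

a = dfa['a']
b = dfb['b'].values  # align by position, not by column label

conditions = [(a > 0) & (b > 0), (a > 0) & (b < 0), (a < 0) & (b > 0), (a < 0) & (b < 0)]
choices = [a + b, a - b, a * b, a / b]

result = np.select(conditions, choices, default=np.nan)
print(result)   # [ 2.  2. -1.  1.]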
When I run the following script I get the following error message: TypeError: 'method' object is not subscriptable.
I basically have a dataframe called 'abc' and a dictionary 'abcquintile'.
abcquintile acts as the base for segmenting the values in my dataframe.
My dictionary looks like this:
       a    b    c
0.2  100   10   20
0.4  250   20   50
0.6  350   40   80
0.8  600  120  200
abcquintile = abc[['a', 'b', 'c']].quantile(q=[0.20, 0.40, 0.60, 0.80])
abcquintile = abc.to_dict
# Arguments (x = value)
def ascore(x):
    if x <= abcquintile['a'][0.20]:
        return 5
    elif x <= abcquintile['a'][0.40]:
        return 4
    elif x <= abcquintile['a'][0.60]:
        return 3
    elif x <= abcquintile['a'][0.80]:
        return 2
    else:
        return 1

# Arguments (x = value, p = b or c)
def bcscore(x, p):
    if x <= abcquintile[p][0.20]:
        return 1
    elif x <= abcquintile[p][0.40]:
        return 2
    elif x <= abcquintile[p][0.60]:
        return 3
    elif x <= abcquintile[p][0.80]:
        return 4
    else:
        return 5
abc['a_quintile'] = abc['a'].apply(lambda x: ascore(x))
abc['b_quintile'] = abc['b'].apply(lambda x: bcscore(x, 'b'))
abc['c_quintile'] = abc['c'].apply(lambda x: bcscore(x, 'c'))
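No answer is attached to this question, but the likely cause (my reading, not stated in the post) is the line abcquintile = abc.to_dict: to_dict is referenced without being called, so abcquintile ends up holding the bound method itself, and subscripting a method raises exactly this TypeError. It also overwrites the quantile frame with a conversion of abc. A minimal sketch of the fix:
abcquintile = abc[['a', 'b', 'c']].quantile(q=[0.20, 0.40, 0.60, 0.80])
abcquintile = abcquintile.to_dict()  # call the method - note the parentheses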
I am running into an issue creating a function that will recognize if a particular value in a column is between two values.
def bid(x):
    if df['tla'] < 85000:
        return 1
    elif (df['tla'] >= 85000) & (df['tla'] < 110000):
        return 2
    elif (df['tla'] >= 111000) & (df['tla'] < 126000):
        return 3
    elif (df['tla'] >= 126000) & (df['tla'] < 150000):
        return 4
    elif (df['tla'] >= 150000) & (df['tla'] < 175000):
        return 5
    elif (df['tla'] >= 175000) & (df['tla'] < 200000):
        return 6
    elif (df['tla'] >= 200000) & (df['tla'] < 250000):
        return 7
    elif (df['tla'] >= 250000) & (df['tla'] < 300000):
        return 8
    elif (df['tla'] >= 300000) & (df['tla'] < 375000):
        return 9
    elif (df['tla'] >= 375000) & (df['tla'] < 453100):
        return 10
    elif df['tla'] >= 453100:
        return 11
I apply that to my new column:
df['bid_bucket'] = df['bid_bucket'].apply(bid)
And I am getting this error back:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Anyone have any ideas?
try the following using numpy.select
import numpy as np
values = [1,2,3,4,5,6,7,8,9,10,11]
cond = [df['tla']<85000, (df['tla'] >= 85000) & (df['tla'] < 110000), .... ]
df['bid_bucket'] = np.select(cond, values)
This can already be accomplished with pd.cut, defining the bin edges, and adding +1 to the labels to get your numbering to start at 1.
import pandas as pd
import numpy as np
df = pd.DataFrame({'tla': [7, 85000, 111000, 88888, 126000, 51515151]})
df['bid_bucket'] = pd.cut(df.tla, right=False,
bins=[-np.inf, 85000, 110000, 126000, 150000, 175000,
200000, 250000, 300000, 375000, 453100, np.inf],
labels=False)+1
Output: df
tla bid_bucket
0 7 1
1 85000 2
2 111000 3
3 88888 2
4 126000 4
5 51515151 11
You can simply use the np.digitize function to assign the ranges (with bins defined as in the example below):
df['bid_bucket'] = np.digitize(df['tla'], bins)
Example
a = np.random.randint(85000,400000,10)
#array([305628, 134122, 371486, 119856, 321423, 346906, 319321, 165714,360896, 206404])
bins=[-np.inf, 85000, 110000, 126000, 150000, 175000,
200000, 250000, 300000, 375000, 453100, np.inf]
np.digitize(a,bins)
Out:
array([9, 4, 9, 3, 9, 9, 9, 5, 9, 7])
To keep it in pandas: referencing df['tla'] inside your function refers to the whole Series rather than a single value, which leads to the ambiguity. You should work with the specific value instead. You could use lambda x; then your code could look something like this:
df = pd.DataFrame({'tla':[10,123456,999999]})

def bid(x):
    if x < 85000:
        return 1
    elif (x >= 85000 and x < 110000):
        return 2
    elif (x >= 111000 and x < 126000):
        return 3
    elif (x >= 126000 and x < 150000):
        return 4
    elif (x >= 150000 and x < 175000):
        return 5
    elif (x >= 175000 and x < 200000):
        return 6
    elif (x >= 200000 and x < 250000):
        return 7
    elif (x >= 250000 and x < 300000):
        return 8
    elif (x >= 300000 and x < 375000):
        return 9
    elif (x >= 375000 and x < 453100):
        return 10
    elif x >= 453100:
        return 11

df['bid_bucket'] = df['tla'].apply(lambda x: bid(x))
df
You have two possibilities.
Either apply a function defined on a row of the pandas DataFrame in a row-wise way:
def function_on_a_row(row):
    if row.tla > ...
    ...

df.apply(function_on_a_row, axis=1)
In which case, keep bid the way you defined it, but rename the parameter x to something like "row" and replace df with "row" inside the body so the parameter name stays meaningful, and use:
df.bid_bucket = df.apply(bid, axis=1)
Or apply a function defined on an element to a pandas Series:
def function_on_an_elt(element_of_series):
    if element_of_series > ...
    ...

df.new_column = df.my_column_of_interest.apply(function_on_an_elt)
In your case redefine bid accordingly.
Here you tried to mix both approaches, which does not work.
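A minimal sketch of the two call patterns (my own illustration, with the thresholds trimmed to a single cut-off for brevity):
import pandas as pd

df = pd.DataFrame({'tla': [10, 90000, 500000]})

# 1) row-wise: the function receives a whole row and picks the column itself
def bid_row(row):
    return 1 if row.tla < 85000 else 11

df['bid_bucket'] = df.apply(bid_row, axis=1)

# 2) element-wise: the function receives a single value from the chosen column
def bid_value(x):
    return 1 if x < 85000 else 11

df['bid_bucket'] = df['tla'].apply(bid_value)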
I'm using numpy.random.rand(1) to generate a number between 0 and 1, and the values of a and b change depending on what the number is. How can I make the probabilities of x > .5 and x < .5 proportional to a and b? So if a is 75 and b is 15, then x > .5 should be 5 times more probable than x < .5. I'm unsure what code to use to assign the probabilities. This is what I have:
a = 100
b = 100
while True:
    x = numpy.random.rand(1)
    print x
    if x < .5:
        a = a + 10
        b = b - 10
        print a
        print b
    if x > .5:
        b = b + 10
        a = a - 10
        print a
        print b
    if b1 == 0:
        print a
        break
    if b2 == 0:
        print b
        break
I'd make two calls to random: one to calculate a random number between 0 and 0.5, and a second to determine whether the number should be above 0.5.
For example:
a = 100
b = 100

x = numpy.random.rand(1)/2.0
proportion = a/(1.0*b+a)
if numpy.random.rand(1) > proportion:
    x += 0.5
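A quick check of the proportions this produces (a sketch I added, not part of the original answer) - the fraction of draws that land above 0.5 should come out near 1 - a/(a+b):
import numpy

a, b = 75, 15
n = 100000
proportion = a/(1.0*b+a)

draws = numpy.random.rand(n)/2.0
draws[numpy.random.rand(n) > proportion] += 0.5

print((draws > 0.5).mean())   # roughly 1 - 75/90 = 0.167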
import math

ratio = float(a)/b
x = numpy.random.uniform(0, 0.5 + ratio*0.5)
Now you have numbers distributed uniformly between 0 and 0.5*(1 + ratio). With a uniform distribution, the ratio between the population of numbers greater than 0.5 and the population lower than 0.5 is the desired ratio.
Now we just need to fold the values above 0.5 back into the range between 0.5 and 1.0:
if x >= 0.5:
    x = x - math.trunc(x)   # keep only the fractional part
    if x < 0.5:
        x += 0.5
In this case I would have numpy generate a number between 1 and 100 and then assign based on that, i.e. (pseudocode):
if rand(100) >= 50: a = 0
else: a = 1
Then all you have to do is change the 50 to whatever percentage you want. I can elaborate further if that is confusing.
def get_rand_value(a, b):
    if rand(100) >= a:
        return True
    else:
        return 1

a = 100
b = 100
while True:
    x = get_rand_value(a, b)
    print x
    if x < .5:
        a = a + 10
        b = b - 10
        print a
        print b
    if x > .5:
        b = b + 10
        a = a - 10
        print a
        print b
    if b1 == 0:
        print a
        break
    if b2 == 0:
        print b
        break
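A compact alternative (my own sketch, not taken from the answers above): decide which half of the interval to use with a weighted choice, then draw uniformly inside that half. numpy.random.choice accepts a p argument with the probability of each option.
import numpy as np

def weighted_draw(a, b):
    # x > .5 with probability a/(a+b), x < .5 with probability b/(a+b)
    pa = float(a) / (a + b)
    half = np.random.choice([0.5, 0.0], p=[pa, 1 - pa])
    return half + np.random.rand()/2.0

# with a = 75 and b = 15, x > .5 is 5 times as likely as x < .5
x = weighted_draw(75, 15)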