I am attempting to change the values of two columns in my dataset from specific numeric values (2, 10, 25 etc.) to single values (1, 2, 3 or 4) based on the percentile of the specific value within the dataset.
Using the pandas quantile() function I have got the ranges I wish to replace between, but I haven't figured out a working method to do so.
age1 = datasetNB.Age.quantile(0.25)
age2 = datasetNB.Age.quantile(0.5)
age3 = datasetNB.Age.quantile(0.75)
fare1 = datasetNB.Fare.quantile(0.25)
fare2 = datasetNB.Fare.quantile(0.5)
fare3 = datasetNB.Fare.quantile(0.75)
My current solution attempt for this problem is as follows:
for elem in datasetNB['Age']:
if elem <= age1:
datasetNB[elem].replace(to_replace = elem, value = 1)
print("set to 1")
elif (elem > age1) & (elem <= age2):
datasetNB[elem].replace(to_replace = elem, value = 2)
print("set to 2")
elif (elem > age2) & (elem <= age3):
datasetNB[elem].replace(to_replace = elem, value = 3)
print("set to 3")
elif elem > age3:
datasetNB[elem].replace(to_replace = elem, value = 4)
print("set to 4")
else:
pass
for elem in datasetNB['Fare']:
if elem <= fare1:
datasetNB[elem] = 1
elif (elem > fare1) & (elem <= fare2):
datasetNB[elem] = 2
elif (elem > fare2) & (elem <= fare3):
datasetNB[elem] = 3
elif elem > fare3:
datasetNB[elem] = 4
else:
pass
What should I do to get this working?
pandas already has one function to do that, pandas.qcut.
You can simply do
q_list = [0, 0.25, 0.5, 0.75, 1]
labels = range(1, 5)
df['Age'] = pd.qcut(df['Age'], q_list, labels=labels)
df['Fare'] = pd.qcut(df['Fare'], q_list, labels=labels)
Input
import numpy as np
import pandas as pd
# Generate fake data for the sake of example
df = pd.DataFrame({
'Age': np.random.randint(10, size=6),
'Fare': np.random.randint(10, size=6)
})
>>> df
Age Fare
0 1 6
1 8 2
2 0 0
3 1 9
4 9 6
5 2 2
Output
DataFrame after running the above code
>>> df
Age Fare
0 1 3
1 4 1
2 1 1
3 1 4
4 4 3
5 3 1
Note that in your specific case, since you want quartiles, you can just assign q_list = 4.
Related
I have a data frame that consists of a time-series of integers. I'm trying to group the data frame by year and then for each year count the number of times that the sum of the absolute value of consecutive entries with the same sign is greater than or equal to 5.
>>> import pandas as pd
>>> l = [1, -1, -4, 2, 2, 4, 5, 1, -3, -4]
>>> idx1 = pd.date_range('2019-01-01',periods=5)
>>> idx2 = pd.date_range('2020-01-01',periods=5)
>>> idx = idx1.union(idx2)
>>> df = pd.DataFrame(l, index=idx, columns=['a'])
>>> df
a
2019-01-01 1
2019-01-02 -1
2019-01-03 -4 \\ 2019 count = 1: abs(-1) + abs(-4) >= 5
2019-01-04 2
2019-01-05 2
2020-01-01 4
2020-01-02 5 \\ 2020 count = 1: abs(4) + abs(5) + abs(1) = 10 >=5
2020-01-03 1
2020-01-04 -3
2020-01-05 -4 \\ 2020 count = 2: abs(-3) + abs(-4) = 7 >= 5
The desired output is:
2019 1
2020 2
My approach to solve this problem is to chain groupby and apply. Below are the implementations of the functions I created to pass to groupby and apply respectively.
>>> def get_year(x):
return x.year
>>> def count(group, t=5):
c = 0 # counter
s = 0 # sum of consec vals w same sign
for i in range(1,len(group)):
if np.sign(group['a'].iloc[i-1]) == np.sign(group['a'].iloc[i]):
if s == 0:
s = group['a'].iloc[i-1] + group['a'].iloc[i]
else:
s += group['a'].iloc[i]
if i == (len(group) -1):
return c + 1
elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) >= t):
#if consec streak of vals w same sign is broken and abs(s) >= t then inc c and reset s
c += 1
s = 0
elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) < t):
#if consec streak of vals w same sign is broken and abs(s) < t then reset s
s = 0
return c
>>> by_year = df.groupby(get_year)
>>> by_year.apply(count)
2019 1
2020 2
My question is:
Is there a more "pythonic" implementation of the above count function that produces the desired result but doesn't rely on for loops?
I'm working on an array called numbers which will be created with 4 columns called (x), (y), (z) respectively and the fourth is used in the program.
I want that if the x and y values of two rows coincide, then based on their c, one of them would be deleted from the main array (a "0" z value removes "1", a "1" z value removes "2" and a "2" z value removes "0").
The original array looks like:
[[12 15 2 0]
[65 23 0 0]
[24 66 2 0]
[65 23 1 0]
[24 66 0 0]]
The problem is that when I try to run the following program I do not get the required array at the end. The expected output array would look like:
[[12 15 2 0]
[65 23 0 0]
[24 66 2 0]]
I have given an extract from the program below
import numpy as np
#Array
numbers = np.array([[12,15,2,0],[65,23,0,0],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
#Original Array
print(numbers)
#Lists to store x, y and z values
xs = []
ys = []
zs = []
#Any removed row is added into this list
removed = []
#Code to delete a row
for line1 in numbers:
for line2 in numbers:
if line1[0] == line2[0]:
if line2[1] == line2[1]:
if line1[2] == 1 and line2[2] == 0:
removed.append(line1)
if line1[2] == 0 and line2[2] == 2:
removed.append(line1)
if line1[2] == 2 and line2[2] == 1:
removed.append(line1)
for i in removed:
numbers = np.delete(numbers,i,axis=0)
for line in numbers:
xs.append(line[0])
ys.append(line[1])
zs.append(line[2])
#Update the original Array
for i in removed:
print(removed)
print()
print("x\n", xs)
print("y\n", ys)
print("z\n", zs)
print()
#Updated Array
print(numbers)
Test array
a = lifeforms = np.array([[12,15,2,0],
[13,13,0,0],
[13,13,1,0],
[13,13,2,0],
[65,23,1,0],
[24,66,2,0],
[14,14,1,0],
[14,14,1,1],
[14,14,1,2],
[14,14,2,0],
[15,15,3,2],
[15,15,2,0],
[65,23,0,0],
[24,66,0,0]])
Function that implements color selection.
test_one = np.array([[0,1],[1,0],[1,2],[2,1]])
test_two = np.array([[0,2],[2,0]])
def f(g):
a = g.loc[:,2].unique()
if np.any(np.all(a == test_one, axis=1)):
idx = (g[2] == g[2].min()).idxmax()
elif np.any(np.all(a == test_two, axis=1)):
idx = (g[2] == g[2].max()).idxmax()
else:
raise ValueError('group colors outside bounds')
return idx
Groupby first two columns; iterate over groups; save indices of desired rows; use those indices to select rows from the DataFrame.
df = pd.DataFrame(a)
gb = df.groupby([0,1])
indices = []
for k,g in gb:
if g.loc[:,2].unique().shape[0] > 2:
#print(f'(0,1,2) - dropping indices {g.index}')
continue
if g.shape[0] == 1:
indices.extend(g.index.to_list())
#print(f'unique - keeping index {g.index.values}')
continue
#print(g.loc[:,2])
try:
idx = f(g)
except ValueError as e:
print(sep)
print(e)
print(g)
print(sep)
continue
#print(f'keeping index {idx}')
indices.append(idx)
#print(sep)
print(df.loc[indices,:])
If you can use pandas, you can do the following:
x = np.array([[12,15,2,0],[65,23,0,1],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
df = pd.DataFrame(x)
new_df = df.iloc[df.loc[:,(0,1)].drop_duplicates().index]
print(new_df)
0 1 2 3
0 12 15 2 0
1 65 23 0 1
2 24 66 2 0
What it does is the following:
transform the array to pandas data-frame
df.loc[:,(0,1)].drop_duplicates().index will return the indices of the rows you wish to keep (based on the first and second columns)
df.iloc will return the sliced data-frame.
Edit based on OP questions in the comments and #wwii remarks:
you can return to numpy array using .to_numpy(), so just do arr = new_df.to_numpy()
You can try the following:
xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2].idxmin()])
df_new.reset_index(drop=True, inplace=True)
0 1 2 3
0 12 15 2 0
1 24 66 0 0
2 65 23 0 0
When there is a special heuristic to consider one can do the following:
import pandas as pd
import numpy as np
def f_(x):
vals = x[2].tolist()
if len(vals)==2:
# print(vals)
if vals[0] == 0 and vals[1] == 1:
return vals[0]
elif vals[0] == 1 and vals[1] == 0:
return vals[1]
elif vals[0] == 1 and vals[1] == 2:
return vals[0]
elif vals[0] == 2 and vals[1] == 0:
return vals[0]
elif len(vals) > 2:
return -1
else:
return x[2]
xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2] == f_(x)])
df_new.reset_index(drop=True, inplace=True)
print(df_new)
0 1 2 3
0 12 15 2 0
1 24 66 2 0
2 65 23 0 0
I would like to create following dataframe:
df = pd.DataFrame({
'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0'],
'Step_ID': ['Step_1','Step_1','Step_1','Step_2','Step_2','Step_2','Step_2','Step_3','Step_3','Step_4','Step_4','Step_5','Step_5','Step_5','Step_6','Step_6','Step_7']})
print (df)
What I have is the column A and according to these values I would like to set the values in the column Step_ID.
Step_ID - it begins from Step_1. Then if the number is bigger then Step_2 (for all the number that are bigger than 0, till the zero values will be reached). Then to zero values should be Step_3 assigned and so on.
# add a Step ID
df = pd.DataFrame({
'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
step = 0
value = None
def get_step(x):
global step
global value
if x != value:
value = x
step += 1
return f'Step_{step}'
df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('test.csv' , index=None)
The code above does something similar, but only with unique numbers. Should be there one more "if" - if value > 0 in order to perform desired functionality?
I can see you implemented XOR gate but we need some customisation, I have added a new function to check.
import pandas as pd
df = pd.DataFrame({
'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
step = 0
value = None
def check(x, y):
try:
x = float(x)
y = float(y)
if x== 0 and y == 0:
return 0
elif x == 0 and y > 0:
return 1
elif x > 0 and y == 0:
return 1
else:
return 0
except:
return 1
def get_step(x):
global step
global value
# if x != value:
if check(x, value):
step += 1
value = x
return f'Step_{step}'
df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('GSH0211.csv' , index=None)
Try this. You can adjust the threshold to the value you want.
df = pd.DataFrame({'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
df['A'] = df['A'].astype(float)
diff = df['A']-df['A'].shift().fillna(0)
threshold = 0.1
df['Step_ID'] = (abs(diff)>threshold).cumsum().add(1)
df['Step_ID'] = 'Step_' + df['Step_ID'].astype(str)
df
A Step_ID
0 0.000000 Step_1
1 0.000000 Step_1
2 0.000000 Step_1
3 8.020833 Step_2
4 8.009259 Step_2
5 8.003472 Step_2
6 8.020833 Step_2
7 0.000000 Step_3
8 0.000000 Step_3
9 5.000000 Step_4
10 4.994213 Step_4
11 0.000000 Step_5
12 0.000000 Step_5
13 0.000000 Step_5
14 8.012153 Step_6
15 8.009259 Step_6
16 0.000000 Step_7
I am trying to iterate over a pandas dataframe and update the value if condition is met but i am getting an error.
for line, row in enumerate(df.itertuples(), 1):
if row.Qty:
if row.Qty == 1 and row.Price == 10:
row.Buy = 1
AttributeError: can't set attribute
First iterating in pandas is possible, but very slow, so another vectorized solution are used.
I think you can use iterrows if you need iterating:
for idx, row in df.iterrows():
if df.loc[idx,'Qty'] == 1 and df.loc[idx,'Price'] == 10:
df.loc[idx,'Buy'] = 1
But better is to use vectorized solutions – set value by boolean mask with loc:
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df.loc[mask, 'Buy'] = 1
Or solution with mask:
df['Buy'] = df['Buy'].mask(mask, 1)
Or if you need if...else use numpy.where:
df['Buy'] = np.where(mask, 1, 0)
Samples.
Set values by conditions:
df = pd.DataFrame({'Buy': [100, 200, 50],
'Qty': [5, 1, 1],
'Name': ['apple', 'pear', 'banana'],
'Price': [1, 10, 10]})
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 200 pear 10 1
2 50 banana 10 1
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df['Buy'] = df['Buy'].mask(mask, 1)
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
df['Buy'] = np.where(mask, 1, 0)
print (df)
Buy Name Price Qty
0 0 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
Ok, if you intend to set values in df then you need track the index values.
option 1
using itertuples
# keep in mind `row` is a named tuple and cannot be edited
for line, row in enumerate(df.itertuples(), 1): # you don't need enumerate here, but doesn't hurt.
if row.Qty:
if row.Qty == 1 and row.Price == 10:
df.set_value(row.Index, 'Buy', 1)
option 2
using iterrows
# keep in mind that `row` is a `pd.Series` and can be edited...
# ... but it is just a copy and won't reflect in `df`
for idx, row in df.iterrows():
if row.Qty:
if row.Qty == 1 and row.Price == 10:
df.set_value(idx, 'Buy', 1)
option 3
using straight up loop with get_value
for idx in df.index:
q = df.get_value(idx, 'Qty')
if q:
p = df.get_value(idx, 'Price')
if q == 1 and p == 10:
df.set_value(idx, 'Buy', 1)
pandas.DataFrame.set_value method is deprecated as of 0.21.0 pd.DataFrame.set_value
Use pandas.Dataframe.at
for index, row in df.iterrows():
if row.Qty and row.Qty == 1 and row.Price == 10:
df.at[index,'Buy'] = 1
For example I would like to get letters indicating a row where period of at least two consecutive drops in other column begins.
Exemplary data:
a b
0 3 a
1 2 b
2 3 c
3 2 d
4 1 e
5 0 f
6 -1 g
7 3 h
8 1 i
9 0 j
Exemplary solution with simple loop:
import pandas as pd
df = pd.DataFrame({'a': [3,2,3,2,1,0,-1,3,1,0], 'b': list('abcdefghij')})
less = 0
l = []
prev_prev_row = df.iloc[0]
prev_row = df.iloc[1]
if prev_row['a'] < prev_prev_row['a']: less = 1
for i, row in df.iloc[2:len(df)].iterrows():
if row['a'] < prev_row['a']:
less = less + 1
else:
less = 0
if less == 2:
l.append(prev_prev_row['b'])
prev_prev_row = prev_row
prev_row = row
This gives list l:
['c', 'h']
Here's one approach with some help from NumPy and Scipy -
from scipy.ndimage.morphology import binary_closing
arr = df.a.values
mask1 = np.hstack((False,arr[1:] < arr[:-1],False))
mask2 = mask1 & (~binary_closing(~mask1,[1,1]))
final_mask = mask2[1:] > mask2[:-1]
out = list(df.b[final_mask])
use rolling(2) in reverse
s = df.a[::-1].diff().gt(0).rolling(2).sum().eq(2)
df.b.loc[s & (s != s.shift(-1))]
2 c
7 h
Name: b, dtype: object
if you actually wanted a list
df.b.loc[s & (s != s.shift(-1))].tolist()
['c', 'h']