Deleting a row from an array - python

I'm working on an array called numbers with four columns: the first three hold values called x, y and z respectively, and the fourth is used elsewhere in the program.
I want that if the x and y values of two rows coincide, then based on their z values one of the two rows is deleted from the main array (a z value of 0 removes a 1, a 1 removes a 2, and a 2 removes a 0).
The original array looks like:
[[12 15 2 0]
 [65 23 0 0]
 [24 66 2 0]
 [65 23 1 0]
 [24 66 0 0]]
The problem is that when I try to run the following program I do not get the required array at the end. The expected output array would look like:
[[12 15 2 0]
 [65 23 0 0]
 [24 66 2 0]]
I have given an extract from the program below
import numpy as np

# Array
numbers = np.array([[12,15,2,0],[65,23,0,0],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
# Original array
print(numbers)
# Lists to store x, y and z values
xs = []
ys = []
zs = []
# Any removed row is added into this list
removed = []
# Code to delete a row
for line1 in numbers:
    for line2 in numbers:
        if line1[0] == line2[0]:
            if line2[1] == line2[1]:
                if line1[2] == 1 and line2[2] == 0:
                    removed.append(line1)
                if line1[2] == 0 and line2[2] == 2:
                    removed.append(line1)
                if line1[2] == 2 and line2[2] == 1:
                    removed.append(line1)
for i in removed:
    numbers = np.delete(numbers, i, axis=0)
for line in numbers:
    xs.append(line[0])
    ys.append(line[1])
    zs.append(line[2])
# Update the original array
for i in removed:
    print(removed)
print()
print("x\n", xs)
print("y\n", ys)
print("z\n", zs)
print()
# Updated array
print(numbers)

Test array:
import numpy as np
import pandas as pd

a = lifeforms = np.array([[12,15,2,0],
                          [13,13,0,0],
                          [13,13,1,0],
                          [13,13,2,0],
                          [65,23,1,0],
                          [24,66,2,0],
                          [14,14,1,0],
                          [14,14,1,1],
                          [14,14,1,2],
                          [14,14,2,0],
                          [15,15,3,2],
                          [15,15,2,0],
                          [65,23,0,0],
                          [24,66,0,0]])
Function that implements the color selection:
sep = '-' * 20  # separator printed around problem groups
test_one = np.array([[0,1],[1,0],[1,2],[2,1]])  # z-pairs where the minimum z survives
test_two = np.array([[0,2],[2,0]])              # z-pairs where the maximum z survives
def f(g):
    a = g.loc[:,2].unique()
    if np.any(np.all(a == test_one, axis=1)):
        idx = (g[2] == g[2].min()).idxmax()
    elif np.any(np.all(a == test_two, axis=1)):
        idx = (g[2] == g[2].max()).idxmax()
    else:
        raise ValueError('group colors outside bounds')
    return idx
Group by the first two columns; iterate over the groups; save the indices of the desired rows; then use those indices to select rows from the DataFrame:
df = pd.DataFrame(a)
gb = df.groupby([0,1])
indices = []
for k,g in gb:
    if g.loc[:,2].unique().shape[0] > 2:
        #print(f'(0,1,2) - dropping indices {g.index}')
        continue
    if g.shape[0] == 1:
        indices.extend(g.index.to_list())
        #print(f'unique - keeping index {g.index.values}')
        continue
    #print(g.loc[:,2])
    try:
        idx = f(g)
    except ValueError as e:
        print(sep)
        print(e)
        print(g)
        print(sep)
        continue
    #print(f'keeping index {idx}')
    indices.append(idx)
    #print(sep)
print(df.loc[indices,:])

If you can use pandas, you can do the following:
import numpy as np
import pandas as pd

x = np.array([[12,15,2,0],[65,23,0,1],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
df = pd.DataFrame(x)
new_df = df.iloc[df.loc[:,(0,1)].drop_duplicates().index]
print(new_df)
    0   1  2  3
0  12  15  2  0
1  65  23  0  1
2  24  66  2  0
What it does is the following:
transform the array into a pandas DataFrame
df.loc[:,(0,1)].drop_duplicates().index returns the indices of the rows you wish to keep (based on the first and second columns)
df.iloc returns the sliced DataFrame
Edit based on OP questions in the comments and #wwii remarks:
you can return to numpy array using .to_numpy(), so just do arr = new_df.to_numpy()
You can try the following:
xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2].idxmin()])
df_new.reset_index(drop=True, inplace=True)
print(df_new)
    0   1  2  3
0  12  15  2  0
1  24  66  0  0
2  65  23  0  0
When there is a special heuristic to consider one can do the following:
import pandas as pd
import numpy as np
def f_(x):
    vals = x[2].tolist()
    if len(vals) == 2:
        # print(vals)
        if vals[0] == 0 and vals[1] == 1:
            return vals[0]
        elif vals[0] == 1 and vals[1] == 0:
            return vals[1]
        elif vals[0] == 1 and vals[1] == 2:
            return vals[0]
        elif vals[0] == 2 and vals[1] == 0:
            return vals[0]
    elif len(vals) > 2:
        return -1
    else:
        return x[2]

xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2] == f_(x)])
df_new.reset_index(drop=True, inplace=True)
print(df_new)
    0   1  2  3
0  12  15  2  0
1  24  66  2  0
2  65  23  0  0
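Note that f_ only covers the z-orderings that occur in xx; a pair arriving as (2,1) or (0,2) would fall through and drop both rows. Below is a more general sketch of the same cyclic rule (0 removes 1, 1 removes 2, 2 removes 0) via a lookup of surviving values - my own illustration, not part of the original answer, and it assumes at most two distinct z values per (x, y) pair:
import numpy as np
import pandas as pd

# z value that survives for each unordered pair, per the rule above
winner = {frozenset((0, 1)): 0, frozenset((1, 2)): 1, frozenset((0, 2)): 2}

xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)

def keep(g):
    vals = set(g[2])
    if len(vals) == 1:
        return g                               # nothing to resolve
    return g[g[2] == winner[frozenset(vals)]]  # keep the surviving z

df_new = df.groupby([0,1], group_keys=False).apply(keep).reset_index(drop=True)
print(df_new)  # same three rows as the output above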


How can I replace values in a CSV column from a range?

I am attempting to change the values of two columns in my dataset from specific numeric values (2, 10, 25 etc.) to single values (1, 2, 3 or 4) based on the percentile of the specific value within the dataset.
Using the pandas quantile() function I have got the ranges I wish to replace between, but I haven't figured out a working method to do so.
age1 = datasetNB.Age.quantile(0.25)
age2 = datasetNB.Age.quantile(0.5)
age3 = datasetNB.Age.quantile(0.75)
fare1 = datasetNB.Fare.quantile(0.25)
fare2 = datasetNB.Fare.quantile(0.5)
fare3 = datasetNB.Fare.quantile(0.75)
My current solution attempt for this problem is as follows:
for elem in datasetNB['Age']:
    if elem <= age1:
        datasetNB[elem].replace(to_replace = elem, value = 1)
        print("set to 1")
    elif (elem > age1) & (elem <= age2):
        datasetNB[elem].replace(to_replace = elem, value = 2)
        print("set to 2")
    elif (elem > age2) & (elem <= age3):
        datasetNB[elem].replace(to_replace = elem, value = 3)
        print("set to 3")
    elif elem > age3:
        datasetNB[elem].replace(to_replace = elem, value = 4)
        print("set to 4")
    else:
        pass
for elem in datasetNB['Fare']:
    if elem <= fare1:
        datasetNB[elem] = 1
    elif (elem > fare1) & (elem <= fare2):
        datasetNB[elem] = 2
    elif (elem > fare2) & (elem <= fare3):
        datasetNB[elem] = 3
    elif elem > fare3:
        datasetNB[elem] = 4
    else:
        pass
What should I do to get this working?
pandas already has one function to do that, pandas.qcut.
You can simply do
q_list = [0, 0.25, 0.5, 0.75, 1]
labels = range(1, 5)
df['Age'] = pd.qcut(df['Age'], q_list, labels=labels)
df['Fare'] = pd.qcut(df['Fare'], q_list, labels=labels)
Input
import numpy as np
import pandas as pd

# Generate fake data for the sake of example
df = pd.DataFrame({
    'Age': np.random.randint(10, size=6),
    'Fare': np.random.randint(10, size=6)
})
>>> df
   Age  Fare
0    1     6
1    8     2
2    0     0
3    1     9
4    9     6
5    2     2
Output
DataFrame after running the above code:
>>> df
  Age Fare
0   1    3
1   4    1
2   1    1
3   1    4
4   4    3
5   3    1
Note that in your specific case, since you want quartiles, you can just assign q_list = 4.
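For instance, a minimal sketch of that shortcut, reusing the Age values from the example input above:
import pandas as pd

df = pd.DataFrame({'Age': [1, 8, 0, 1, 9, 2]})
# An integer q asks qcut for that many equal-sized quantile bins (quartiles here),
# so this is equivalent to passing the explicit q_list of bin edges
df['Age'] = pd.qcut(df['Age'], 4, labels=range(1, 5))
print(df)  # Age becomes 1, 4, 1, 1, 4, 3, matching the output above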

Assigning values to numpy cells in one go

Suppose I have a 2D numpy array of zeros. I want to assign 1 to multiple cells. How do I do this?
So for example:
arr = np.zeros((5,3))
idx = [0,1,2,2,0]
Here, idx is the column indices of the cells I want changed.
So my desired output is:
1 0 0
0 1 0
0 0 1
0 0 1
1 0 0
Try advanced indexing:
arr[np.arange(len(arr)), idx] = 1
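Put together as a runnable sketch with the arrays from the question:
import numpy as np

arr = np.zeros((5, 3))
idx = [0, 1, 2, 2, 0]
# Pair row numbers 0..4 with the target column of each row and set all five cells at once
arr[np.arange(len(arr)), idx] = 1
print(arr)  # rows 0..4 get a 1 in columns 0, 1, 2, 2, 0 respectively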
current = 0
for row in arr:
    col = idx[current]
    row[col] = 1
    current += 1
import numpy as np
from random import randint

arr = np.zeros((5,3))
idx = [0,1,2,2,0]
# Validate idx elements here
# Distribute 1s to columns randomly
for x in idx:
    while True:
        i = randint(0, arr.shape[0] - 1)
        if arr[i][x] == 1:
            continue  # Try again
        else:
            arr[i][x] = 1
            break
print(arr)

Pandas compare next row and merge based on conditions

I have the dataframe below, where Start + Time = End.
I want to check if the End of the current row equals the Start of the next row, and if so merge those two rows, provided the ID is the same.
The expected output is shown under "Output DF" below.
Sample DF:
   Start  Time    End  ID
0  43500    60  43560  23
1  43560    60  43620  23
2  43620  1020  44640  24
3  44640   260  44900  24
4  44900  2100  47000  24
Code:
a = df["ID"].tolist()
arr = []
t = True
for i in sorted(list(set(a))):
    j = 1
    k = 0
    temp = {}
    tempdf = df[df["ID"] == i]
    temp["Start"] = tempdf.iloc[k]["Start"]
    temp["Time"] = tempdf.iloc[k]["Time"]
    temp["End"] = tempdf.iloc[k]["End"]
    temp["ID"] = tempdf.iloc[k]["ID"]
    while j < len(tempdf):
        if temp["End"] == tempdf.iloc[j]["Start"]:
            temp["End"] = tempdf.iloc[j]["End"]
            temp["Time"] += tempdf.iloc[j]["Time"]
        j += 1
    arr.append(temp)
df = pd.DataFrame(arr)
Output DF:
   Start  Time    End  ID
0  43500   120  43620  23
1  43620  3380  47000  24
I'm not sure how your data is formatted exactly, but if it is a plain list of rows [Start, Time, End, ID] you can merge neighbours in place along these lines:
i = 0
while i < len(data) - 1:
    # merge the next row into this one when End meets the next Start and the IDs match
    if data[i][2] == data[i+1][0] and data[i][3] == data[i+1][3]:
        data[i][2] = data[i+1][2]    # take the later End
        data[i][1] += data[i+1][1]   # accumulate Time
        data.pop(i + 1)
    else:
        i += 1
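Alternatively, a vectorized pandas sketch of the same merge (my own addition; it assumes the frame is sorted by ID and Start, as in the sample):
import pandas as pd

df = pd.DataFrame({'Start': [43500, 43560, 43620, 44640, 44900],
                   'Time':  [60, 60, 1020, 260, 2100],
                   'End':   [43560, 43620, 44640, 44900, 47000],
                   'ID':    [23, 23, 24, 24, 24]})
# A new block starts wherever Start does not continue the previous End, or the ID changes
new_block = (df['Start'] != df['End'].shift()) | (df['ID'] != df['ID'].shift())
out = (df.groupby(new_block.cumsum())
         .agg(Start=('Start', 'first'), Time=('Time', 'sum'),
              End=('End', 'last'), ID=('ID', 'first'))
         .reset_index(drop=True))
print(out)  # two rows, matching the Output DF above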

add column values according to value with if

I would like to create following dataframe:
df = pd.DataFrame({
'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0'],
'Step_ID': ['Step_1','Step_1','Step_1','Step_2','Step_2','Step_2','Step_2','Step_3','Step_3','Step_4','Step_4','Step_5','Step_5','Step_5','Step_6','Step_6','Step_7']})
print (df)
What I have is column A, and based on its values I would like to set the values in the Step_ID column.
Step_ID begins with Step_1 (for the leading zeros). When the numbers become greater than 0, Step_2 is assigned, and it holds until zero values are reached again; those zeros get Step_3, and so on.
# add a Step ID
df = pd.DataFrame({
    'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
step = 0
value = None
def get_step(x):
    global step
    global value
    if x != value:
        value = x
        step += 1
    return f'Step_{step}'
df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('test.csv', index=None)
The code above does something similar, but it starts a new step for every distinct number. Should there be one more "if" - something like if value > 0 - in order to get the desired behaviour?
What you need is essentially an XOR gate on "is zero" for consecutive values, but it requires some customisation, so I have added a new check function:
import pandas as pd

df = pd.DataFrame({
    'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
step = 0
value = None
def check(x, y):
    # return 1 when exactly one of x, y is zero (i.e. the zero/non-zero state flips)
    try:
        x = float(x)
        y = float(y)
        if x == 0 and y == 0:
            return 0
        elif x == 0 and y > 0:
            return 1
        elif x > 0 and y == 0:
            return 1
        else:
            return 0
    except:
        return 1
def get_step(x):
    global step
    global value
    # if x != value:
    if check(x, value):
        step += 1
    value = x
    return f'Step_{step}'
df['Step_ID'] = df['A'].apply(get_step)
df.to_csv('GSH0211.csv', index=None)
Try this. You can adjust the threshold to the value you want.
df = pd.DataFrame({'A': ['0','0','0','8.020833015','8.009259224','8.003472328','8.020833015','0','0','5','4.994213104','0','0','0','8.012152672','8.009259224','0']})
df['A'] = df['A'].astype(float)
diff = df['A']-df['A'].shift().fillna(0)
threshold = 0.1
df['Step_ID'] = (abs(diff)>threshold).cumsum().add(1)
df['Step_ID'] = 'Step_' + df['Step_ID'].astype(str)
df
           A Step_ID
0   0.000000  Step_1
1   0.000000  Step_1
2   0.000000  Step_1
3   8.020833  Step_2
4   8.009259  Step_2
5   8.003472  Step_2
6   8.020833  Step_2
7   0.000000  Step_3
8   0.000000  Step_3
9   5.000000  Step_4
10  4.994213  Step_4
11  0.000000  Step_5
12  0.000000  Step_5
13  0.000000  Step_5
14  8.012153  Step_6
15  8.009259  Step_6
16  0.000000  Step_7

Pandas DataFrame groupby based on condition

The most similar question I found was here but with no proper answer.
Basically I have an issue where I'm trying to use groupby on a dataframe to generate unique IDs for bus routes. The problem is, the data I have at my disposal sometimes (though rarely) has the same values for my groupby columns, so they're considered the same bus even though they aren't.
The only other way I can think of is to group buses based on another column called "Type of stop", where there is an indicator for Start, Middle and End. I'd like to use groupby to create groups based on this column where each group starts where "type of stop" = Start, and ends where "type of stop" = End.
Consider the following data:
df = pd.DataFrame({'Vehicle_ID': ['A']*18,
                   'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})
   Vehicle_ID Position
0           A    START
1           A      MID
2           A      MID
3           A      END
4           A      MID
5           A    START
6           A    START
7           A      MID
8           A      MID
9           A      END
10          A      MID
11          A    START
12          A    START
13          A      MID
14          A      MID
15          A      END
16          A      MID
17          A    START
The only way I came up with to accurately group these buses together is to generate an additional column with the bus sequence id, but given that I'm working with lots of data, this isn't a very efficient solution. I'm hoping to be able to manage what I want to do with a single groupby, if possible, in order to generate the following output
   Vehicle_ID Position  Group
0           A    START      1
1           A      MID      1
2           A      MID      1
3           A      END      1
4           A      MID
5           A    START      2
6           A    START      2
7           A      MID      2
8           A      MID      2
9           A      END      2
10          A      MID
11          A    START      3
12          A    START      3
13          A      MID      3
14          A      MID      3
15          A      END      3
16          A      MID
17          A    START      4
One idea is to factorize via np.select, then use a custom loop via numba:
import numpy as np
import pandas as pd
from numba import njit

df = pd.DataFrame({'Vehicle_ID': ['A']*18,
                   'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})

@njit
def grouper(pos):
    res = np.empty(pos.shape)
    num = 1
    started = 0
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            res[i] = num
        else:
            res[i] = 0
    return res

arr = np.select([df['Position'].eq('START'), df['Position'].eq('END')], [0, 1], -1)
df['Group'] = grouper(arr).astype(int)
Result:
print(df)
   Position Vehicle_ID  Group
0     START          A      1
1       MID          A      1
2       MID          A      1
3       END          A      1
4       MID          A      0
5     START          A      2
6     START          A      2
7       MID          A      2
8       MID          A      2
9       END          A      2
10      MID          A      0
11    START          A      3
12    START          A      3
13      MID          A      3
14      MID          A      3
15      END          A      3
16      MID          A      0
17    START          A      4
In my opinion, you should not include "blank" values as this would force your series to be object dtype, inefficient for any subsequent processing. As above, you can use 0 instead.
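A quick illustration of that dtype point (my own example):
import pandas as pd

# A blank entry forces object dtype; an integer placeholder keeps the column numeric
print(pd.Series([1, 2, '', 4]).dtype)  # object
print(pd.Series([1, 2, 0, 4]).dtype)   # int64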
Performance benchmarking
numba is around ~10x faster than the pure pandas approach:
import pandas as pd, numpy as np
from numba import njit
df = pd.DataFrame({'Vehicle_ID': ['A']*18,
                   'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})
df = pd.concat([df]*10, ignore_index=True)

assert joz(df.copy()).equals(jpp(df.copy()))

%timeit joz(df.copy())  # 18.6 ms per loop
%timeit jpp(df.copy())  # 1.95 ms per loop
Benchmarking functions:
def joz(df):
    # identification of sequences
    df['Position_Prev'] = df['Position'].shift(1)
    df['Sequence'] = 0
    df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
    df.loc[df['Position'] == 'END', 'Sequence'] = -1
    df['Sequence_Sum'] = df['Sequence'].cumsum()
    df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1
    # take only items between START and END and generate Group number
    df2 = df[df['Sequence_Sum'] == 1].copy()
    df2.loc[df['Sequence'] == -1, 'Sequence'] = 0
    df2['Group'] = df2['Sequence'].cumsum()
    # merge results to one dataframe
    df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
    df['Group'] = df['Group'].fillna(0)
    df['Group'] = df['Group'].astype(int)
    df.drop(['Position_Prev', 'Sequence', 'Sequence_Sum'], axis=1, inplace=True)
    return df

@njit
def grouper(pos):
    res = np.empty(pos.shape)
    num = 1
    started = 0
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            res[i] = num
        else:
            res[i] = 0
    return res

def jpp(df):
    arr = np.select([df['Position'].eq('START'), df['Position'].eq('END')], [0, 1], -1)
    df['Group'] = grouper(arr).astype(int)
    return df
I have a solution. You should avoid loops and instead try shifting, slicing and merging.
This is my first prototype (it should be refactored):
# identification of sequences
df['Position_Prev'] = df['Position'].shift(1)
df['Sequence'] = 0
df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
df.loc[df['Position'] == 'END', 'Sequence'] = -1
df['Sequence_Sum'] = df['Sequence'].cumsum()
df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1
# take only items between START and END and generate Group number
df2 = df[df['Sequence_Sum'] == 1].copy()
df2.loc[df['Sequence'] == -1, 'Sequence'] = 0
df2['Group'] = df2['Sequence'].cumsum()
# merge results to one dataframe
df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
df['Group'] = df['Group'].fillna(0)
df['Group'] = df['Group'].astype(int)
df.drop(columns=['Position_Prev', 'Sequence', 'Sequence_Sum'], inplace=True)
df
Result:
   Vehicle_ID Position  Group
0           A    START      1
1           A      MID      1
2           A      MID      1
3           A      END      1
4           A      MID      0
5           A    START      2
6           A    START      2
7           A      MID      2
8           A      MID      2
9           A      END      2
10          A      MID      0
11          A    START      3
12          A    START      3
13          A      MID      3
14          A      MID      3
15          A      END      3
16          A      MID      0
17          A    START      4
