Condionally and Randomly update Pandas values? - python

I want to build a scheduling app in python using pandas.
The following DataFrame is initialised where 0 denotes if a person is busy and 1 if a person is available.
import pandas as pd
df = pd.DataFrame({'01.01.': [1,1,0], '02.01.': [0,1,1], '03.01.': [1,0,1]}, index=['Person A', 'Person B', 'Person C'])
>>> df
01.01. 02.01. 03.01.
Person A 1 0 1
Person B 1 1 0
Person C 0 1 1
I now want to randomly schedule n number of people per day if they are available. In other words, for every day, if people are available (1), randomly set n number of people to scheduled (2).
I tried something as follows:
# Required number of people across time / columns
required_number = [0, 1, 2]
# Iterate through time / columns
for col in range(len(df.columns)):
# Current number of scheduled people
current_number = (df.iloc[:, [col]].values==2).sum()
# Iterate through indices / rows / people
for ind in range(len(df.index)):
# Check if they are available (1) and
# if the required number of people has not been met yet
if (df.iloc[ind, col]==1 and
current_number<required_number[col]):
# Change "free" / 1 person to "scheduled" / 2
df.iloc[ind, col] = 2
# Increment scheduled people by one
current_number += 1
>>> df
01.01. 02.01. 03.01.
Person A 1 0 2
Person B 1 2 0
Person C 0 1 2
This works as intended but – because I'm simply looping, I have no way of adding randomness (ie. that Person A / B / C) are randomly selected so long as they are available. Is there a way of directly doing so in pandas?
Thanks. BBQuercus

You can randomly choose proper indices in a series and then change values corresponding to the chosen indices:
for i in range(len(df.columns)):
if sum(df.iloc[:,i] == 1) >= required_number[i]:
column = df.iloc[:,i].reset_index(drop=True)
#We are going to store indices in a list
a = [j for j in column.index if column[j] == 1]
random_indexes = np.random.choice(a, required_number[i], replace = False)
df.iloc[:,i] = [column[j] if j not in random_indexes else 2 for j in column.index]
Now df is the wanted result.

Related

How to select list elements based on crteria from other lists

I am new to Python, coming from SciLab (an open source MatLab ersatz), which I am using as a toolbox for my analyses (test data analysis, reliability, acoustics, ...); I am definitely not a computer science lad.
I have data in the form of lists of same length (vectors of same size in SciLab).
I use some of them as parameter in order to select data from another one; e.g.
t_v = [1:10]; // a parameter vector
p_v = [20:29]; another parameter vector
res_v(t_v > 5 & p_v < 28); // are the res_v vector elements of which "corresponding" p_v and t_v values comply with my criteria; i can use it for analyses.
This is very direct and simple in SciLab; I did not find the way to achieve the same with Python, either "Pythonically" or simply translated.
Any idea that could help me, please?
Have a nice day,
Patrick.
You could use numpy arrays. It's easy:
import numpy as np
par1 = np.array([1,1,5,5,5,1,1])
par2 = np.array([-1,1,1,-1,1,1,1])
data = np.array([1,2,3,4,5,6,7])
print(par1)
print(par2)
print(data)
bool_filter = (par1[:]>1) & (par2[:]<0)
# example to do it directly in the array
filtered_data = data[ par1[:]>1 ]
print( filtered_data )
#filtering with the two parameters
filtered_data_twice = data[ bool_filter==True ]
print( filtered_data_twice )
output:
[1 1 5 5 5 1 1]
[-1 1 1 -1 1 1 1]
[1 2 3 4 5 6 7]
[3 4 5]
[4]
Note that it does not keep the same number of elements.
Here's my modified solution according to your last comment.
t_v = list(range(1,10))
p_v = list(range(20,29))
res_v = list(range(30,39))
def first_idex_greater_than(search_number, lst):
for count, number in enumerate(lst):
if number > search_number:
return count
def first_idex_lower_than(search_number, lst):
for count, number in enumerate(lst[::-1]):
if number < search_number:
return len(lst) - count # since I searched lst from top to bottom,
# I need to also reverse count
t_v_index = first_idex_greater_than(5, t_v)
p_v_index = first_idex_lower_than(28, p_v)
print(res_v[min(t_v_index, p_v_index):max(t_v_index, p_v_index)])
It returns an array [35, 36, 37].
I'm sure you can optimize it better according to your needs.
The problem statement is not clearly defined, but this is what I interpret to be a likely solution.
import pandas as pd
tv = list(range(1, 11))
pv = list(range(20, 30))
res = list(range(30, 40))
df = pd.DataFrame({'tv': tv, 'pv': pv, 'res': res})
print(df)
def criteria(row, col1, a, col2, b):
if (row[col1] > a) & (row[col2] < b):
return True
else:
return False
df['select'] = df.apply(lambda row: criteria(row, 'tv', 5, 'pv', 28), axis=1)
selected_res = df.loc[df['select']]['res'].tolist()
print(selected_res)
# ... or another way ..
print(df.loc[(df.tv > 5) & (df.pv < 28)]['res'])
This produces a dataframe where each column is the original lists, and applies a selection criteria, based on columns tv and pv to identify the rows in which the criteria, applied dependently to the 2 lists, is satisfied (or not), and then creates a new column of booleans identifying the rows where the criteria is either True or False.
[35, 36, 37]
5 35
6 36
7 37

Pandas for loop with 2 parameters and changing column values

I have 2 columns in a dataframe, one named "day_test" and one named "Temp Column". Some of my values in Temp Column are negative, and I want them to be 1 or 2. I've made a for loop with 2 if statements:
for (i,j) in zip(df['day_test'].astype(int), df['Temp Column'].astype(int)):
if i == 2 and j < 0:
j = 2
if i == 1 and j < 0:
j = 1
I tried printing j so I know the loops are working properly, but the values that I want to change in the dataframe are staying negative.
Thanks
Your code doesn't change the values inside the dataframe, it only changes the j value temporarily.
One way to do it is this:
df['day_test'] = df['day_test'].astype(int)
df['Temp Column'] = df['Temp Column'].astype(int)
df.loc[(df['day_test']==1) & (df['Temp Column']<0),'Temp Column'] = 1
df.loc[(df['day_test']==2) & (df['Temp Column']<0),'Temp Column'] = 2

Add a value in a column as a function of the timestamp and another column

The title may not be very clear, but with an example I hope it would make some sense.
I would like to create an output column (called "outputTics"), and put a 1 in it 0.21 seconds after a 1 appears in the "inputTics" column.
As you see, there is no value 0.21 seconds exactly after another value, so I'll put the 1 in the outputTics column two rows after : an example would be at the index 3, there is a 1 at 11.4 seconds so I'm putting an 1 in the output column at 11.6 seconds
If there is a 1 in the "inputTics" column 0.21 second of earlier, do not put a one in the output column : an example would be at the index 1 in the input column
Here is an example of the red column I would like to create.
Here is the code to create the dataframe :
A = pd.DataFrame({"Timestamp":[11.1,11.2,11.3,11.4,11.5,11.6,11.7,11.8,11.9,12.0,12.1,12.2,12.3,12.4,12.5,12.6,12.7,12.8,12.9,13.0],
"inputTics":[0,1,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1],
"outputTics":[0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0]})
You can use pd.Timedelta if you can to avoid python rounded numbers if you want
Create the column with zeros.
df['outputTics'] = 0
Define a function set_output_tic in the following manner
def set_output_tic(row):
if row['inputTics'] == 0:
return 0
index = df[df == row].dropna().index
# check for a 1 in input within 0.11 seconds
t = row['Timestamp'] + pd.TimeDelta(seconds = 0.11)
indices = df[df.Timestamp <= t].index
c = 0
for i in indices:
if df.loc[i,'inputTics'] == 0:
c = c + 1
else:
c = 0
break
if c > 0:
df.loc[indices[-1] + 1, 'outputTics'] = 1
return 0
then call the above function using df.apply
temp = df.apply(set_output_tic, axis = 1) # temp is practically useless
This was actually kinda tricky, but by playing with indices in numpy you can do it.
# Set timestamp as index for a moment
A = A.set_index(['Timestamp'])
# Find the timestamp indices of inputTics and add your 0.11
input_indices = A[A['inputTics']==1].index + 0.11
# Iterate through the indices and find the indices to update outputTics
output_indices = []
for ii in input_indices:
# Compare indices to full dataframe's timestamps
# and return index of nearest timestamp
oi = np.argmax((A.index - ii)>=0)
output_indices.append(oi)
# Create column of output ticks with 1s in the right place
output_tics = np.zeros(len(A))
output_tics[output_indices] = 1
# Add it to dataframe
A['outputTics'] = outputTics
# Add condition that if inputTics is 1, outputTics is 0
A['outputTics'] = A['outputTics'] - A['inputTics']
# Clean up negative values
A[A['outputTic']<0] = 0
# The first row becomes 1 because of indexing; change to 0
A = A.reset_index()
A.at[0, 'outputTics'] = 0

python Constraints - constraining the amount

I have a constraint problem that I'm trying to solve with python-constraint
So let's say I have 3 locations: loc1,...loc3
Also, I have 7 devices: device1,...device7
Max amount of devices in each location: loc1:3, loc2:4, loc3:2
(for example maximum of 3 devices in loc1 and so on...)
And some constraints about the locations and the devices:
loc1: device1, device3, device7,
loc2: device1, device3, device4, device5, device6, device7
loc3: device2, device4, device5, device6
(meaning for example only device1, device3 and device7 can be in loc1.)
I'm trying to get a set of possible options for devices in locations.
from constraint import *
problem = Problem()
for key in locations_devices_dict:
problem.addVariable(key,locations_devices_dict[key])
# problem.addVariable("loc1", ['device1', 'device3', 'device7'])
problem.addConstraint(AllDifferentConstraint())
and I'm stuck on how to do the constrains. I've tried:
problem.addConstraint(MaxSumConstraint(3), 'loc1')
but it doesn't work, MaxSumConstraint does not sum what I need.
All devices must be placed somewhere
possible solution:
loc1: device1, device3
loc2: device4, device6, device7
loc3: device2, device5
Anyone has an idea?
(another python package/not to use any package, is also good idea if someone has any suggestions...)
This is simple assignment-like model:
So we have a binary variable indicating if device d is assigned to location L. The linear constraints are just:
assign each device to one location
each location has a maximum number of devices
make sure to use only allowed assignments (modeled above by allowed(L,d))
This problem can be handled by any constraint solver.
Enumerating all possible solutions is a bit dangerous. For large instances there are just way too many. Even for this small problem we already have 25 solutions:
For large problems this number will be astronomically large.
Using the Python constraint package this can look like:
from constraint import *
D = 7 # number of devices
L = 3 # number of locations
maxdev = [3,4,2]
allowed = [[1,3,7],[1,3,4,5,6,7],[2,4,5,6]]
problem = Problem()
problem.addVariables(["x_L%d_d%d" %(loc+1,d+1) for loc in range(L) for d in range(D) if d+1 in allowed[loc]],[0,1])
for loc in range(L):
problem.addConstraint(MaxSumConstraint(maxdev[loc]),["x_L%d_d%d" %(loc+1,d+1) for d in range(D) if d+1 in allowed[loc]])
for d in range(D):
problem.addConstraint(ExactSumConstraint(1),["x_L%d_d%d" %(loc+1,d+1) for loc in range(L) if d+1 in allowed[loc]])
S = problem.getSolutions()
n = len(S)
n
For large problems you may want to use dicts to speed things up.
edit: I wrote this answer before I saw #ErwinKalvelagen's code. So I did not check his solution...
So I used #ErwinKalvelagen approach and created a matrix that represented the probelm.
for each (i,j), x[i,j]=1 if device i can go to location j, 0 otherwise.
Then, I used addConstraint(MaxSumConstraint(maxAmount[i]), row) for each row - this is the constraint that represent the maximum devices in each location.
and addConstraint(ExactSumConstraint(1), col) for each column - this is the constraint that each device can be placed only in one location.
next, I took all x[i,j]=0 (device i can not be in location j) and for each t(i,j) addConstraint(lambda var, val=0: var == val, (t,))
This problem is similar to the sudoku problem, and I used this example for help
The matrix for my example above is:
(devices:) 1 2 3 4 5 6 7
loc1: 1 0 1 0 0 0 1
loc2: 1 0 1 1 1 1 1
loc3: 0 1 0 1 1 1 0
My code:
problem = Problem()
rows = range(locations_amount)
cols = range(devices_amount)
matrix = [(row, col) for row in rows for col in cols]
problem.addVariables(matrix, range(0, 2)) #each cell can get 0 or 1
rowSet = [zip([el] * len(cols), cols) for el in rows]
colSet = [zip(rows, [el] * len(rows)) for el in cols]
rowsConstrains = getRowConstrains() # list that has the maximum amount in each location(3,4,2)
#from my example: loc1:3, loc2:4, loc3:2
for i,row in enumerate(rowSet):
problem.addConstraint(MaxSumConstraint(rowsConstrains[i]), row)
for col in colSet:
problem.addConstraint(ExactSumConstraint(1), col)
s = getLocationsSet() # set that has all the tuples that x[i,j] = 1
for i, loc in enumerate(locations_list):
for j, iot in enumerate(devices_list):
t=(i,j)
if t in s:
continue
problem.addConstraint(lambda var, val=0: var == val, (t,)) # the value in these cells must be 0
solver = problem.getSolution()
example for a solution:
(devices:) 1 2 3 4 5 6 7
loc1: 1 0 1 0 0 0 1
loc2: 0 0 0 1 1 1 0
loc3: 0 1 0 0 0 0 0

reordering cluster numbers for correct correspondence

I have a dataset that I clustered using two different clustering algorithms. The results are about the same, but the cluster numbers are permuted.
Now for displaying the color coded labels, I want the label ids to be same for the same clusters.
How can I get correct permutation between the two label ids?
I can do this using brute force, but perhaps there is a better/faster method. I would greatly appreciate any help or pointers. If possible I am looking for a python function.
The most well-known algorithm for finding the optimum matching is the hungarian method.
Because it cannot be explained in a few sentences, I have to refer you to a book of your choice, or Wikipedia article "Hungarian algorithm".
You can probably get good results (even perfect if the difference is indeed tiny) by simply picking the maximum of the correspondence matrix and then removing that row and column.
I have a function that works for me. But it may fail when the two cluster results are very inconsistent, which leads to duplicated max values in the contingency matrix. If your cluster results are about the same, it should work.
Here is my code:
from sklearn.metrics.cluster import contingency_matrix
def align_cluster_index(ref_cluster, map_cluster):
"""
remap cluster index according the the ref_cluster.
both inputs must be nparray and have same number of unique cluster index values.
Xin Niu Jan-15-2020
"""
ref_values = np.unique(ref_cluster)
map_values = np.unique(map_cluster)
print(ref_values)
print(map_values)
num_values = ref_values.shape[0]
if ref_values.shape[0]!=map_values.shape[0]:
print('error: both inputs must have same number of unique cluster index values.')
return()
switched_col = set()
while True:
cont_mat = contingency_matrix(ref_cluster, map_cluster)
print(cont_mat)
# divide contingency_matrix by its row and col sums to avoid potential duplicated values:
col_sum = np.matmul(np.ones((num_values, 1)), np.sum(cont_mat, axis = 0).reshape(1, num_values))
row_sum = np.matmul(np.sum(cont_mat, axis = 1).reshape(num_values, 1), np.ones((1, num_values)))
print(col_sum)
print(row_sum)
cont_mat = cont_mat/(col_sum+row_sum)
print(cont_mat)
# ignore columns that have been switched:
cont_mat[:, list(switched_col)]=-1
print(cont_mat)
sort_0 = np.argsort(cont_mat, axis = 0)
sort_1 = np.argsort(cont_mat, axis = 1)
print('argsort contmat:')
print(sort_0)
print(sort_1)
if np.array_equal(sort_1[:,-1], np.array(range(num_values))):
break
# switch values according to the max value in the contingency matrix:
# get the position of max value:
idx_max = np.unravel_index(np.argmax(cont_mat, axis=None), cont_mat.shape)
print(cont_mat)
print(idx_max)
if (cont_mat[idx_max]>0) and (idx_max[0] not in switched_col):
cluster_tmp = map_cluster.copy()
print('switch', map_values[idx_max[1]], 'and:', ref_values[idx_max[0]])
map_cluster[cluster_tmp==map_values[idx_max[1]]]=ref_values[idx_max[0]]
map_cluster[cluster_tmp==map_values[idx_max[0]]]=ref_values[idx_max[1]]
switched_col.add(idx_max[0])
print(switched_col)
else:
break
print('final argsort contmat:')
print(sort_0)
print(sort_1)
print('final cont_mat:')
cont_mat = contingency_matrix(ref_cluster, map_cluster)
col_sum = np.matmul(np.ones((num_values, 1)), np.sum(cont_mat, axis = 0).reshape(1, num_values))
row_sum = np.matmul(np.sum(cont_mat, axis = 1).reshape(num_values, 1), np.ones((1, num_values)))
cont_mat = cont_mat/(col_sum+row_sum)
print(cont_mat)
return(map_cluster)
And here is some test code:
ref_cluster = np.array([2,2,3,1,0,0,0,1,2,1,2,2,0,3,3,3,3])
map_cluster = np.array([0,0,0,1,1,3,2,3,2,2,0,0,0,2,0,3,3])
c = align_cluster_index(ref_cluster, map_cluster)
print(ref_cluster)
print(c)
>>>[2 2 3 1 0 0 0 1 2 1 2 2 0 3 3 3 3]
>>>[2 2 2 1 1 3 0 3 0 0 2 2 2 0 2 3 3]

Categories