Groupwise sorting in pandas - python

I want to sort an array within the group boundaries defined in another array. The groups are not presorted in any way and need to remain unchanged after the sorting. In numpy terms it would look like this:
import numpy as np
def groupwise_sort(group_idx, a, reverse=False):
    """Sort ``a`` inside each group without moving the groups themselves.

    Every position that belongs to a group in ``group_idx`` still belongs to
    that group afterwards; only the values held at a group's positions are
    rearranged so that they are sorted.

    Parameters
    ----------
    group_idx : array-like
        Group label for every element of ``a``. Does not need to be sorted.
    a : array-like
        Numeric values to sort within each group.
    reverse : bool, optional
        Sort each group in descending order when True.

    Returns
    -------
    numpy.ndarray
        The values of ``a``, sorted group-wise, in the original group layout.
    """
    # Accept plain lists/tuples as well: negation and fancy indexing below
    # need real arrays.
    group_idx = np.asarray(group_idx)
    a = np.asarray(a)
    # lexsort treats the LAST key as the primary one: order by group first,
    # then by value (negated for a descending sort) inside each group.
    sortidx = np.lexsort((-a if reverse else a, group_idx))
    # A stable argsort of group_idx lists the original positions group by
    # group; the argsort of that permutation is its inverse, which scatters
    # the group-sorted values back to the positions their group occupied.
    revidx = np.argsort(np.argsort(group_idx, kind='mergesort'), kind='mergesort')
    return a[sortidx][revidx]
group_idx = np.array([3, 2, 3, 2, 2, 1, 2, 1, 1])
a = np.array([3, 2, 1, 7, 4, 5, 5, 9, 1])
groupwise_sort(group_idx, a)
# >>> array([1, 2, 3, 4, 5, 1, 7, 5, 9])
groupwise_sort(group_idx, a, reverse=True)
# >>> array([3, 7, 1, 5, 4, 9, 2, 5, 1])
How can I do the same with pandas? I saw df.groupby() and df.sort_values(), though I couldn't find a straightforward way to achieve the same sorting — ideally a fast one.

Let us first set the stage:
import pandas as pd
import numpy as np
group_idx = np.array([3, 2, 3, 2, 2, 1, 2, 1, 1])
a = np.array([3, 2, 1, 7, 4, 5, 5, 9, 1])
df = pd.DataFrame({'group': group_idx, 'values': a})
df
# group values
#0 3 3
#1 2 2
#2 3 1
#3 2 7
#4 2 4
#5 1 5
#6 2 5
#7 1 9
#8 1 1
To get a dataframe sorted by group and values (within groups):
df.sort_values(["group", "values"])
# group values
#8 1 1
#5 1 5
#7 1 9
#1 2 2
#4 2 4
#6 2 5
#3 2 7
#2 3 1
#0 3 3
To sort the values in descending order, use ascending = False. To apply different orders to different columns, you can supply a list:
df.sort_values(["group", "values"], ascending = [True, False])
# group values
#7 1 9
#5 1 5
#8 1 1
#3 2 7
#6 2 5
#4 2 4
#1 2 2
#0 3 3
#2 3 1
Here, groups are sorted in ascending order, and the values within each group are sorted in descending order.
To only sort values for contiguous rows belonging to the same group, create a new group indicator:
(I keep this in here for reference since it might be helpful for others. I wrote this in an earlier version before the OP clarified his question in the comments.)
df['new_grp'] = (df.group.diff(1) != 0).astype('int').cumsum()
df
# group values new_grp
#0 3 3 1
#1 2 2 2
#2 3 1 3
#3 2 7 4
#4 2 4 4
#5 1 5 5
#6 2 5 6
#7 1 9 7
#8 1 1 7
We can then easily sort with new_grp instead of group, leaving the original order of groups untouched.
Ordering within groups while keeping the group-specific row positions:
To sort the elements of each group but keep the group-specific positions in the dataframe, we need to keep track of the original row numbers. For instance, the following will do the trick:
# First, create an indicator for the original row-number:
df["ind"] = range(len(df))
# Now, sort the dataframe as before
df_sorted = df.sort_values(["group", "values"])
# sort the original row-numbers within each group
newindex = df.groupby("group").apply(lambda x: x.sort_values(["ind"]))["ind"].values
# assign the sorted row-numbers to the sorted dataframe
df_sorted["ind"] = newindex
# Sort based on the row-numbers:
sorted_asc = df_sorted.sort_values("ind")
# compare the resulting order of values with your desired output:
np.array(sorted_asc["values"])
# array([1, 2, 3, 4, 5, 1, 7, 5, 9])
This is easier to test and profile when written up in a function, so let's do that:
def sort_my_frame(frame, groupcol="group", valcol="values", asc=True):
    """Sort rows within each group while keeping each group's row positions.

    Row positions occupied by a group before the call are occupied by rows of
    the same group afterwards; within those positions the rows are ordered by
    ``valcol``. Whole rows move, so index labels travel with their rows.

    The input ``frame`` is not modified and no helper column is added to it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to sort.
    groupcol : str
        Name of the column holding the group labels.
    valcol : str
        Name of the column to sort by inside each group.
    asc : bool
        Sort ``valcol`` ascending (True) or descending (False).

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows rearranged.
    """
    # Rows ordered by group, then by value inside each group.
    frame_sorted = frame.sort_values([groupcol, valcol], ascending=[True, asc])
    # Original row positions listed group by group (groups ascending,
    # positions ascending inside a group thanks to the stable sort).
    slots = (
        frame[groupcol]
        .reset_index(drop=True)
        .sort_values(kind="mergesort")
        .index.to_numpy()
    )
    # Row j of frame_sorted belongs at position slots[j]; build the inverse
    # permutation so a single positional take puts every row in place.
    order = np.empty(len(slots), dtype=np.intp)
    order[slots] = np.arange(len(slots))
    return frame_sorted.iloc[order]
np.array(sort_my_frame(df, "group", "values", asc = True)["values"])
# array([1, 2, 3, 4, 5, 1, 7, 5, 9])
np.array(sort_my_frame(df, "group", "values", asc = False)["values"])
# array([3, 7, 1, 5, 4, 9, 2, 5, 1])
Note that the latter results match your desired outcome.
I am sure this can be written up in a more succinct way. For instance, if the index of your dataframe is already ordered, you can use that one instead of the indicator ind I create (i.e., following @DJK's comment, we can use sort_index instead of sort_values and avoid assigning an additional column). In any case, the above highlights one possible solution and how to approach it. An alternative would be to use your numpy functions and wrap the output in a pd.DataFrame.

Pandas is built on top of numpy. Assuming a dataframe like so:
df
Out[21]:
group values
0 3 3
1 2 2
2 3 1
3 2 7
4 2 4
5 1 5
6 2 5
7 1 9
8 1 1
Call your function.
groupwise_sort(df.group.values, df['values'].values)
Out[22]: array([1, 2, 3, 4, 5, 1, 7, 5, 9])
groupwise_sort(df.group.values, df['values'].values, reverse=True)
Out[23]: array([3, 7, 1, 5, 4, 9, 2, 5, 1])

Related

Sorting matrix columns

I have a matrix 4*5 and I need to sort it by several columns.
Given these inputs:
sort_columns = [3, 1, 2, 4, 5, 2]
matrix = [[3, 1, 8, 1, 9],
[3, 7, 8, 2, 9],
[2, 7, 7, 1, 2],
[2, 1, 7, 1, 9]]
the matrix should first be sorted by the 3rd column (so the values 8, 8, 7, 7), then the sorted result should again be sorted by column 1 (values 3, 3, 2, 2) and so on.
So, after first sorting by column 3, the matrix would be:
2 7 7 1 2
2 1 7 1 9
3 1 8 1 9
3 7 8 2 9
and sorting on column 1 then has no effect as the values are already in the right order. The next column, 2, then makes the order:
2 1 7 1 9
3 1 8 1 9
2 7 7 1 2
3 7 8 2 9
etc.
After sorting on all the sort_columns numbers, I expect to get the result:
2 1 7 1 9
3 1 8 1 9
2 7 7 1 2
3 7 8 2 9
This is my code to sort the matrix:
def sort_matrix_columns(matrix, n, sort_columns):
    """Sort the rows of ``matrix`` in place, once per entry in ``sort_columns``.

    ``sort_columns`` holds 1-based column numbers and ``n`` is the number of
    rows to consider. Rows are exchanged whenever an earlier row has a larger
    value in the current column than a later row.

    NOTE: this exchange sort is not stable - swapping row ``i`` with a distant
    row ``j`` can move it past rows that have an equal key - so the ordering
    produced by earlier columns is not reliably preserved by later passes.
    """
    for col in sort_columns:
        # Column numbers are 1-based; list indices are 0-based.
        column = col - 1
        for i in range(n):
            for j in range(i + 1, n):
                if matrix[i][column] > matrix[j][column]:
                    temp = matrix[i]
                    matrix[i] = matrix[j]
                    matrix[j] = temp
which is called like this:
sort_matrix_columns(matrix, len(matrix), sort_columns)
But when I do I get the following wrong result:
3 1 8 1 9
2 1 7 1 9
2 7 7 1 2
3 7 8 2 9
Why am I getting the wrong order here? Where is my sort implementation failing?
The short answer is that your sort implementation is not stable.
A sort algorithm is stable when two entries in the sorted sequence keep the same (relative) order when their sort key is the same. For example, when sorting only by the first letter, a stable algorithm will always sort the sequence ['foo', 'flub', 'bar'] to be ['bar', 'foo', 'flub'], keeping the 'foo' and 'flub' values in the same relative order. Your algorithm would swap 'foo' and 'bar' (as 'f' > 'b' is true) without touching 'flub', and so you'd end up with ['bar', 'flub', 'foo'].
You need a stable sort algorithm when applying sort multiple times as you do when using multiple columns, because subsequent sortings should leave the original order applied by preceding sort operations when the value in the current column is the same between two rows.
You can see this when your implementation sorts by column 5, after first sorting on columns 3, 1, 2, 4. After those first 4 sort operations the matrix looks like this:
2 1 7 1 9
3 1 8 1 9
2 7 7 1 2
3 7 8 2 9
Your implementation then sorts by column 5, so by 9, 9, 2, 9. The first row is then swapped with the 3rd row (2 1 7 1 9 and 2 7 7 1 2), leaving the other rows all untouched. This changed the relative order of the rows with a 9:
2 7 7 1 2 < - was third
3 1 8 1 9 < - so this row is now re-ordered!
2 1 7 1 9 < - was first
3 7 8 2 9
Sorting the above output by the 2nd column (7, 1, 1, 7) then leads to the wrong output you see.
A stable sort algorithm would have moved the 2 7 7 1 2 row to be the first row without reordering the other rows:
2 7 7 1 2 < - was third
2 1 7 1 9 < - was first
3 1 8 1 9 < - was second, stays *after* the first row
3 7 8 2 9 < - was fourth, stays *after* the second row
and sorting by the second column produces the correct output.
The default Python sort implementation, TimSort (named after its inventor, Tim Peters), is a stable sort function. You could just use that (via the list.sort() method and a sort key function):
def sort_matrix_columns(matrix, sort_columns):
    """Sort the rows of ``matrix`` in place by each column in ``sort_columns``.

    ``sort_columns`` holds 1-based column numbers, applied one after another.
    ``list.sort()`` is stable, so rows that tie on the current column keep the
    relative order produced by the preceding passes.
    """
    for col in sort_columns:
        # The lambda is called during this sort() only, so reading the loop
        # variable ``col`` from the enclosing scope is safe here.
        matrix.sort(key=lambda row: row[col - 1])
Heads-up: I removed the n parameter from the function, for simplicity's sake.
Demo:
>>> def pm(m): print(*(' '.join(map(str, r)) for r in m), sep="\n")
...
>>> def sort_matrix_columns(matrix, sort_columns):
... for col in sort_columns:
... matrix.sort(key=lambda row: row[col - 1])
...
>>> sort_columns = [3, 1, 2, 4, 5, 2]
>>> matrix = [[3, 1, 8, 1, 9],
... [3, 7, 8, 2, 9],
... [2, 7, 7, 1, 2],
... [2, 1, 7, 1, 9]]
>>> sort_matrix_columns(matrix, sort_columns)
>>> pm(matrix)
2 1 7 1 9
3 1 8 1 9
2 7 7 1 2
3 7 8 2 9
You don't need a loop: if you reverse the sort_columns list and use it to build a single sort key, you can do this with a single call:
def sort_matrix_columns(matrix, sort_columns):
    """Sort the rows of ``matrix`` in place with one composite sort key.

    ``sort_columns`` holds 1-based column numbers in the order the separate
    sorts would have been applied. The last of those sorts dominates the
    result, so the key lists the columns in reverse order.
    """
    # Compute the 0-based, reversed column indices once instead of once per row.
    key_indices = [c - 1 for c in reversed(sort_columns)]
    matrix.sort(key=lambda r: [r[i] for i in key_indices])
This works the same way: the last entry in sort_columns is the most significant key; only when two rows have the same value there (a tie) does the second-to-last entry matter, and so on.
There are other stable sort algorithms, e.g. insertion or bubble sort would work just as well here. Wikipedia has a handy table of comparison sort algorithms that includes a 'stable' column, if you wanted to implement sorting yourself still.
E.g. here is a version using insertion sort:
def insertionsort_matrix_columns(matrix, sort_columns):
    """Sort the rows of ``matrix`` in place by each column, using insertion sort.

    ``sort_columns`` holds 1-based column numbers, applied one after another.
    Only adjacent rows are exchanged, and only when the earlier row's key is
    strictly larger, so rows with equal keys keep their relative order.
    """
    for col in sort_columns:
        # Column numbers are 1-based; list indices are 0-based.
        column = col - 1
        for i in range(1, len(matrix)):
            # Walk row i towards the front until its predecessor is not larger.
            for j in range(i, 0, -1):
                if matrix[j - 1][column] <= matrix[j][column]:
                    break
                matrix[j - 1], matrix[j] = matrix[j], matrix[j - 1]
I didn't use a temp variable to swap two rows. In Python, you can swap two values simply by using tuple assignments.
Because insertion sort is stable, this produces the expected outcome:
>>> matrix = [[3, 1, 8, 1, 9],
... [3, 7, 8, 2, 9],
... [2, 7, 7, 1, 2],
... [2, 1, 7, 1, 9]]
>>> insertionsort_matrix_columns(matrix, sort_columns)
>>> pm(matrix)
2 1 7 1 9
3 1 8 1 9
2 7 7 1 2
3 7 8 2 9

how to count the match number between two dataframe fast?

I'm writing a program to calculate the number of matching items between two dataframes.
for example,
A is the dataframe as : A = pd.DataFrame({'pick_num1':[1, 2, 3], 'pick_num2':[2, 3, 4], 'pick_num3':[4, 5, 6]})
B is the answer I want to match, like:
B = pd.DataFrame({'ans_num1':[1, 2, 3], 'ans_num2':[2, 3, 4], 'ans_num3':[4, 5, 6], 'ans_num4':[7, 8, 1], 'ans_num5':[9, 1, 9]})
DataFrame A
pick_num1 pick_num2 pick_num3 match_num
0 1 2 4 2
1 2 3 5 2
2 3 4 6 2
DataFrame B
ans_num1 ans_num2 ans_num3 ans_num4 ans_num5
0 1 2 4 7 9
1 2 3 5 8 1
2 3 4 6 1 9
and I want to append a new column of ['match_num'] at the end of A.
Now I have tried to write a mapping function to compare and calculate, and I found the speed is not that fast while the dataframe is huge, the functions are below:
def win_prb_func(df1, p_name):
    """Add, per row, how often the pick column ``p_name`` equals an answer column.

    The pick column is repeated once per answer column and compared
    element-wise with the answer columns named in the module-level list
    ``open_ball_name_ls``; the per-row number of matches is added to
    ``df1['match_num']`` in place. Returns ``df1``.
    """
    # One copy of the pick column per answer column (was hard-coded as 5).
    repeated = pd.concat([df1[p_name]] * len(open_ball_name_ls), axis=1).values
    df1['match_num'] += np.sum(repeated == df1[open_ball_name_ls].values, 1)
    return df1
def compute_win_prb(df1):
    """Run ``win_prb_func`` on ``df1`` for every pick column in ``pick_name_ls``.

    Returns a list with one entry per pick column. ``win_prb_func`` updates
    and returns the frame it is given, so every entry is that same ``df1``.
    """
    return [win_prb_func(df1, p_name) for p_name in pick_name_ls]
# Put picks and answers side by side so they can be compared row by row.
df1 = pd.concat([A, B], axis=1)
# The accumulator column must be the one win_prb_func adds to ('match_num');
# otherwise the in-place += raises KeyError.
df1['match_num'] = 0
result_df = compute_win_prb(df1)
where pick_name_ls is ['pick_num1', 'pick_num2', 'pick_num3'], and open_ball_name_ls is ['ans_num1', 'ans_num2', 'ans_num3', 'ans_num4', 'ans_num5'].
I'm wondering whether it is possible to make the computation faster or smarter than what I did.
The current run time is 0.015626192092895508 seconds.
Thank you for helping me!
You can use broadcasting instead of concatenating the columns:
def win_prb_func(df1, p_name):
    """Add, per row, how often the pick column ``p_name`` equals an answer column.

    The pick column is turned into a column vector and broadcast against the
    answer columns named in the module-level list ``open_ball_name_ls``; the
    per-row number of matches is added to ``df1['match_num']`` in place.
    Returns ``df1``.
    """
    # (n,) -> (n, 1) so the comparison broadcasts across all answer columns.
    picks = df1[p_name].values[:, np.newaxis]
    df1['match_num'] += np.sum(picks == df1[open_ball_name_ls].values, 1)
    return df1
Since df1[p_name].values returns a 1-D array, you have to convert it into a column vector by adding a new axis. It takes only 0.004 seconds for me.

Collapse multiple timestamp rows into a single one

I have a series like that:
s = pd.DataFrame({'ts': [1, 2, 3, 6, 7, 11, 12, 13]})
s
ts
0 1
1 2
2 3
3 6
4 7
5 11
6 12
7 13
I would like to collapse rows that have difference less than MAX_DIFF (2). That means that the desired output must be:
[{'ts_from': 1, 'ts_to': 3},
{'ts_from': 6, 'ts_to': 7},
{'ts_from': 11, 'ts_to': 13}]
I did some coding:
s['close'] = s.diff().shift(-1)
s['close'] = s[s['close'] > MAX_DIFF].astype('bool')
s['close'].iloc[-1] = True
parts = []
ts_from = None
for _, row in s.iterrows():
if row['close'] is True:
part = {'ts_from': ts_from, 'ts_to': row['ts']}
parts.append(part)
ts_from = None
continue
if not ts_from:
ts_from = row['ts']
This works but does not seem optimal because of iterrows(). I thought about ranks but couldn't figure out how to implement them so that I could then group by rank.
Is there a way to optimize the algorithm?
You can create groups by checking where the difference is more than your threshold and take a cumsum. Then agg however you'd like, perhaps first and last in this case.
gp = s['ts'].diff().abs().ge(2).cumsum().rename(None)
res = s.groupby(gp).agg(ts_from=('ts', 'first'),
ts_to=('ts', 'last'))
# ts_from ts_to
#0 1 3
#1 6 7
#2 11 13
And if you want the list of dicts then:
res.to_dict('records')
#[{'ts_from': 1, 'ts_to': 3},
# {'ts_from': 6, 'ts_to': 7},
# {'ts_from': 11, 'ts_to': 13}]
For completeness here is how the grouper aligns with the DataFrame:
s['gp'] = gp
print(s)
ts gp
0 1 0 # `1` becomes ts_from for group 0
1 2 0
2 3 0 # `3` becomes ts_to for group 0
3 6 1 # `6` becomes ts_from for group 1
4 7 1 # `7` becomes ts_to for group 1
5 11 2 # `11` becomes ts_from for group 2
6 12 2
7 13 2 # `13` becomes ts_to for group 2

Appending separate dataframes, each as a column

I am currently working on the following:
data - with the correct index
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(data_values)
wcss.append(kmeans.inertia_)
kmeans = KMeans(n_clusters=2).fit(data_values)
y = kmeans.fit_predict(data_values) # prediction of k
df= pd.DataFrame(y,index = data.index)
....
#got here multiple dicts
Example of y:
[1 2 3 4 5 2 2 5 1 0 0 1 0 0 1 0 1 4 4 4 3 1 0 0 1 0 0 ...]
f = pd.DataFrame(y, columns=[buster])
# Append this frame to the CSV file; the file-name literal has to be closed
# before the mode argument.
f.to_csv('busters.csv', mode='a')
y = clusters after determination
I don't know how I got stuck on this. I am iterating over 20 dataframes, each of which consists of one column with values from 1-9. The index is irrelevant. I am trying to append all the frames together as columns, but instead they are just written one after the other. If I use ".T" to transpose, I still get rows with irrelevant values as the index, which I can't remove because they are actually headers.
Needed result
If the dicts produced in each iteration look like {'Buster1': [0, 2, 2, 4, 5]}, {'Buster2': [1, 2, 3, 4, 5]} ..., using 5 elements here for illustration purposes, and all the lists, i.e., values in the dicts, have the same number of elements (as it is the case in your example), you could create a single dict and use pd.DataFrame directly. (You may also want to take a look at pandas.DataFrame.from_dict.)
You may have lists with more than 5 elements, more than 3 dicts (and thus columns), and you will be generating the dicts with a loop, but the code below should be sufficient for getting the idea.
>>> import pandas as pd
>>>
>>> d = {}
>>> # update d in every iteration
>>> d.update({'Buster 1': [0, 2, 2, 4, 5]})
>>> d.update({'Buster 2': [1, 2, 3, 4, 5]})
>>> # ...
>>> d.update({'Buster n': [0, 9, 3, 0, 0]})
>>>
>>> pd.DataFrame(d, columns=d.keys())
Buster 1 Buster 2 Buster n
0 0 1 0
1 2 2 9
2 2 3 3
3 4 4 0
4 5 5 0
If you have the keys, e.g., 'Buster 1', and values, e.g., [0, 2, 2, 4, 5], separated, as I believe is the case, you can simplify the above (and make it more efficient) by replacing d.update({'Buster 1': [0, 2, 2, 4, 5]}) with d['Buster 1']=[0, 2, 2, 4, 5].
I included columns=d.keys() because depending on your Python and pandas version the ordering of the columns may not be as you expect it to be. You can specify the ordering of the columns through specifying the order in which you provide the keys. For example:
>>> pd.DataFrame(d, columns=sorted(d.keys(),reverse=True))
Buster n Buster 2 Buster 1
0 0 1 0
1 9 2 2
2 3 3 2
3 0 4 4
4 0 5 5
Although it may not apply to your use case, if you do not want to print the index, you can take a look at How to print pandas DataFrame without index.

Python Timeseries Pandas: remove np.nan if more than 3 occurrences continuously

I have a timeseries dataframe with column [timestamp,Digital_Data]
Could you guide me on how to remove all rows where Digital_Data is np.nan for more than three consecutive occurrences? A data sample is below.
Sorry i am not sure how to add a table here, it turns into image when i copy and paste it from excel
Sample Data
There MUST be a pythonic way to solve it, or even a solution provided by pandas itself, and I encourage you to search! but just in case you urgently need the solution, here is how I solve it:
1. example
x = [1, 2, np.nan, np.nan, np.nan, np.nan, 2, 1, np.nan, np.nan, 3]
y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
df = pd.DataFrame({'x': x, 'y': y})
output is
x y
0 1.0 1
1 2.0 2
2 NaN 3
3 NaN 4
4 NaN 5
5 NaN 6
6 2.0 7
7 1.0 8
8 NaN 9
9 NaN 10
10 3.0 11
2. Get NaN indices
ind = df[df.x.isna()].index.tolist()
3. Get the block of adjacent NaN indices
I create an empty holder inds_to_delete and fill it with the blocks of adjacent indices. I check the adjacency by checking if the element i is 1 more than i-1
# Holder for the blocks of adjacent NaN indices.
inds_to_delete = []
# Nothing to group when there are no NaN rows at all.
if ind:
    # first element by default in temp
    temp = [ind[0]]
    for prev, cur in zip(ind, ind[1:]):
        if cur == prev + 1:
            # still adjacent: extend the current block
            temp.append(cur)
        else:
            # gap found: store the finished block and start the next one
            inds_to_delete.append(temp)
            temp = [cur]
    # last block of the series also appended to the holder
    inds_to_delete.append(temp)
output of inds_to_delete
[[2, 3, 4, 5], [8, 9]]
4. blocks with length more than 2 and joining
inds_to_delete = [i for i in inds_to_delete if len(i)>2]
>>> [[2, 3, 4, 5]]
inds_to_delete = [i for j in inds_to_delete for i in j]
>>> [2, 3, 4, 5]
if inds_to_delete is [[1, 2, 3], [6, 7, 8]] then final line makes it: [1, 2, 3, 6, 7, 8]
5. Drop from dataframe
df.drop(inds_to_delete, inplace=True)
output is
x y
0 1.0 1
1 2.0 2
6 2.0 7
7 1.0 8
8 NaN 9
9 NaN 10
10 3.0 11
(maybe this solution can be awarded by SO as the most unpythonic solution)
Thanks Alireza, and as you said i hope there will be a pythonic way to solve this.
I have temporarily fixed it with the below code assuming the threshold is more than 15 for nan removal:
df = pd.DataFrame(list(zip(x,y)), columns =['TimeStamp','FHR']).set_index('TimeStamp', drop=True)
df = df = df.resample('S').mean()
TimeStampToRemove = []
fhrtoremove = []
df1 = df
for i, row in enumerate(df.values):
fhr = df['FHR'][i]
if np.isnan(fhr):
TimeStampToRemove.append(df.index[i])
fhrtoremove.append(fhr)
else:
if len(TimeStampToRemove) > 15:
df1toRemove = pd.DataFrame(list(zip(TimeStampToRemove,fhrtoremove)), columns =['TimeStamp','FHR']).set_index('TimeStamp', drop=True)
TimeStampToRemove.clear()
fhrtoremove.clear()
df1 = df1.drop(df1toRemove.index.tolist())
if len(TimeStampToRemove) > 0:
df1toRemove = pd.DataFrame(list(zip(TimeStampToRemove,fhrtoremove)), columns =['TimeStamp','FHR']).set_index('TimeStamp', drop=True)
df1 = df1.drop(df1toRemove.index.tolist())
TimeStampToRemove.clear()
fhrtoremove.clear()

Categories