I need to find the cosine similarity between two vectors, using the user ratings stored in the user_dict dictionary. The ratings were imported from a CSV file and then converted to a dictionary keyed by user, with each value holding that user's list of ratings. My question is: how do I loop through the dictionary, take two users' ratings at a time, and compute their similarity using the cosine distance function?
The loop needs to avoid comparing a user with themselves, and also avoid comparing the same two users in a different order (e.g. user 5 vs user 3 and user 3 vs user 5).
from scipy import spatial
d = {'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4]}
all_keys = list(d.keys())
for i in range(len(all_keys)):
    for j in range(i + 1, len(all_keys)):
        print(f"Cosine similarity between {all_keys[i]} and {all_keys[j]} is "
              f"{1 - spatial.distance.cosine(d[all_keys[i]], d[all_keys[j]])}")
Or, using pandas:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
d = {'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4]}
df = pd.DataFrame(d)
cos_df = pd.DataFrame(cosine_similarity(df.T), columns=df.columns)
cos_df.insert(0, "Columns", df.columns)
print(cos_df)
Output:
Columns U1 U2 U3 U4 ... U21 U22 U23 U24 U25
0 U1 1.000000 0.374228 0.902462 0.380803 ... 0.787351 0.805479 0.182123 0.414455 0.742959
1 U2 0.374228 1.000000 0.323498 0.648886 ... 0.428580 0.588035 0.838144 0.746816 0.574696
2 U3 0.902462 0.323498 1.000000 0.367348 ... 0.747476 0.790950 0.139942 0.181195 0.867595
3 U4 0.380803 0.648886 0.367348 1.000000 ... 0.562244 0.572351 0.631579 0.636035 0.543830
4 U5 0.829156 0.000000 0.876038 0.339457 ... 0.770675 0.651439 0.000000 0.137890 0.660241
5 U6 0.348816 0.864242 0.254164 0.650011 ... 0.500694 0.448630 0.707365 0.759113 0.553116
6 U7 0.888018 0.262071 0.870404 0.442141 ... 0.621170 0.642383 0.000000 0.249542 0.712893
7 U8 0.671751 0.808290 0.610121 0.655610 ... 0.735867 0.793363 0.749269 0.711438 0.774202
8 U9 0.561951 0.650011 0.667940 0.483810 ... 0.512989 0.681623 0.483810 0.321246 0.659221
9 U10 0.768376 0.545545 0.905945 0.584094 ... 0.817316 0.810441 0.442495 0.381958 0.930116
10 U11 0.766131 0.625543 0.584602 0.405906 ... 0.606128 0.561442 0.439732 0.700749 0.599162
11 U12 0.769604 0.451144 0.813118 0.329333 ... 0.724166 0.583435 0.250921 0.406111 0.740772
12 U13 0.806747 0.577813 0.687871 0.517409 ... 0.666687 0.708161 0.427425 0.582552 0.757112
13 U14 0.695436 0.436785 0.734756 0.612195 ... 0.644610 0.720248 0.272087 0.293578 0.662689
14 U15 0.849837 0.213504 0.759751 0.337691 ... 0.805629 0.669039 0.259762 0.448449 0.582825
15 U16 0.781028 0.663914 0.757364 0.578184 ... 0.785252 0.992774 0.561179 0.455047 0.783178
16 U17 0.713653 0.409462 0.713247 0.337227 ... 0.528221 0.456211 0.214599 0.396942 0.669745
17 U18 0.569298 0.879408 0.487692 0.733674 ... 0.573070 0.741858 0.709218 0.696631 0.664230
18 U19 0.898717 0.262071 0.949531 0.221071 ... 0.656330 0.691049 0.000000 0.146789 0.813301
19 U20 0.165567 0.540738 0.000000 0.684211 ... 0.290191 0.135557 0.447368 0.681466 0.233070
20 U21 0.787351 0.428580 0.747476 0.562244 ... 1.000000 0.809693 0.489696 0.563602 0.771035
21 U22 0.805479 0.588035 0.790950 0.572351 ... 0.809693 1.000000 0.497042 0.403039 0.782601
22 U23 0.182123 0.838144 0.139942 0.631579 ... 0.489696 0.497042 1.000000 0.795044 0.388450
23 U24 0.414455 0.746816 0.181195 0.636035 ... 0.563602 0.403039 0.795044 1.000000 0.335306
24 U25 0.742959 0.574696 0.867595 0.543830 ... 0.771035 0.782601 0.388450 0.335306 1.000000
[25 rows x 26 columns]
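If a ranked list of unique pairs is more useful than the full matrix, one possible follow-up (a sketch building on df and cosine_similarity above; the pairs frame and its column names are illustrative) pulls out the upper triangle so each pair appears exactly once:
import numpy as np
sim = cosine_similarity(df.T)
iu, ju = np.triu_indices(len(df.columns), k=1)  # upper triangle: no self-pairs, no mirrored pairs
pairs = pd.DataFrame({'user_a': df.columns[iu],
                      'user_b': df.columns[ju],
                      'similarity': sim[iu, ju]})
print(pairs.sort_values('similarity', ascending=False).head())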
You can use itertools.combinations for that (dct is your input dictionary):
from itertools import combinations

for k1, k2 in combinations(dct.keys(), 2):
    # compute cosine similarity between dct[k1] and dct[k2]
    ...
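Filled in with the same SciPy call as the first snippet (assuming dct maps each user to a ratings list like the dictionary above), this becomes:
from itertools import combinations
from scipy import spatial

for k1, k2 in combinations(dct.keys(), 2):
    similarity = 1 - spatial.distance.cosine(dct[k1], dct[k2])
    print(f"Cosine similarity between {k1} and {k2} is {similarity}")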
Related
I want to concatenate all the 2-dimensional values in a dictionary.
The number of rows in these values is always the same.
D = {'a': [[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]],
'b': [[1, 1],
[1, 1],
[1, 1]],
'c': [[2, 2, 2, 2],
[2, 2, 2, 2],
[2, 2, 2, 2]]
}
And the output must be in the form of a torch tensor.
tensor([[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2]])
Any help would be appreciated!!
import torch
print(torch.cat([torch.tensor(D[name]) for name in D], dim=1))
Output:
tensor([[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2]])
from itertools import chain

# The number of rows is the same for every value, so read it off any one of them
n_rows = len(next(iter(D.values())))
l = []
for i in range(n_rows):
    t = [D[k][i] for k in D]               # the i-th row of every value
    l.append(list(chain.from_iterable(t)))
print(l)
Output:
[[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2],
[0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2]]
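Since the question asks for a torch tensor, the nested list l can then be wrapped directly:
import torch
print(torch.tensor(l))  # produces the same tensor as the torch.cat approach above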
I have a dictionary of user ratings, user_dict, which looks like this:
{'U1': [3, 4, 2, 5, 0, 4, 1, 3, 0, 0, 4],
'U2': [2, 3, 1, 0, 3, 0, 2, 0, 0, 3, 0],
'U3': [0, 4, 0, 5, 0, 4, 0, 3, 0, 2, 4],
'U4': [0, 0, 2, 1, 4, 3, 2, 0, 0, 2, 0],
'U5': [0, 0, 0, 5, 0, 4, 0, 3, 0, 0, 4],
'U6': [2, 3, 4, 0, 3, 0, 3, 0, 3, 4, 0],
'U7': [0, 4, 3, 5, 0, 5, 0, 0, 0, 0, 4],
'U8': [4, 3, 0, 3, 4, 2, 2, 0, 2, 3, 2],
'U9': [0, 2, 0, 3, 1, 0, 1, 0, 0, 2, 0],
'U10': [0, 3, 0, 4, 3, 3, 0, 3, 0, 4, 4],
'U11': [2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 2],
'U12': [0, 4, 4, 5, 0, 0, 0, 3, 0, 4, 5],
'U13': [3, 3, 0, 2, 2, 3, 2, 0, 2, 0, 3],
'U14': [0, 3, 4, 5, 0, 5, 0, 0, 0, 4, 0],
'U15': [2, 0, 0, 3, 0, 2, 2, 3, 0, 0, 3],
'U16': [4, 4, 0, 4, 3, 4, 0, 3, 0, 3, 0],
'U17': [0, 2, 0, 3, 1, 0, 2, 0, 1, 0, 3],
'U18': [2, 3, 1, 0, 3, 2, 3, 2, 0, 2, 0],
'U19': [0, 5, 0, 4, 0, 3, 0, 4, 0, 0, 5],
'U20': [0, 0, 3, 0, 3, 0, 4, 0, 2, 0, 0],
'U21': [3, 0, 2, 4, 2, 3, 0, 4, 2, 3, 3],
'U22': [4, 4, 0, 5, 3, 5, 0, 4, 0, 3, 0],
'U23': [3, 0, 0, 0, 3, 0, 2, 0, 0, 4, 0],
'U24': [4, 0, 3, 0, 3, 0, 3, 0, 0, 2, 2],
'U25': [0, 5, 0, 3, 3, 4, 0, 3, 3, 4, 4]}
When I load this dictionary into a pandas DataFrame, I want it to have three columns: "User", "Agent", and "Rating", so I ran this code:
DF = pd.DataFrame()
for key in user_dict.keys():
    df = pd.DataFrame(columns=['User', 'Agent', 'Rating'])
    df['Rating'] = pd.Series(user_dict[key])
    df['Agent'] = df.index
    df['User'] = key
    DF = pd.concat([DF, df], axis=0)
DF = DF.reset_index(drop=True)
However, I don't want to add any entries where the rating is 0, as this indicates that the user has not rated that "agent". How do I make the program skip (or delete) entries with a rating of 0?
You can reshape with DataFrame.unstack on a DataFrame built from the dictionary, filter out the zeros with a not-equal comparison, set the index names so they become the new column names, and finally call Series.reset_index:
DF = (pd.DataFrame(user_dict)
        .unstack()
        .loc[lambda x: x != 0]
        .rename_axis(('User', 'Agent'))
        .reset_index(name='Rating'))
print(DF)
User Agent Rating
0 U1 0 3
1 U1 1 4
2 U1 2 2
3 U1 3 5
4 U1 5 4
.. ... ... ...
155 U25 5 4
156 U25 7 3
157 U25 8 3
158 U25 9 4
159 U25 10 4
[160 rows x 3 columns]
Another idea is to filter in the last step with DataFrame.query:
DF = (pd.DataFrame(user_dict)
        .unstack()
        .rename_axis(('User', 'Agent'))
        .reset_index(name='Rating')
        .query('Rating != 0'))
print(DF)
User Agent Rating
0 U1 0 3
1 U1 1 4
2 U1 2 2
3 U1 3 5
5 U1 5 4
.. ... ... ...
269 U25 5 4
271 U25 7 3
272 U25 8 3
273 U25 9 4
274 U25 10 4
[160 rows x 3 columns]
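The same long-format reshape can also be written with DataFrame.melt, if that reads more naturally (a sketch of the same idea, not the answer's original code):
DF = (pd.DataFrame(user_dict)
        .rename_axis('Agent')
        .reset_index()
        .melt(id_vars='Agent', var_name='User', value_name='Rating')
        .query('Rating != 0')
        [['User', 'Agent', 'Rating']])
print(DF)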
How do you specify multiple conditions in the np.count_nonzero function?
This is for counting the numbers inside an array whose values lie between two bounds. I know you can subtract the outcomes of two individual count_nonzero calls, but I would like to know if there is an easy way to pass multiple conditions to np.count_nonzero.
import numpy as np
array = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0],
[0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1, 0],
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 1, 0],
[0, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 5, 4, 3, 2, 0],
[0, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 0],
[0, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 5, 4, 3, 2, 0],
[0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 1, 0],
[0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 1, 0],
[0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0],
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
# Count occurrences of values between 5 and 8 (i.e. 6, 7 and 8) in array.
result1 = np.count_nonzero(array <= 8)
result2 = np.count_nonzero(array <= 5)
result = result1 - result2
I would like to know if there is a way that looks something like:
np.count_nonzero(array >= 6 and array <= 8)
Is this what you are looking for?
np.count_nonzero(np.logical_and(array >= 6, array <= 8))
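Equivalently, NumPy's bitwise & can express the same combined mask more compactly; the parentheses are required because & binds tighter than the comparisons:
result = np.count_nonzero((array >= 6) & (array <= 8))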
I have a dict in which each key corresponds to a gene name, and each value corresponds to a list. The length of the list is different for each gene, because each element represents a different nucleotide. The number at each position indicates the "score" of the nucleotide.
Because each gene is a different length, I want to be able to directly compare their positional score distributions by splitting each gene up into quantiles (most likely, percentiles: 100 bins).
Here is some simulation data:
myData = {
'Gene1': [3, 1, 1, 2, 3, 1, 1, 1, 3, 0, 0, 0, 3, 3, 3, 0, 1, 2, 1, 3, 2, 2, 0, 2, 0, 1, 0, 3, 0, 3, 1, 1, 0, 3, 0, 0, 1, 0, 1, 0, 1, 3, 3, 2, 3, 1, 0, 1, 2, 2, 0, 3, 0, 2, 0, 1, 1, 2, 3, 3, 1, 2, 1, 3, 1, 0, 0, 3, 2, 0, 3, 0, 2, 1, 1, 1, 2, 1, 1, 3, 0, 1, 1, 1, 3, 3, 0, 2, 2, 1, 3, 2, 3, 0, 2, 3, 2, 1, 3, 1, 3, 2, 1, 3, 0, 3, 3, 0, 0, 1, 0, 3, 1, 1, 3, 0, 0, 2, 3, 1, 0, 2, 1, 2, 1, 2, 1, 2, 0, 1, 1, 1, 3, 1, 3, 1, 3, 2, 3, 3, 3, 1, 1, 2, 1, 0, 2, 2, 2, 0, 1, 0, 3, 1, 3, 2, 1, 3, 0, 1, 3, 1, 0, 1, 2, 1, 2, 2, 3, 2, 3, 2, 2, 2, 1, 2, 2, 0, 3, 1, 2, 1, 1, 3, 2, 2, 1, 3, 1, 0, 1, 3, 2, 2, 3, 0, 0, 1, 0, 0, 3],
'Gene2': [3, 0, 0, 0, 3, 3, 1, 3, 3, 1, 0, 0, 1, 0, 1, 1, 3, 2, 2, 2, 0, 1, 3, 2, 1, 3, 1, 1, 2, 3, 0, 2, 0, 2, 1, 3, 3, 3, 1, 2, 3, 2, 3, 1, 3, 0, 1, 1, 1, 1, 3, 2, 0, 3, 0, 1, 1, 2, 3, 0, 2, 1, 3, 3, 0, 3, 2, 1, 1, 2, 0, 0, 1, 3, 3, 2, 2, 3, 1, 2, 1, 1, 0, 0, 1, 0, 3, 2, 3, 0, 2, 0, 2, 0, 2, 3, 0, 3, 0, 3, 2, 2, 0, 2, 3, 0, 2, 2, 3, 0, 3, 1, 2, 3, 0, 1, 0, 2, 3, 1, 3, 1, 2, 3, 1, 1, 0, 1, 3, 0, 2, 3, 3, 3, 3, 0, 1, 2, 2, 2, 3, 0, 3, 1, 0, 2, 3, 1, 0, 1, 1, 0, 3, 3, 1, 2, 1, 2, 3, 2, 3, 1, 2, 0, 2, 3, 1, 2, 3, 2, 1, 2, 2, 0, 0, 0, 0, 2, 0, 2, 3, 0, 2, 0, 0, 2, 0, 3, 3, 0, 1, 2, 3, 1, 3, 3, 1, 2, 1, 2, 1, 3, 2, 0, 2, 3, 0, 0, 0, 1, 1, 0, 1, 2, 0, 1, 2, 1, 3, 3, 0, 2, 2, 1, 0, 1, 1, 1, 0, 0, 2, 1, 2, 0, 1, 2, 1, 1, 3, 0, 1, 0, 1, 2, 1, 3, 0, 2, 3, 1, 2, 0, 0, 3, 2, 0, 3, 2, 1, 2, 3, 1, 0, 1, 0, 0, 1, 2, 3, 3, 2, 2, 1, 2, 2, 3, 3, 3, 3, 0, 0, 2, 2, 2, 2, 3, 2, 3, 2, 0, 3, 1, 0, 2, 3, 0, 1, 2, 2, 0, 2],
'Gene3': [2, 3, 1, 0, 3, 2, 1, 0, 1, 2, 1, 2, 1, 3, 0, 2, 2, 3, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 2, 3, 2, 2, 1, 3, 1, 2, 3, 0, 0, 3, 1, 0, 3, 2, 2, 3, 0, 0, 3, 3, 1, 1, 1, 0, 0, 2, 3, 2, 0, 2, 0, 1, 0, 2, 3, 0, 2, 0, 3, 3, 0, 0, 1, 0, 3, 2, 1, 1, 3, 3, 0, 2, 3, 1, 1, 0, 1, 3, 2, 1, 0, 3, 2, 0, 3, 2, 1, 1, 0, 3, 0, 0, 2, 0, 3, 3, 0, 2, 0, 3, 3, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 3, 3, 3, 3, 1, 3, 0, 0, 3, 1, 0, 2, 2, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 3, 0, 0, 3, 0, 2, 2, 0, 0, 3, 0, 1, 3, 1, 1, 0, 2, 2, 3, 3, 0, 2, 0, 0, 2, 3, 1, 2, 1, 1, 2, 2, 0, 0, 3, 2, 2, 2, 1, 2, 0, 3, 2, 2, 2, 2, 1, 0, 3, 2, 2, 1, 0, 0, 2, 2, 0, 3, 2, 0, 2, 2, 1, 1, 1, 2, 1, 2, 0, 1, 0, 3, 2, 0, 2, 3, 3, 0, 2, 2, 0, 1, 1, 3, 0, 0, 1, 2, 3, 1, 3, 2, 3, 3, 2, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 1, 2, 1, 3, 1, 3, 1, 1, 0, 3, 0, 1, 1, 1, 1, 1, 0, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 3, 3, 0, 0, 3, 2, 1, 2, 1, 0, 3, 2, 3, 1, 1, 0, 1, 3, 2, 0, 3, 1, 3, 1, 2, 0, 0, 2, 3, 2, 2, 0, 3, 0, 2, 2, 2, 3, 3, 2, 1, 3, 3, 0, 2, 2, 2, 1, 1, 2, 1, 3, 2, 3, 2, 1, 3, 1, 0, 0, 2, 0, 1, 1, 3, 3, 0, 1, 2, 3, 1, 2, 3, 1, 1, 1, 2, 0, 2, 0, 1, 0, 3, 1, 0, 3, 3, 1, 3, 1, 1, 2, 2, 0, 2, 0, 1, 0, 3, 1, 1, 1, 3, 3, 0, 0, 1, 1, 2, 3, 0, 2, 0, 1, 1, 3, 3, 1, 1, 0, 0, 2, 0, 1, 2, 2, 2, 3, 1, 1, 1, 0, 3, 0, 0, 0, 1, 0, 1, 3, 1, 2, 2, 1, 2, 2]
}
As you can see, Gene1 has a length of 201, and Gene2 has a length of 301. However, Gene3 has a length of 428. I want to summarize each of these lists so that, for an arbitrary number of bins (nBins), I can partition the list into a list of lists.
For example, for the first two genes, if I chose nBins=100, then Gene1 would look like [[3,1],[1,2],[3,1],[1,1]...] while Gene2 would look like [[3,0,0],[0,3,3],[1,3,3]...]. That is, I want to partition based on the positions and not the values themselves. My dataset is large, so I'm looking for a library that can do this most efficiently.
You don't say what you want to happen in the case where the length isn't divisible by the number of bins. My code mixes sublists of length floor(length/nBins) and ceiling(length/nBins) to get the right number of bins.
nBins = 100
new_data = {key: [value[int(bin_number * len(value) / nBins):
                        int((bin_number + 1) * len(value) / nBins)]
                  for bin_number in range(nBins)]
            for key, value in myData.items()}
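A quick check against the simulated myData above: every gene comes out with exactly nBins bins, whatever its original length.
print({key: len(bins) for key, bins in new_data.items()})
# {'Gene1': 100, 'Gene2': 100, 'Gene3': 100}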
You don't need a library; pure Python should be fast enough in 90% of cases:
nBins = 100

def group(l, size):
    # Consecutive chunks of the given size; any remainder spills into one
    # extra, shorter chunk at the end, so a gene can yield nBins + 1 groups.
    return [l[i:i + size] for i in range(0, len(l), size)]

bin_data = {k: group(l, len(l) // nBins) for k, l in myData.items()}
print(bin_data)
I have a 2D boolean numpy array that represents an image, on which I call skimage.measure.label to label each segmented region, giving me a 2D array of ints in [0, 500]; each value in this array represents the region label for that pixel. I would now like to remove the smallest regions: if my input array has shape (n, n), I want all labeled regions of fewer than m pixels to be subsumed into the larger surrounding regions. For example, if n=10 and m=5, my input could be,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 7, 8, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 2, 2, 2, 1, 1
4, 4, 4, 4, 2, 2, 2, 2, 1, 1
4, 6, 6, 4, 2, 2, 2, 3, 3, 3
4, 6, 6, 4, 5, 5, 5, 3, 3, 5
4, 4, 4, 4, 5, 5, 5, 5, 5, 5
4, 4, 4, 4, 5, 5, 5, 5, 5, 5
and the output is then,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 0, 0, 1, 1, 1 # 7 and 8 are replaced by 0
0, 0, 0, 0, 0, 0, 0, 1, 1, 1
0, 0, 0, 0, 0, 2, 2, 2, 1, 1
4, 4, 4, 4, 2, 2, 2, 2, 1, 1
4, 4, 4, 4, 2, 2, 2, 3, 3, 3 # 6 is gone, but 3 remains
4, 4, 4, 4, 5, 5, 5, 3, 3, 5
4, 4, 4, 4, 5, 5, 5, 5, 5, 5
4, 4, 4, 4, 5, 5, 5, 5, 5, 5
I've looked into skimage morphology operations, including binary closing, but none seem to work well for my use case. Any suggestions?
You can do this by performing a binary dilation on the boolean mask for each label. The dilated mask overlaps the label's neighbours, which tells you which regions border each other and by how much, so each small region can then be reassigned to its most common neighbour. For example:
import numpy as np
import scipy.ndimage
m = 5
arr = [[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 7, 8, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 2, 2, 2, 1, 1],
[4, 4, 4, 4, 2, 2, 2, 2, 1, 1],
[4, 6, 6, 4, 2, 2, 2, 3, 3, 3],
[4, 6, 6, 4, 5, 5, 5, 3, 3, 5],
[4, 4, 4, 4, 5, 5, 5, 5, 5, 5],
[4, 4, 4, 4, 5, 5, 5, 5, 5, 5]]
arr = np.array(arr)
nval = np.max(arr) + 1
# Count the number of occurrences of each label
counts, _ = np.histogram(arr, bins=range(nval + 1))
# Compute a dilated mask for each label via binary dilation; the dilated
# mask also covers the label's immediate neighbours
c = np.array([scipy.ndimage.binary_dilation(arr == i)
              for i in range(nval)])
# For each label with too few pixels, reassign its pixels to the label whose
# dilated mask overlaps them the most (its most common neighbour)
for i in filter(lambda i: counts[i] < m, range(nval)):
    arr[arr == i] = np.argmax(np.sum(c[:, arr == i], axis=1))
Which gives the expected result:
>>> arr.tolist()
[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 2, 2, 2, 1, 1],
[4, 4, 4, 4, 2, 2, 2, 2, 1, 1],
[4, 4, 4, 4, 2, 2, 2, 3, 3, 3],
[4, 4, 4, 4, 5, 5, 5, 3, 3, 5],
[4, 4, 4, 4, 5, 5, 5, 5, 5, 5],
[4, 4, 4, 4, 5, 5, 5, 5, 5, 5]]