pairs of rows with the highest string similarity

pairs of rows with the highest string similarity - python

I have this DataFrame:
import pandas as pd
d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
'name':['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp']
,'amount':[2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)
df1
id name amount
0 1 ada 2
1 1 aad -12
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
6 3 ccc 2
7 3 cccd 3
8 3 ood -5
9 4 aaa 3
10 4 aaa -10
11 4 aar 10
12 4 rrp -10
First, I want to find, per id, the rows whose amount has a matching amount of the opposite sign, which I do like this:
def match_pos_neg(df):
    """Keep the rows of ``df`` whose amount also occurs with the opposite sign.

    A row is kept when its ``amount`` equals the negation of any ``amount``
    in the same frame; the original index labels are preserved.
    """
    amounts = df["amount"]
    has_opposite = amounts.isin(-amounts)
    return df.loc[has_opposite]
df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
df1
id name amount
1 1 aad -12
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
10 4 aaa -10
11 4 aar 10
12 4 rrp -10
Next, I want to keep only the pairs of matching positive and negative amounts whose values in the string column 'name' are the most similar. So if, within an id, a positive amount matches two negative amounts (or vice versa), I want to isolate the pair with the highest name similarity for that id. My desired output looks like this:
id name amount
2 1 ada 12
3 1 ada -12
4 2 dddd 5
5 2 fdd -5
10 4 aaa -10
11 4 aar 10
I guess I have to use some kind of string similarity measure, such as SequenceMatcher or the Jaccard index, but I am not sure how to go about it. Any help on how to get my desired output would be very much appreciated.

You can try something like this.
Note that you can change the information that is printed as you wish; you only need to edit the values returned by the function create_sim.
import pandas as pd
from operator import itemgetter
d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
'name':['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp']
,'amount':[2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)
def match_pos_neg(df):
    """Return the rows whose ``amount`` has an opposite-signed twin in ``df``.

    The mask is True where a row's amount appears among the negated amounts
    of the frame, so both members of every +x / -x pair are retained.
    """
    negated = -df["amount"]
    mask = df["amount"].isin(negated)
    return df.loc[mask]
df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
print(df1)
def split(word):
    """Return the items of ``word`` (its characters, for a string) as a list."""
    return list(word)
def DistJaccard(str1, str2):
    """Return the Jaccard similarity of the character sets of two strings.

    Despite the name, the value is a similarity, not a distance: it is the
    number of distinct characters the strings share divided by the number
    of distinct characters appearing in either, so 1.0 means identical
    character sets and 0.0 means no character in common.

    Two empty strings are treated as identical (1.0) instead of raising
    ZeroDivisionError on the empty union.
    """
    chars1 = set(str1)
    chars2 = set(str2)
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: nothing differs between them.
        return 1.0
    return len(chars1 & chars2) / len(union)
def create_sim(df, idx):
    """Find the most similar opposite-amount name for the row at position ``idx``.

    Parameters:
        df: frame with 'id', 'name' and 'amount' columns.
        idx: zero-based position (not index label) of the row to look up.

    Returns:
        A ``(name, similarity)`` tuple for the best-scoring row that has the
        same id and the exactly opposite amount, or ``None`` when the id
        group does not contain both positive and negative amounts or holds
        no row with the opposite amount. On ties the first candidate in
        frame order wins.
    """
    row_id = df['id'].values[idx]
    row_amount = df['amount'].values[idx]
    row_name = df['name'].values[idx]

    group = df.loc[df['id'] == row_id]
    amounts = group['amount']
    if not ((amounts > 0).any() and (amounts < 0).any()):
        return None

    # Search only inside the id group: filtering the whole frame would let
    # rows of other ids that happen to carry the same amount become candidates.
    candidates = group.loc[amounts == -row_amount, 'name']
    if candidates.empty:
        return None

    scored = [(name, DistJaccard(row_name, name)) for name in candidates]
    return max(scored, key=itemgetter(1))
# create_sim expects the row's position, so number the rows with enumerate
# instead of maintaining a separate counter next to iterrows().
for count, (index, row) in enumerate(df1.iterrows()):
    res = create_sim(df1, count)
    if res:
        print(f"The most similar word of {row['name']} is {res[0]} with similarity of {res[1]}")
    else:
        print(f"No similar words of {row['name']}")
Edit:
In order to make a DF with the results you can change it to this:
# Build one record per row of df1: the row itself ("item 1") and its most
# similar opposite-amount partner within the same id ("item 2").
records = []
for count, (index, row) in enumerate(df1.iterrows()):
    res = create_sim(df1, count)
    match_id = None
    match_row = None
    match_name = None
    if res:
        # create_sim returns (name, similarity) only, so the partner is
        # looked up by this row's own id plus the exactly opposite amount.
        partner_labels = df1.index[
            (df1['id'] == row['id'])
            & (df1['name'] == res[0])
            & (df1['amount'] == -row['amount'])
        ].tolist()
        if partner_labels:
            match_id = row['id']
            match_row = partner_labels[0]
            match_name = res[0]
    # Both "row" columns hold df1 index labels so they can be compared.
    records.append((row['id'], index, row['name'], match_id, match_row, match_name))
final = pd.DataFrame(
    records,
    columns=['item 1 id', 'item 1 row', 'item 1 name',
             'item 2 id', 'item 2 row', 'item 2 name'],
)
print(final)

Related

How to count the number of times in a pandas df that the sum of consecutive values crosses a threshold?

I have a data frame that consists of a time-series of integers. I'm trying to group the data frame by year and then for each year count the number of times that the sum of the absolute value of consecutive entries with the same sign is greater than or equal to 5.
>>> import pandas as pd
>>> l = [1, -1, -4, 2, 2, 4, 5, 1, -3, -4]
>>> idx1 = pd.date_range('2019-01-01',periods=5)
>>> idx2 = pd.date_range('2020-01-01',periods=5)
>>> idx = idx1.union(idx2)
>>> df = pd.DataFrame(l, index=idx, columns=['a'])
>>> df
a
2019-01-01 1
2019-01-02 -1
2019-01-03 -4 \\ 2019 count = 1: abs(-1) + abs(-4) >= 5
2019-01-04 2
2019-01-05 2
2020-01-01 4
2020-01-02 5 \\ 2020 count = 1: abs(4) + abs(5) + abs(1) = 10 >=5
2020-01-03 1
2020-01-04 -3
2020-01-05 -4 \\ 2020 count = 2: abs(-3) + abs(-4) = 7 >= 5
The desired output is:
2019 1
2020 2
My approach to solve this problem is to chain groupby and apply. Below are the implementations of the functions I created to pass to groupby and apply respectively.
def get_year(x):
    """Return the ``year`` attribute of ``x``; passed to ``groupby`` as the key function."""
    year = x.year
    return year
def count(group, t=5):
    """Count the same-sign streaks in ``group['a']`` whose absolute sum reaches ``t``.

    A streak is a maximal run of at least two consecutive values sharing a
    sign (zeros form their own sign class). A streak is counted when the
    absolute value of its sum is greater than or equal to ``t``.

    The streak that ends on the last row is held to the same threshold as
    every other streak; it is not counted unconditionally.

    Parameters:
        group: frame (or mapping) whose 'a' entry is a Series of numbers.
        t: threshold the absolute streak sum must reach.

    Returns:
        The number of qualifying streaks as an int.
    """
    def _sign(value):
        # -1, 0 or 1 without depending on numpy being imported.
        return (value > 0) - (value < 0)

    c = 0           # qualifying streaks seen so far
    run_sum = 0     # sum of the current streak
    run_len = 0     # number of values in the current streak
    run_sign = None
    for value in group['a'].tolist():
        sign = _sign(value)
        if run_len and sign == run_sign:
            run_sum += value
            run_len += 1
            continue
        # The sign changed: settle the streak that just ended.
        if run_len >= 2 and abs(run_sum) >= t:
            c += 1
        run_sum, run_len, run_sign = value, 1, sign
    # Settle the streak that runs to the end of the group.
    if run_len >= 2 and abs(run_sum) >= t:
        c += 1
    return c
>>> by_year = df.groupby(get_year)
>>> by_year.apply(count)
2019 1
2020 2
My question is:
Is there a more "pythonic" implementation of the above count function that produces the desired result but doesn't rely on for loops?

How to remove rows with uncorrect id in pandas dataframe

I'm using python3. I would like to remove incorrect id's from my dataframe column.
Example:
d = {'name': ['a', 'b', 'c', 'd'], 'id': [9356622,9030321,9408530, 1112200]}
df = pd.DataFrame(data=d)
I need to verify each id by multiplying each of its first six digits by a factor from 7 down to 2 (that is, factors 2 to 7 assigned by position from right to left) and summing the products. For example, for id 9356622:
(9×7) + (3×6) + (5×5) + (6×4) + (6×3) + (2×2) = 152. The last digit of this sum is 2, which equals the last digit of the id 9356622, so the id is correct. In general, after performing this calculation I need to compare the last digit of the sum with the last digit of the id.

Input data:
>>> df
name id
0 a 9356622
1 b 9030321
2 c 9408530
3 d 1112200
Explode the id numbers to digits:
df1 = df['id'].astype(str).map(list).apply(pd.Series).astype(int)
>>> df1
0 1 2 3 4 5 6
0 9 3 5 6 6 2 2 # 152 -> modulo(10) = 2 -> True
1 9 0 3 0 3 2 1 # 91 -> modulo(10) = 1 -> True
2 9 4 0 8 5 3 0 # 140 -> modulo(10) = 0 -> True
3 1 1 1 2 2 0 0 # 32 -> modulo(10) = 2 -> False
Now check your math operation:
>>> df1.iloc[:, :6].mul(range(7, 1, -1)).sum(axis=1).mod(10) == df1.iloc[:, 6]
0 True
1 True
2 True
3 False
dtype: bool

def fun_IMO(string):
    """Return True when ``string`` contains a valid 7-digit IMO-style number.

    The first run of seven consecutive digits found in the input is checked:
    its first six digits are multiplied by 7, 6, 5, 4, 3 and 2 (left to
    right), the products are summed, and the last digit of that sum must
    equal the seventh digit.

    Parameters:
        string: text to search; non-string values (e.g. an int id) are
            converted with ``str`` first.

    Returns:
        True if the check digit matches, False if it does not or if no
        seven-digit run is present. A bool is always returned; failures are
        never reported as (truthy) exception objects.
    """
    match = re.search(r"[0-9]{7}", str(string))
    if match is None:
        return False
    digits = [int(ch) for ch in match.group()]
    weighted_sum = sum(d * w for d, w in zip(digits[:6], range(7, 1, -1)))
    # Compare against the seventh (check) digit, not the first one.
    return weighted_sum % 10 == digits[6]

Generate combinations for a comma separated strings in a pandas row

I have a dataframe like this:
ID, Values
1 10, 11, 12, 13
2 14
3 15, 16, 17, 18
I want to create a new dataframe like this:
ID COl1 Col2
1 10 11
1 11 12
1 12 13
2 14
3 15 16
3 16 17
3 17 18
Please help me in how to do this???
Note: The rows in Values column of input df are str type.

Use a list comprehension with flattening, together with one small change to the linked chunks function (`if i > 0:` becomes `if i == 2:`) so that values with a single element are handled correctly:
from collections import deque
#https://stackoverflow.com/a/36586925
def chunks(iterable, chunk_size=2, overlap=1):
    """Yield overlapping tuples of ``chunk_size`` consecutive items.

    Consecutive chunks share ``overlap`` items. With the defaults, an input
    holding a single item yields that item as a 1-tuple, and leftovers after
    the last full chunk are not yielded.

    Raises:
        Exception: if ``chunk_size`` is below 1 or ``overlap`` is not
            smaller than ``chunk_size`` (raised on first iteration).
    """
    if chunk_size < 1:
        raise Exception("chunk size too small")
    if overlap >= chunk_size:
        raise Exception("overlap too large")
    # A bounded deque drops its oldest item automatically on append,
    # which is what slides the window forward.
    window = deque(maxlen=chunk_size)
    source = iter(iterable)
    advance = chunk_size - overlap
    step = 0
    try:
        # Fill the first window.
        for step in range(chunk_size):
            window.append(next(source))
        while True:
            yield tuple(window)
            # Pull in just enough new items for the next window.
            for step in range(advance):
                window.append(next(source))
    except StopIteration:
        # ``step`` is the loop position at which the source ran dry; the
        # partial window is emitted only when step + overlap is exactly 2.
        step += overlap
        if step == 2:
            yield tuple(window)[-step:]
L = [[x] + list(z) for x, y in zip(df['ID'], df['Values']) for z in (chunks(y.split(', ')))]
df = pd.DataFrame(L, columns=['ID','Col1','Col2']).fillna('')
print (df)
ID Col1 Col2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18

I tried a slightly different approach: I created a function that returns the numbers in consecutive pairs from the initial comma-separated string.
def pairup(mystring):
    """Return the consecutive pairs of items in a comma-separated string.

    Items are split on ',' and stripped of surrounding whitespace, so
    "10, 11, 12" gives [['10', '11'], ['11', '12']]. A string with a single
    item gives a list holding one single-item list, e.g. "14" -> [['14']].
    """
    items = [part.strip() for part in mystring.split(',')]
    if len(items) == 1:
        return [items]
    # Pair each item with its successor; zip stops before running off the
    # end, so no exception handling is needed.
    return [[left, right] for left, right in zip(items, items[1:])]
Now let's create the new data frame.
# https://stackoverflow.com/a/39955283/3679377
new_df = df[['ID']].join(
df.Values.apply(lambda x: pd.Series(pairup(x)))
.stack()
.apply(lambda x: pd.Series(x))
.fillna("")
.reset_index(level=1, drop=True),
how='left').reset_index(drop=True)
new_df.columns = ['ID', 'Col 1', 'Col 2']
Here's the output of print(new_df).
ID Col 1 Col 2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18

Find longest run of consecutive zeros for each user in dataframe

I'm looking to find the max run of consecutive zeros in a DataFrame with the result grouped by user. I'm interested in running the RLE on usage.
sample input:
user--day--usage
A-----1------0
A-----2------0
A-----3------1
B-----1------0
B-----2------1
B-----3------0
Desired output
user---longest_run
a - - - - 2
b - - - - 1
# Sort by user and then by day so each user's usage values are in day order.
mydata <- mydata[order(mydata$user, mydata$day),]
user <- unique(mydata$user)
# Result table: one row per user; longest_no_usage is filled in by the loop.
d2 <- data.frame(matrix(NA, ncol = 2, nrow = length(user)))
names(d2) <- c("user", "longest_no_usage")
d2$user <- user
for (i in user) {
if (0 %in% mydata$usage[mydata$user == i]) {
run <- rle(mydata$usage[mydata$user == i]) #Run Length Encoding
# Longest run among those whose value is 0. `run$length` reaches the
# `lengths` component of the rle result through `$` partial matching.
d2$longest_no_usage[d2$user == i] <- max(run$length[run$values == 0])
} else {
d2$longest_no_usage[d2$user == i] <- 0 #some users did not have no-usage days
}
}
# Order users by their longest zero-usage run, largest first.
d2 <- d2[order(-d2$longest_no_usage),]
This works in R, but I want to do the same thing in Python and I'm totally stumped.

First use groupby with size, grouping by the user and usage columns plus a helper Series that labels each run of consecutive equal values:
print (df)
user day usage
0 A 1 0
1 A 2 0
2 A 3 1
3 B 1 0
4 B 2 1
5 B 3 0
6 C 1 1
df1 = (df.groupby([df['user'],
df['usage'].rename('val'),
df['usage'].ne(df['usage'].shift()).cumsum()])
.size()
.to_frame(name='longest_run'))
print (df1)
longest_run
user val usage
A 0 1 2
1 2 1
B 0 3 1
5 1
1 4 1
C 1 6 1
Then keep only the rows for zero usage, take the maximum per user, and reindex to add back the users that have no zero runs:
# Longest zero run per user. Reductions no longer accept ``level=`` in
# pandas 2.0, so the per-user maximum is taken with groupby(level=0).
df2 = (df1.query('val == 0')
          .groupby(level=0)
          .max()
          # Users without any zero run are missing here; add them with 0.
          .reindex(df['user'].unique(), fill_value=0)
          .reset_index())
print (df2)
user longest_run
0 A 2
1 B 1
2 C 0
Detail:
print (df['usage'].ne(df['usage'].shift()).cumsum())
0 1
1 1
2 2
3 3
4 4
5 5
6 6
Name: usage, dtype: int32

get max number of consecutive zeros on series:
def max0(sr):
    """Return the length of the longest run of consecutive zeros in ``sr``.

    Parameters:
        sr: pandas Series of numbers.

    Returns:
        The longest zero-run length as an int; 0 when the Series is empty
        or contains no zeros.
    """
    is_zero = sr == 0
    if not is_zero.any():
        return 0
    # Every non-zero value starts a new label, so the zeros that follow it
    # (or precede the first non-zero value) share one label per run.
    run_id = (~is_zero).cumsum().to_numpy()
    # Summing the boolean flag per label counts only the zeros of each run,
    # so leading zeros need no special case and ties cannot skew the result.
    return int(is_zero.groupby(run_id).sum().max())
max0(pd.Series([1,0,0,0,0,2,3]))
4

I think the following does what you are looking for, where the consecutive_zero function is an adaptation of the top answer here.
Hope this helps!
import pandas as pd
from itertools import groupby
df = pd.DataFrame([['A', 1], ['A', 0], ['A', 0], ['B', 0],['B',1],['C',2]],
columns=["user", "usage"])
def len_iter(items):
    """Count the elements of an iterable by consuming it."""
    total = 0
    for _ in items:
        total += 1
    return total
def consecutive_zero(data):
    """Return the length of the longest run of consecutive zeros in ``data``.

    Returns 0 when ``data`` contains no zero.
    """
    # groupby yields one (value, run) pair per run of equal adjacent values.
    zero_runs = [len_iter(run) for val, run in groupby(data) if val == 0]
    if not zero_runs:
        return 0
    return max(zero_runs)
df.groupby('user').apply(lambda x: consecutive_zero(x['usage']))
Output:
user
A 2
B 1
C 0
dtype: int64

If you have a large dataset and speed is essential, you might want to try the high-performance pyrle library.
Setup:
# pip install pyrle
# or
# conda install -c bioconda pyrle
import numpy as np
np.random.seed(0)
import pandas as pd
from pyrle import Rle
size = int(1e7)
number = np.random.randint(2, size=size)
user = np.random.randint(5, size=size)
df = pd.DataFrame({"User": np.sort(user), "Number": number})
df
# User Number
# 0 0 0
# 1 0 1
# 2 0 1
# 3 0 0
# 4 0 1
# ... ... ...
# 9999995 4 1
# 9999996 4 1
# 9999997 4 0
# 9999998 4 0
# 9999999 4 1
#
# [10000000 rows x 2 columns]
Execution:
for u, udf in df.groupby("User"):
r = Rle(udf.Number)
is_0 = r.values == 0
print("User", u, "Max", np.max(r.runs[is_0]))
# (Wall time: 1.41 s)
# User 0 Max 20
# User 1 Max 23
# User 2 Max 20
# User 3 Max 22
# User 4 Max 23

Binning values into groups with a minimum size using pandas

I'm trying to bin a sample of observations into n discrete groups, then combine these groups until each subgroup has a minimum of 6 members. So far, I've generated bins, and grouped my DataFrame into them:
# df is a DataFrame containing 135 measurments
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
grp.size()
1 4
2 1
3 2
4 3
5 2
6 8
7 7
8 6
9 19
10 12
11 13
12 12
13 7
14 12
15 12
16 2
17 3
18 6
19 3
21 1
So I can see that I need to combine groups 1 - 3, 3 - 5, and 16 - 21, while leaving the others intact, but I don't know how to do this programmatically.

You can do this:
# 135 random heights in 1..200. randint's upper bound is exclusive, hence
# 201; it replaces the deprecated np.random.random_integers(1, 200, 135).
df = pd.DataFrame(np.random.randint(1, 201, 135), columns=['heights'])
# 21 equally spaced edges between the smallest and the largest height.
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
sizes = grp.size()
def f(vals, max):
    """Yield, for every value, the number of the group it is assigned to.

    Values are accumulated into the current group (numbered from 1). When
    adding a value pushes the running total above ``max``, the group number
    is advanced and the running total restarts from that value.
    """
    running = 0
    label = 1
    for v in vals:
        running += v
        if running > max:
            # This value does not fit: it opens the next group.
            label += 1
            running = v
        yield label
#I've changed 6 by 30 for the example cause I don't have your original dataset
grp.size().groupby([g for g in f(sizes, 30)])
If you print grp.size().groupby([g for g in f(sizes, 30)]).cumsum(), you will see that the cumulative sums are grouped as expected.
Also if you want to group the original values you can do something like:
dat = np.random.random_integers(0,200,135)
dat = np.array([78,116,146,111,147,78,14,91,196,92,163,144,107,182,58,89,77,134,
83,126,94,70,121,175,174,88,90,42,93,131,91,175,135,8,142,166,
1,112,25,34,119,13,95,182,178,200,97,8,60,189,49,94,191,81,
56,131,30,107,16,48,58,65,78,8,0,11,45,179,151,130,35,64,
143,33,49,25,139,20,53,55,20,3,63,119,153,14,81,93,62,162,
46,29,84,4,186,66,90,174,55,48,172,83,173,167,66,4,197,175,
184,20,23,161,70,153,173,127,51,186,114,27,177,96,93,105,169,158,
83,155,161,29,197,143,122,72,60])
df = pd.DataFrame({'heights':dat})
bins = np.digitize(dat,np.linspace(0,200,21))
grp = df.heights.groupby(bins)
m = 15  # you should put 6 here, the minimum
s = 0   # number of rows accumulated in the current merged group
c = 1   # label of the current merged group
def f(x):
    """Label every row of bin ``x`` with the current merged-group number.

    Relies on the module-level counters ``c`` and ``s``: the bin's size is
    added to ``s``, and once ``s`` exceeds ``m`` the counter is reset and
    ``c`` is advanced so that the next bin starts a new merged group.
    Returns a Series of labels aligned with ``x``'s index.
    """
    global c, s
    res = pd.Series([c] * x.size, index=x.index)
    s += x.size
    if s > m:
        s = 0
        c += 1
    return res
g = grp.apply(f)
print(df.groupby(g).size())
#another way of doing the same, just a matter of taste
m = 15  # you should put 6 here, the minimum
s = 0   # number of rows accumulated in the current merged group
c = 1   # label of the current merged group
def f2(x):
    """Return the current merged-group label once per row of bin ``x``.

    Uses the module-level counters ``c`` and ``s`` in the same way as ``f``;
    the only difference is that a plain list is returned, which is why it
    is applied with ``transform`` rather than ``apply``.
    """
    global c, s
    res = [c] * x.size  # here is the main difference with f
    s += x.size
    if s > m:
        s = 0
        c += 1
    return res
g = grp.transform(f2)  # call it this way
print(df.groupby(g).size())

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

pairs of rows with the highest string similarity - python

Related

How to count the number of times in a pandas df that the sum of consecutive values crosses a threshold?

How to remove rows with uncorrect id in pandas dataframe

Generate combinations for a comma separated strings in a pandas row

Find longest run of consecutive zeros for each user in dataframe

Binning values into groups with a minimum size using pandas

Categories

Resources