Feature extraction from the training data - python

I have training data like the example below, with all the information in a single column. The data set has over 300,000 rows.
id features label
1 name=John Matthew;age=25;1.=Post Graduate;2.=Football Player; 1
2 name=Mark clark;age=21;1.=Under Graduate;Interest=Video Games; 1
3 name=David;age=12;1:=High School;2:=Cricketer;native=america; 2
4 name=George;age=11;1:=High School;2:=Carpenter;married=yes 2
.
.
300000 name=Kevin;age=16;1:=High School;2:=Driver;Smoker=No 3
Now I need to convert this training data to the format below:
id name age 1 2 Interest married Smoker
1 John Matthew 25 Post Graduate Football Player Nan Nan Nan
2 Mark clark 21 Under Graduate Nan Video Games Nan Nan
.
.
Is there an efficient way to do this? I tried the code below, but it took 3 hours to complete:
# Getting the proper features from the features column
cols = {}
for choices in set_label:
    collection_list = []
    array = train["features"][train["label"] == choices].values
    for i in range(1, len(array)):
        var_split = array[i].split(";")
        try:
            d = dict(s.split('=') for s in var_split)
            for x in d.keys():
                collection_list.append(x)
        except ValueError:
            Error = ValueError
    count = Counter(collection_list)
    for k, v in count.most_common(5):
        key = k.replace(":", "").replace(" ", "_").lower()
        cols[key] = v
columns_add = list(cols.keys())
train = train.reindex(columns=np.append(train.columns.values, columns_add))
print(train.columns)
print(train.shape)

# Adding the values for the newly created columns
for row in train.itertuples():
    new_dict = {}
    value = train.loc[row.Index, 'features']
    v_split = value.split(";")
    try:
        dummy_dict = dict(s.split('=') for s in v_split)
        for k, v in dummy_dict.items():
            new_key = k.replace(":", "").replace(" ", "_").lower()
            new_dict[new_key] = v
    except ValueError:
        Error = ValueError
    for k, v in new_dict.items():
        if k in train.columns:
            train.loc[row.Index, k] = v
Is there any useful function that I can apply here for efficient feature extraction?

First, create two DataFrames meeting your criteria: in the first one all the features are the same for every data point, and the second one is a modification of the first, introducing different features for some data points:
import pandas as pd
import numpy as np
import random
import time
import itertools

# Create a DataFrame where all the keys for each data point in the "features" column are the same.
num = 300000
NAMES = ['John', 'Mark', 'David', 'George', 'Kevin']
AGES = [25, 21, 12, 11, 16]
FEATURES1 = ['Post Graduate', 'Under Graduate', 'High School']
FEATURES2 = ['Football Player', 'Cricketer', 'Carpenter', 'Driver']
LABELS = [1, 2, 3]

df = pd.DataFrame()
df.loc[:num, 0] = ["name={0};age={1};feature1={2};feature2={3}"
                   .format(NAMES[np.random.randint(0, len(NAMES))],
                           AGES[np.random.randint(0, len(AGES))],
                           FEATURES1[np.random.randint(0, len(FEATURES1))],
                           FEATURES2[np.random.randint(0, len(FEATURES2))])
                   for i in range(num)]
df['label'] = [LABELS[np.random.randint(0, len(LABELS))] for i in range(num)]
df.rename(columns={0: "features"}, inplace=True)
print(df.head(20))

# Create a modified sample DataFrame from the previous one, where not all the keys
# are the same for each data point. Note the .copy(): a plain assignment would
# alias df and corrupt the input of Solution 1.
mod_df = df.copy()
random_positions1 = random.sample(range(10), 5)
random_positions2 = random.sample(range(11, 20), 5)
INTERESTS = ['Basketball', 'Golf', 'Rugby']
SMOKING = ['Yes', 'No']
mod_df.loc[random_positions1, 'features'] = ["name={0};age={1};interest={2}"
                                             .format(NAMES[np.random.randint(0, len(NAMES))],
                                                     AGES[np.random.randint(0, len(AGES))],
                                                     INTERESTS[np.random.randint(0, len(INTERESTS))])
                                             for i in range(len(random_positions1))]
mod_df.loc[random_positions2, 'features'] = ["name={0};age={1};smoking={2}"
                                             .format(NAMES[np.random.randint(0, len(NAMES))],
                                                     AGES[np.random.randint(0, len(AGES))],
                                                     SMOKING[np.random.randint(0, len(SMOKING))])
                                             for i in range(len(random_positions2))]
print(mod_df.head(20))
Assume that your original data is stored in a DataFrame called df.
Solution 1 (all the features are the same for every data point).
def func2(y):
    lista = y.split('=')
    value = lista[1]
    return value

def function(x):
    lista = x.split(';')
    array = [func2(i) for i in lista]
    return array

# Measure the execution time
start = time.time()
array = pd.Series(df.features.apply(function)).tolist()
new_df = df.from_records(array, columns=['name', 'age', '1', '2'])
end = time.time()
new_df
print('Total time:', end - start)
Total time: 1.80923295021
Edit: the only thing you need to do is adjust the columns list accordingly.
Solution 2 (The features might be the same or different for every data point).
import pandas as pd
import numpy as np
import time
import itertools

# The following functions extract the keys from each row; the keys are going to be used as columns.
def extract_key(x):
    return x.split('=')[0]

def def_columns(x):
    lista = x.split(';')
    keys = [extract_key(i) for i in lista]
    return keys

df = mod_df
columns = pd.Series(df.features.apply(def_columns)).tolist()
flattened_columns = list(itertools.chain(*columns))
flattened_columns = np.unique(np.array(flattened_columns)).tolist()
flattened_columns

# This function turns each row from the original dataframe into a dictionary.
def function(x):
    lista = x.split(';')
    dict_ = {}
    for i in lista:
        key, val = i.split('=')
        dict_[key] = val
    return dict_

arr = pd.Series(df.features.apply(function)).tolist()
pd.DataFrame(arr)
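If you also want the label column back next to the extracted features, a small follow-up (just a sketch, using the df and arr objects defined above) is to concatenate the two frames:

# Build the wide frame from the list of per-row dictionaries, then glue the
# original columns back on; rows missing a key get NaN automatically.
wide = pd.DataFrame(arr)
result = pd.concat([df.drop(columns=['features']), wide], axis=1)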

Suppose your data is like this:
features = ["name=John Matthew;age=25;1:=Post Graduate;2:=Football Player;",
            "name=Mark clark;age=21;1:=Under Graduate;2:=Football Player;",
            "name=David;age=12;1:=High School;2:=Cricketer;",
            "name=George;age=11;1:=High School;2:=Carpenter;",
            "name=Kevin;age=16;1:=High School;2:=Driver; "]
df = pd.DataFrame({'features': features})
I will start from this answer and try to replace all the separators (name=, ;age=, ;1:=, ;2:=) with ;, using this function:
def replace_feature(x):
    for r in (("name=", ";"), (";age=", ";"), (";1:=", ";"), (";2:=", ";")):
        x = x.replace(*r)
    x = x.split(';')
    return x

df = df.assign(features=df.features.apply(replace_feature))
After applying that function to your df, all the values will be lists of features, where you can get each element by index.
Then I use four custom functions to get each attribute: name, age, grade, job.
Note: there may be a better way to do this using only one function; see the sketch at the end of this answer.
def get_name(df):
    return df['features'][1]

def get_age(df):
    return df['features'][2]

def get_grade(df):
    return df['features'][3]

def get_job(df):
    return df['features'][4]
And finally apply those functions to your dataframe:
df = df.assign(name=df.apply(get_name, axis=1),
               age=df.apply(get_age, axis=1),
               grade=df.apply(get_grade, axis=1),
               job=df.apply(get_job, axis=1))
Hope this will be quick and fast
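The single-function variant hinted at in the note above could look like this (a sketch, assuming df['features'] already holds the lists produced by replace_feature; positions 1-4 carry the values because the leading replacement leaves an empty first element):

# Fan the list elements out to columns in one shot instead of four apply calls.
parts = pd.DataFrame(df['features'].tolist(), index=df.index)
df[['name', 'age', 'grade', 'job']] = parts.iloc[:, 1:5].to_numpy()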

As far as I understand your code, the poor performance comes from the fact that you fill the dataframe element by element. It's better to create the whole dataframe at once with a list of dictionaries.
Let's recreate your input dataframe:
from io import StringIO  # on Python 2: from StringIO import StringIO

data = StringIO("""id features label
1 name=John Matthew;age=25;1.=Post Graduate;2.=Football Player; 1
2 name=Mark clark;age=21;1.=Under Graduate;2.=Football Player; 1
3 name=David;age=12;1:=High School;2:=Cricketer; 2
4 name=George;age=11;1:=High School;2:=Carpenter; 2""")
df = pd.read_table(data, sep=r'\s{3,}', engine='python')
We can check:
print(df)
   id                                           features  label
0   1  name=John Matthew;age=25;1.=Post Graduate;2.=F...      1
1   2  name=Mark clark;age=21;1.=Under Graduate;2.=Fo...      1
2   3     name=David;age=12;1:=High School;2:=Cricketer;      2
3   4    name=George;age=11;1:=High School;2:=Carpenter;      2
Now we can create the needed list of dictionaries with the following code:
feat = []
for line in df['features']:
    line = line.replace(':', '.')
    lsp = line.split(';')[:-1]
    feat.append(dict([elt.split('=') for elt in lsp]))
And the resulting dataframe :
print(pd.DataFrame(feat))
1. 2. age name
0 Post Graduate Football Player 25 John Matthew
1 Under Graduate Football Player 21 Mark clark
2 High School Cricketer 12 David
3 High School Carpenter 11 George
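If you need the id and label columns alongside the extracted features, a one-line follow-up (a sketch using the df and feat objects above) is:

# Column-wise concatenation; both frames share the default integer index.
full = pd.concat([df[['id', 'label']], pd.DataFrame(feat)], axis=1)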

Related

Python script to sum values according to conditions in a loop

I need to sum the values contained in a column (column 9) when a condition is satisfied: the rows must refer to the same pair of individuals (column 1 and column 3), whether the pair is repeated or not.
My input file is made this way:
Sindhi_HGDP00171 0 Tunisian_39T 0 1 120437718 147097266 3.02 7.111
Sindhi_HGDP00183 1 Sindhi_HGDP00206 2 1 242708729 244766624 7.41 3.468
Sindhi_HGDP00183 1 Sindhi_HGDP00206 2 1 242708729 244766624 7.41 4.468
IBS_HG01768 2 Moroccan_MRA46 1 1 34186193 36027711 30.46 3.108
IBS_HG01710 1 Sardinian_HGDP01065 2 1 246117191 249120684 7.53 3.258
IBS_HG01768 2 Moroccan_MRA46 2 1 34186193 37320967 43.4 4.418
Therefore, for instance, I need the column 9 values summed for each pair. Some of these pairs appear multiple times; in this case I need the sum of the column 9 values between IBS_HG01768 and Moroccan_MRA46, and the sum of the values between Sindhi_HGDP00183 and Sindhi_HGDP00206. Some of these pairs are not repeated, but I still need them to appear in the final results.
What I have managed so far is to sum by group (population), so I sum the column 9 values by pair of populations, like Sindhi and Tunisian for instance. I need to do the sum by pairs of individuals.
My script is this:
import pandas as pd
import numpy as np
import itertools

# define column names
cols = ['ID1', 'HAP1', 'ID2', 'HAP2', 'CHR', 'STARTPOS', 'ENDPOS', 'LOD', 'IBDLENGTH']
# load data (the file needs to be in the same folder as the script)
data = pd.read_csv("./Roma_Ref_All_sorted.txt", sep='\t', names=cols)
# remove the sample ID from the ID1/ID2 columns and place it in two dedicated columns
data[['ID1', 'ID1_samples']] = data['ID1'].str.split('_', expand=True)
data[['ID2', 'ID2_samples']] = data['ID2'].str.split('_', expand=True)
# get the list of groups from both ID columns...
groups_id1 = list(data.ID1.unique())
groups_id2 = list(data.ID2.unique())
groups = list(set(groups_id1 + groups_id2))
# ... and all the possible pairs
group_pairs = [i for i in itertools.combinations(groups, 2)]
# subset the pairs containing Roma
group_pairs_roma = [x for x in group_pairs if ('Roma' in x[0] and x[0] != 'Romanian') or
                                              ('Roma' in x[1] and x[1] != 'Romanian')]
# prepare the output df
result = pd.DataFrame(columns=['ID1', 'ID2', 'IBD_sum'])
# loop over all the possible pairs and compute the sum of IBD length
for idx, group_pair in enumerate(group_pairs_roma):
    id1 = group_pair[0]
    id2 = group_pair[1]
    ibd_sum = round(data.loc[((data['ID1'] == id1) & (data['ID2'] == id2)) |
                             ((data['ID1'] == id2) & (data['ID2'] == id1)), 'IBDLENGTH'].sum(), 3)
    result.loc[idx, ['ID1', 'ID2', 'IBD_sum']] = [id1, id2, ibd_sum]
# save the results
result.to_csv("./groups_pairs_sum_IBD.txt", sep='\t', index=False)
My current output is something like this:
ID1 ID2 IBD_sum
Sindhi IBS 3.275
Sindhi Moroccan 74.201
Sindhi Sindhi 119.359
While I need something like:
ID1 ID2 IBD_sum
Sindhi_individual1 Moroccan_individual1 3.275
Sindhi_individual2 Moroccan_individual2 5.275
Sindhi_individual3 IBS_individual1 4.275
I have tried substituting a couple of lines in my code, writing
groups_id1 = list(data.ID1_samples.unique())
groups_id2 = list(data.ID2_samples.unique())
and later
ibd_sum = round(data.loc[((data['ID1_samples'] == id1) & (data['ID2_samples'] == id2)) |
                         ((data['ID1_samples'] == id2) & (data['ID2_samples'] == id1)), 'IBDLENGTH'].sum(), 3)
Which in theory should work because I set the individuals as pairs instead of populations as pairs, but the output was empty. What could I do to edit the code for what I need?
I have solved the problem on my own, but using R.
This is the code:
library(dplyr)

ibd <- read.delim("input.txt", sep = '\t')
ibd_sum_indv <- ibd %>%
  group_by(ID1, ID2) %>%
  summarise(SIBD = sum(IBDLENGTH),
            NIBD = n()) %>%
  ungroup()
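For the record, the same aggregation is possible in pandas. Here is a sketch (assuming the data frame from the question, where ID1/ID1_samples etc. come from the split): rebuild the full individual IDs, give each pair a canonical order so (A, B) and (B, A) land in the same group, then group once instead of looping over combinations.

import numpy as np
import pandas as pd

# Rebuild full individual IDs, sort the two ID columns within each row,
# then group on the canonical pair.
ind = pd.DataFrame({'ID1': data['ID1'] + '_' + data['ID1_samples'],
                    'ID2': data['ID2'] + '_' + data['ID2_samples'],
                    'IBDLENGTH': data['IBDLENGTH']})
pair = np.sort(ind[['ID1', 'ID2']].to_numpy(), axis=1)
out = (ind.assign(ID1=pair[:, 0], ID2=pair[:, 1])
          .groupby(['ID1', 'ID2'], as_index=False)
          .agg(SIBD=('IBDLENGTH', 'sum'), NIBD=('IBDLENGTH', 'count')))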

pandas Optimizing many loops into one

I have multiple dfs with the same columns. Here is the list of all of them:
dfs = [df_14, df_15, df_16, df_17]
Every dataframe looks like this; for example, df_14:
 id  Days
001     0
004    56
013    95
015    33
Next, df_15:
 Id  Days
001     0
023    18
459    19
811    35
df_16:
 Id  Days
111    93
114    56
232     0
df_17:
 Id  Days
532   120
113    31
065    58
015     2
My code:
rows = [['532', 120], ['113', 31], ['065', 58], ['025', 2]]
for row in rows:
    df_14.loc[len(df_14)] = row
# and so on
The task is to build, for each month, one list with the ids which have 30-60 days and another separate list with the ids of clients which have 60-100 days.
The result should be like this:
14_1: ['004', '015']
14_2: ['013']
15_1: ['811']
I tried to use f-strings for it. Something like:
abbreviations = ['14', '15', '16', '17']
c = ['_1', '_2']
# I have written initializing loops like
m_list = []
for a in abbreviations:
    for cp in c:
        m_list.append(a + cp)
The idea is to use the abbreviations in the loops with an f-string or format, but I don't know how to do it. Or can you offer other ideas?
This can help you:
import pandas as pd

data = {'df_jan': [['001', 0], ['004', 56], ['013', 95], ['015', 33]],
        'df_feb': [['001', 0], ['023', 18], ['459', 19], ['811', 35]],
        'df_mar': [['111', 93], ['114', 56], ['232', 0]],
        'df_apr': [['532', 120], ['113', 31], ['065', 58], ['025', 2]]}

dfs = {}
for df in data:
    dfs[df] = pd.DataFrame(data[df], columns=['id', 'days'])

months = {}
for df in dfs:
    months[df.replace('df_', '') + '_30'] = dfs[df][(dfs[df].days >= 30) & (dfs[df].days <= 60)].id.to_list()
    months[df.replace('df_', '') + '_90'] = dfs[df][(dfs[df].days >= 90) & (dfs[df].days <= 120)].id.to_list()

months
{'jan_30': ['004', '015'],
'jan_90': ['013'],
'feb_30': ['811'],
'feb_90': [],
'mar_30': ['114'],
'mar_90': ['111'],
'apr_30': ['113', '065'],
'apr_90': ['532']}
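As a side note (my addition, not part of the answer above): the same loop collapses into a dictionary comprehension, which also shows where the f-string from the question fits in:

# Same result as the loop above; the f-string builds keys like 'jan_30', 'jan_90'.
months = {f"{name.replace('df_', '')}_{lo}": frame.loc[frame.days.between(lo, hi), 'id'].to_list()
          for name, frame in dfs.items()
          for lo, hi in [(30, 60), (90, 120)]}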
In response to your comment:
I created the dfs inside a dictionary to simplify the creation of the test data.
Your code can create the dfs in its own way ...
df_jan = ...
df_feb = ...
df_mar = ...
df_apr = ...
and to process them you create the dictionary ...
dfs = {
    'df_jan': df_jan,
    'df_feb': df_feb,
    'df_mar': df_mar,
    'df_apr': df_apr
}
Then run the loop above, assign the results to your own variables, and delete the dictionaries:
jan_30 = months['jan_30']
jan_90 = months['jan_90']
feb_30 = months['feb_30']
feb_90 = months['feb_90']
mar_30 = months['mar_30']
mar_90 = months['mar_90']
apr_30 = months['apr_30']
apr_90 = months['apr_90']
del dfs, months
# First, create a list containing all the dataframes
all_df = [df_jan, df_feb, df_mar, df_apr, df_may, df_jun,
          df_jul, df_aug, df_sep, df_oct, df_nov, df_dec]
# Create 2 lists for storing the id values of the 30-60 range and the 90-120 range
list_30, list_90 = [], []
# One nested for loop handles all the data frames
for cur_df in all_df:
    for id, days in zip(cur_df['Id'], cur_df['Days']):
        if 30 <= days <= 60:
            list_30.append(id)
        elif 90 <= days <= 120:
            list_90.append(id)
# Now list_30 and list_90 contain the corresponding id values in those ranges
Hope the answer helps :)
Since you didn't provide data I made a basic example, and it worked for me; so here is a single for-loop as you described:
import pandas as pd

dfs = [df_jan, df_feb, df_mar, df_apr, df_may, df_jun,
       df_jul, df_aug, df_sep, df_oct, df_nov, df_dec]
df30 = []
df90 = []
dfsChained30 = []
dfsChained90 = []
for rowsForMonths, monthDf in enumerate(dfs):
    # If January [don't consider the chain];
    if rowsForMonths == 0:
        for id_, days in zip(monthDf['Id'], monthDf['Days']):
            if days in range(30, 61):
                df30.append(id_)
            elif days in range(90, 121):
                df90.append(id_)
        dfsChained30.append(df30.copy())
        dfsChained90.append(df90.copy())
    # If not January [consider the chain, i.e. skip ids already collected];
    else:
        seen30 = {i for chunk in dfsChained30 for i in chunk}
        seen90 = {i for chunk in dfsChained90 for i in chunk}
        for id_, days in zip(monthDf['Id'], monthDf['Days']):
            if days in range(30, 61) and id_ not in seen30:
                df30.append(id_)
            elif days in range(90, 121) and id_ not in seen90:
                df90.append(id_)
        dfsChained30.append(df30.copy())
        dfsChained90.append(df90.copy())

How to assign values based on an interval in Pandas

I am trying to assign a value to a dataframe column based on a value that falls IN BETWEEN two values of another dataframe:
intervals = pd.DataFrame(columns=['From', 'To', 'Value'],
                         data=[[0, 100, 'A'], [100, 200, 'B'], [200, 500, 'C']])
print('intervals\n', intervals, '\n')

points = pd.DataFrame(columns=['Point', 'Value'],
                      data=[[45, 'X'], [125, 'X'], [145, 'X'], [345, 'X']])
print('points\n', points, '\n')

DesiredResult = pd.DataFrame(columns=['Point', 'Value'],
                             data=[[45, 'A'], [125, 'B'], [145, 'B'], [345, 'C']])
print('DesiredResult\n', DesiredResult, '\n')
Many thanks
Let's use map; first create a Series using pd.IntervalIndex with the from_arrays method:
intervals = intervals.set_index(pd.IntervalIndex.from_arrays(intervals['From'],
                                                             intervals['To']))['Value']
points['Value'] = points['Point'].map(intervals)
Output:
Point Value
0 45 A
1 125 B
2 145 B
3 345 C
Another approach:
def calculate_value(x):
    return intervals.loc[(x >= intervals['From']) & (x < intervals['To']), 'Value'].squeeze()

desired_result = points.copy()
desired_result['Value'] = desired_result['Point'].apply(calculate_value)
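Another option worth knowing (my addition, a sketch assuming the original intervals DataFrame and that the edges are contiguous, as they are here) is pd.cut, which buckets the points against the bin edges directly:

# Bin edges [0, 100, 200, 500]; right=False makes each bin [From, To), like the mask above.
bins = intervals['From'].tolist() + [intervals['To'].iloc[-1]]
points['Value'] = pd.cut(points['Point'], bins=bins,
                         labels=intervals['Value'].tolist(), right=False)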

Python: Storing multiple dictionaries after replacing categoricals with integers

My data looks like this:
source browser sex age country class
SEO Chrome M 39 Japan 0
Ads Chrome F 53 United States 0
SEO Opera M 53 United States 1
SEO Safari M 41 NULL 0
Ads Safari M 45 United States 0
Ads Chrome M 18 Canada 0
In trying to get it ready for machine learning, I wrote a function to replace categoricals with integers:
def str2int(data):
    y2 = data
    S = set(y2)  # the unique values
    D = dict(zip(S, range(len(S))))  # assign each string an integer, and put it in a dict
    Y = [D[y2_] for y2_ in y2]  # store class labels as ints
    return Y
I then call it using the below to convert all string columns to integers:
cols = ['sex', 'browser', 'country', 'source']
for col in cols:
    df_fraud[col] = str2int(df_fraud[col])
I would like to store the dictionary associated with each column and use it later. I could simply return Y, D from the function, but I am not sure how I would collect the dictionaries in my for loop below.
Frankly, I am not sure what the best way to store these references in dictionaries is, and I am open to suggestions.
I have simplified the example below; this is not working when using the suggested code. Any ideas?
def str2int(data):
    y2 = data
    S = set(y2)  # the unique values
    D = dict(zip(S, range(len(S))))  # assign each string an integer, and put it in a dict
    Y = [D[y2_] for y2_ in y2]  # store class labels as ints
    return Y, D

def make_str2int(data):
    categories = set(data)
    return dict(zip(categories, range(len(categories))))

raw_data = {
    'names': ['A', 'B', 'B', 'D', 'D', 'E', 'B', 'B', 'E', 'F'],
    'gender': ['M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M']}

str2int = {}
cols = ['names', 'gender']
for col in cols:
    str2int[col] = make_str2int(df_fraud[col])
I haven't tested this, and I'm not sure I understand exactly how you intend to use the dictionaries, but here are my suggestions.
You could store the dictionaries in a dictionary of dictionaries:
def make_str2int(data):
    categories = set(data)
    return dict(zip(categories, range(len(categories))))

str2int = {}
cols = ['sex', 'browser', 'country', 'source']
for col in cols:
    str2int[col] = make_str2int(df_fraud[col])
(Assuming df_fraud represents your table; you didn't make this clear in your question.)
And then, if you want the categories existing in one column col, you can call:
str2int[col].keys()
If you want the corresponding numbers:
str2int[col].values()
If you want the number associated to a categorical value cat_val in a known column col:
str2int[col][cat_val]
Edit: applying this to your raw_data example:
def make_str2int(data):
    categories = set(data)
    return dict(zip(categories, range(len(categories))))

raw_data = {
    'names': ['A', 'B', 'B', 'D', 'D', 'E', 'B', 'B', 'E', 'F'],
    'gender': ['M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'M']}

str2int = {}
cols = raw_data.keys()
for col in cols:
    str2int[col] = make_str2int(raw_data[col])

print("Conversion examples:")
element = raw_data['names'][3]
print("%s -> %s" % (element, str2int['names'][element]))
element = raw_data['gender'][2]
print("%s -> %s" % (element, str2int['gender'][element]))
Output:
Conversion examples:
D -> 3
F -> 1
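For what it's worth (my addition, not from the answer above): pandas ships a built-in for exactly this mapping, pd.factorize, which returns the integer codes and keeps the categories around for decoding later:

import pandas as pd

codes, categories = pd.factorize(raw_data['names'])
# codes -> array([0, 1, 1, 2, 2, 3, 1, 1, 3, 4]); categories keeps first-seen order
int2str = dict(enumerate(categories))  # reverse lookup, e.g. int2str[0] == 'A'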

How to do fuzzy matching on an Excel file using Pandas?

I have a table called account with two columns - ID & NAME. ID is a hash which is unique but NAME is a string which might have duplicates.
I'm trying to write a Python script to read this Excel file and match 0-3 similar NAME values, but I just cannot seem to get it to work.
Could someone help out? Thanks
import pandas as pd
from fuzzywuzzy import fuzz
import difflib

def get_spr(row):
    d = name1.apply(lambda x: (fuzz.ratio(x['NAME'], row['NAME']) * 0 if row['ID'] == x['ID'] else 1), axis=1)
    d = d[d >= 60]
    if len(d) == 0:
        v = [''] * 2
    else:
        v = name1.ix[d.idxmax(), ['ID', 'NAME']].values
    return pd.Series(v, index=['ID', 'NAME'])

def score(tablerow):
    d = name1.apply(lambda x: fuzz.ratio(x['NAME'], tablerow['NAME']) * (0 if x['ID'] == tablerow['ID'] else 1), axis=1)
    d = d[d > 90]
    if len(d) == 0:
        v = [''] * 2
    else:
        v = name1.ix[d.order(ascending=False).head(3).index, ['ID', 'NAME']].values
    return pd.DataFrame(v, index=['ID', 'NAME'])

account = "account_test.xlsx"
xl_acc1 = pd.ExcelFile(account)
xl_acc2 = pd.ExcelFile(account)
acc1 = xl_acc1.parse(xl_acc1.sheet_names[0])
acc2 = xl_acc2.parse(xl_acc2.sheet_names[0])
name1 = acc1[pd.notnull(acc1['NAME'])]
name2 = acc2[pd.notnull(acc2['NAME'])]
print('Doing Fuzzy Matching')
name2 = pd.concat((name2, name2.apply(get_spr, axis=1)), axis=1)
name2.to_excel(pd.ExcelWriter('res.xlsx'), 'acc')
Any help would be much appreciated!
The file has rows like this:
ID NAME
0016F00001c7GDZQA2 Daniela Abriani
0016F00001c7GPnQAM Daniel Abriani
0016F00001c7JRrQAM Nisha Well
0016F00001c7Jv8QAE Katherine
0016F00001c7cXiQAI Katerine
0016F00001c7dA3QAI Katherin
0016F00001c7kHyQAI Nursing and Midwifery Council Research Office
0016F00001c8G8OQAU Nisa Well
Expected (output dataframe) would be something like:
ID NAME ID2 NAME2
<hash1> katherine <hash2> katerine
<hash1> katherine <hash3> katherin
<hash4> Nisa Well <hash5> Nisha Well
Issue: the above code just reproduces the input in the saved output file, without actually adding any matches.
I don't think you need to do this in pandas. Here is my sloppy solution, but it gets your desired output using a dictionary.
import pandas as pd
from fuzzywuzzy import process

df = pd.DataFrame([
    ['0016F00001c7GDZQA2', 'Daniela Abriani'],
    ['0016F00001c7GPnQAM', 'Daniel Abriani'],
    ['0016F00001c7JRrQAM', 'Nisha Well'],
    ['0016F00001c7Jv8QAE', 'Katherine'],
    ['0016F00001c7cXiQAI', 'Katerine'],
    ['0016F00001c7dA3QAI', 'Katherin'],
    ['0016F00001c7kHyQAI', 'Nursing and Midwifery Council Research Office'],
    ['0016F00001c8G8OQAU', 'Nisa Well']],
    columns=['ID', 'NAME'])
Get the unique hashes into a dictionary:
hashdict = dict(zip(df['ID'], df['NAME']))
Define a function checkpair. You'll need it to remove reciprocal hash pairs: the loop below adds both (hash1, hash2) and (hash2, hash1), but I think you only want to keep one of those pairs:
def checkpair(a, b, l):
    for x in l:
        if (a, b) == (x[2], x[0]):
            l.remove(x)
Now iterate through hashdict.items(), finding the top 3 matches along the way. The fuzzywuzzy docs detail the process method.
matches = []
for k, v in hashdict.items():
    # see the docs for extract -- limit=4 because you are comparing a name to itself
    top3 = process.extract(v, hashdict, limit=4)
    # remove the hash ID compared to itself
    for h in top3:
        if k == h[2]:
            top3.remove(h)
    # append tuples to the list "matches" if they meet a score criterion
    [matches.append((k, v, x[2], x[0], x[1])) for x in top3 if x[1] > 60]  # change score?
# remove reciprocal pairs
[checkpair(m[0], m[2], matches) for m in matches]

df = pd.DataFrame(matches, columns=['id1', 'name1', 'id2', 'name2', 'score'])
# write to file
writer = pd.ExcelWriter('/path/to/your/file.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
Output:
id1 name1 id2 name2 score
0 0016F00001c7JRrQAM Nisha Well 0016F00001c8G8OQAU Nisa Well 95
1 0016F00001c7GPnQAM Daniel Abriani 0016F00001c7GDZQA2 Daniela Abriani 97
2 0016F00001c7Jv8QAE Katherine 0016F00001c7dA3QAI Katherin 94
3 0016F00001c7Jv8QAE Katherine 0016F00001c7cXiQAI Katerine 94
4 0016F00001c7dA3QAI Katherin 0016F00001c7cXiQAI Katerine 88
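One closing note (my suggestion, not from the answer above): if fuzzywuzzy is slow on a large sheet, rapidfuzz is a faster, largely API-compatible reimplementation. With a dict, its process.extract also returns (value, score, key) tuples, so the loop above should work unchanged:

# A sketch of the drop-in replacement (assumes `pip install rapidfuzz`).
from rapidfuzz import process, fuzz
top3 = process.extract('Katherine', hashdict, scorer=fuzz.ratio, limit=4)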
