Below is my example code:
from fuzzywuzzy import fuzz
import json
from itertools import zip_longest
synonyms = open("synonyms.json","r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor","solid state","Corei5 :1135G7 (11th
Generation)","hard
drive","ddr 8gb","something1", "something2",
"something3","HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation","corei7:latest technology"]
vendor = []
buyer = []
for item,value in synonyms.items():
for k,k2 in zip_longest(vendor_data,buyer_data):
for v in value:
if fuzz.token_set_ratio(k,v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item+" "+k)
else:
#didnt get only "something" strings here !
if fuzz.token_set_ratio(k2,v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item+" "+k2)
vendor = list(set(vendor))
buyer = list(set(buyer))
vendor,buyer
Note: the "something" strings can be anything, e.g. "battery" or "display".
synonyms json
{
"processor":["corei5","core","corei7","i5","i7","ryzen5","i5 processor","i7
processor","processor i5","processor i7","core generation","core gen"],
"ram":["DDR4","memory","DDR3","DDR","DDR 8gb","DDR 8 gb","DDR 16gb","DDR 16 gb","DDR
32gb","DDR 32 gb","DDR4-"],
"ssd":["solid state drive","solid drive"],
"hdd":["Hard Drive"]
}
what do i need ?
I want to add all "something" string inside vendor list dynamically.
! NOTE -- "something" string can be anything in future.
I want to add "something" string in vendor array which is not a matched value in fuzz>70! I want to basically add left out data also.
for example like below:
current output
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state']
expected output below
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state',
'something1',
'something2',
'something3'] #something string need to be added in vendor list dynamically.
What silly mistake am I making? Thank you.
Here's my attempt:
from fuzzywuzzy import process, fuzz
synonyms = {'processor': ['corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5', 'i5 processor', 'i7 processor', 'processor i5', 'processor i7', 'core generation', 'core gen'], 'ram': ['DDR4', 'memory', 'DDR3', 'DDR', 'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb', 'DDR 32gb', 'DDR 32 gb', 'DDR4-'], 'ssd': ['solid state drive', 'solid drive'], 'hdd': ['Hard Drive']}
vendor_data = ['i7 processor', 'solid state', 'Corei5 :1135G7 (11th Generation)', 'hard drive', 'ddr 8gb', 'something1', 'something2', 'something3', 'HT (100W) DDR4-2400']
buyer_data = ['i7 processor 12 generation', 'corei7:latest technology']
def find_synonym(s: str, min_score: int = 60):
    """Return the synonym category that best matches ``s``, or ``None``.

    The module-level ``synonyms`` mapping is handed to
    ``process.extractBests`` as the choices, with ``min_score`` as the score
    cutoff.

    Args:
        s: The string to categorise.
        min_score: Minimum fuzzy score a candidate needs to be returned.

    Returns:
        The last element of the best result (for mapping choices this is
        the mapping key, i.e. the category name), or ``None`` when no
        candidate reaches ``min_score``.
    """
    results = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
    if not results:
        return None
    # Results are ordered best-first; the last field of a result is the key.
    return results[0][-1]
def process_data(l: list, min_score: int = 60):
    """Split ``l`` into items that match a synonym category and items that don't.

    Each item is first looked up with ``find_synonym``. When a category is
    found, the item is prefixed with it unless the category name already
    occurs in the item. Otherwise the item still counts as a match if any
    category name itself scores at least ``min_score`` against it with
    ``fuzz.partial_ratio``.

    Args:
        l: The strings to categorise.
        min_score: Minimum fuzzy score used for both checks.

    Returns:
        A ``(matches, no_matches)`` tuple of lists, each preserving the
        input order.
    """
    matches = []
    no_matches = []
    for item in l:
        syn = find_synonym(item, min_score=min_score)
        if syn is not None:
            # Only prefix the category when the item does not already contain it.
            matches.append(item if syn in item else f'{syn} {item}')
        elif any(fuzz.partial_ratio(s, item) >= min_score for s in synonyms):
            # One of the category names is already (approximately) in the item string.
            matches.append(item)
        else:
            no_matches.append(item)
    return matches, no_matches
For process_data(vendor_data) we get:
(['i7 processor',
'ssd solid state',
'processor Corei5 :1135G7 (11th Generation)',
'hdd hard drive',
'ram ddr 8gb',
'ram HT (100W) DDR4-2400'],
['something1', 'something2', 'something3'])
And for process_data(buyer_data):
(['i7 processor 12 generation', 'processor corei7:latest technology'], [])
I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns 2 lists: One with matches with words from the synonyms dict and one with items without matches. If you want exactly the output you listed in your question, just concatenate the two lists like this:
matches, no_matches = process_data(vendor_data)
matches + no_matches # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']
I have tried to come up with a decent answer (certainly not the cleanest one)
import json
from itertools import zip_longest
from fuzzywuzzy import fuzz
synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11thGeneration)", "hard drive", "ddr 8gb", "something1",
"something2",
"something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]
vendor = []
buyer = []
for k, k2 in zip_longest(vendor_data, buyer_data):
has_matched = False
for item, value in synonyms.items():
for v in value:
if fuzz.token_set_ratio(k, v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item + " " + k)
if has_matched or k2 is None:
break
else:
has_matched = True
if fuzz.token_set_ratio(k2, v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item + " " + k2)
if has_matched or k is None:
break
else:
has_matched = True
else:
continue # match not found
break # match is found
else: # only evaluates on normal loop end
# Only something strings
# do something with the new input values
continue
vendor = list(set(vendor))
buyer = list(set(buyer))
I hope you can achieve what you want with this code. Check the docs if you don't know what a for/else loop does. TL;DR: the else clause executes when the loop terminates normally (not with a break). Note that I put the synonyms loop inside the data loop. This is because we can't know for certain which synonym group the data belongs to; also, sometimes the vendor data entry is a processor while the buyer data is memory. Also note that I have assumed an item can't match more than once. If it can, you would need a more advanced check (for example, keep a counter and break when the counter equals 2).
EDIT:
I took another look at the question and came up with maybe a better answer:
# Map every vendor spec to the first synonym category whose best fuzzy match
# scores above 70; specs that match no category are labelled "Something new".
v_dict = {}
for spec in vendor_data:
    for item, choices in synonyms.items():
        best = process.extractOne(spec, choices)  # don't forget to import process from fuzzywuzzy
        # extractOne returns None when there is nothing to choose from.
        if best is not None and best[1] > 70:
            v_dict[spec] = item
            break
    else:
        # The inner loop ended without a break: no category matched this spec.
        v_dict[spec] = "Something new"
This code matches the strings to the correct type, for example {'i7 processor': 'processor', 'solid state': 'ssd', 'Corei5 :1135G7 (11thGeneration)': 'processor', 'hard drive': 'ssd', 'ddr 8gb': 'ram', 'something1': 'Something new', 'something2': 'Something new', 'something3': 'Something new', 'HT (100W) DDR4-2400': 'ram'}. You can replace "Something new" with whatever you like. You could also do: v_dict[spec] = 0 (on a match) and v_dict[spec] = 1 (on no match). You could then sort the dict ->
# Order the specs by the value stored for them in v_dict. Looking the value up
# per key gives the same order as pairing keys with values positionally, but
# does not depend on a shared iterator being consumed in step with the sort.
print(sorted(v_dict, key=v_dict.get))
Which would give the wanted results (more or less), all the recognised items will be first, and then all the unrecognised items. You could do some more advanced sorting on this dict if you want. I think this code gives you enough flexibility to reach your goal.
If I understand correctly, what you are trying to do is match keywords specified by a customer and/or vendor against a predefined database of keywords you have.
First, I would highly recommend using a reversed mapping of the synonyms, so it's faster to lookup, especially when the dataset will grow.
Second, considering the fuzzywuzzy API, it looks like you simply want the best match, so extractOne is a solid choice for that.
Now, extractOne returns the best match and a score:
>>> process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
I would split the algorithm into two:
A generic part that simply gets the best match, which should always exist (even if it's not a great one)
A filter, where you could adjust the sensitivity of the algorithm, based on different criteria of your application. This sensitivity threshold should set the minimal match quality. If you're below this threshold, just use "untagged" for the category for example.
Here is the final code, which I think is very simple and easy to understand and expand:
import json
from fuzzywuzzy import process
def load_synonyms(path='synonyms.json'):
    """Load the synonyms JSON file and return it as a reversed lookup table.

    The file maps a category to a list of synonyms; the returned dict maps
    each synonym back to its category. If the same synonym is listed more
    than once, the entry read last wins.

    Args:
        path: Location of the JSON file. Defaults to ``'synonyms.json'`` in
            the current working directory.

    Returns:
        A dict of ``{synonym: category}``.
    """
    with open(path, encoding='utf-8') as fin:
        synonyms = json.load(fin)
    # Reversing the map makes it much easier to look up
    reversed_synonyms = {}
    for key, values in synonyms.items():
        for value in values:
            reversed_synonyms[value] = key
    return reversed_synonyms
def load_vendor_data():
    """Return the hard-coded sample list of vendor keywords."""
    return [
        "i7 processor",
        "solid state",
        "Corei5 :1135G7 (11thGeneration)",
        "hard drive",
        "ddr 8gb",
        "something1",
        "something2",
        "something3",
        "HT (100W) DDR4-2400",
    ]
def load_customer_data():
    """Return the hard-coded sample list of customer keywords."""
    return [
        "i7 processor 12 generation",
        "corei7:latest technology",
    ]
def get_tag(keyword, synonyms, *, threshold=80, default='general'):
    """Return the category for ``keyword``, or ``default`` if nothing fits well.

    The best fuzzy match for ``keyword`` is picked among the keys of
    ``synonyms`` with ``process.extractOne``; its category is returned only
    when the match score is strictly greater than ``threshold``.

    Args:
        keyword: The string to categorise.
        synonyms: Mapping of ``{synonym: category}``.
        threshold: Score the best match has to exceed.
        default: Value returned when there is no sufficiently good match.

    Returns:
        The matched category, or ``default``.
    """
    if not synonyms:
        # Nothing to match against.
        return default
    best = process.extractOne(keyword, synonyms.keys())
    if best is None:
        # extractOne yields None instead of a (match, score) pair when it finds nothing.
        return default
    tag, score = best
    return synonyms[tag] if score > threshold else default
def main():
    """Tag every customer and vendor keyword and print the mapping as JSON."""
    synonyms = load_synonyms()
    customer_data = load_customer_data()
    vendor_data = load_vendor_data()
    data = customer_data + vendor_data
    tags_dict = {keyword: get_tag(keyword, synonyms) for keyword in data}
    print(json.dumps(tags_dict, indent=4))


if __name__ == '__main__':
    main()
When running with the specified inputs, the output is:
{
"i7 processor 12 generation": "processor",
"corei7:latest technology": "processor",
"i7 processor": "processor",
"solid state": "ssd",
"Corei5 :1135G7 (11thGeneration)": "processor",
"hard drive": "hdd",
"ddr 8gb": "ram",
"something1": "general",
"something2": "general",
"something3": "general",
"HT (100W) DDR4-2400": "ram"
}
I have a rule-based code that prints out the Noun which is followed by a verb in a sentence
for text_id, text in enumerate(news_df['news_title'].values):
    # Remove commas, full stops and hyphens
    text = text.replace(',', '').replace('.', '').replace('-', '')
    sentence_tags = POSTAG(text.lower())
    print(text)
    # Sentence parts: print the first run of one to three nouns that is
    # directly followed by a verb, then stop scanning this sentence.
    for index, part in enumerate(sentence_tags):
        try:
            if 'NN' in part[1] and 'VB' in sentence_tags[index + 1][1]:
                print(">", part[0])
                break
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'VB' in sentence_tags[index + 2][1]:
                print(">", part[0], sentence_tags[index + 1][0])
                break
            elif 'NN' in part[1] and 'NN' in sentence_tags[index + 1][1] and 'NN' in sentence_tags[index + 2][1] and 'VB' in sentence_tags[index + 3][1]:
                print(">", part[0], sentence_tags[index + 1][0], sentence_tags[index + 2][0])
                break
        except IndexError:
            # The look-ahead ran past the end of the sentence, so no pattern
            # can start at this position; move on to the next part.
            pass
    print()
The output of a sentence following this rule:
high school football players charged after video surfaces showing hazing
> school football players
trump accuser pushes new york to pass the adult survivors act plans to sue
>trump accuser
Is there a way to also print out the position of that Noun that was printed due to the rule?
for example :
>trump accuser , [0,5,"NN"] , [6,13,"VB"]
I changed the script and separated the state machine segment. The most serious problem with this program IMO is it's just returning the first pattern (you can fix it quickly).
import pandas as pd
import nltk
POSTAG = nltk.pos_tag
df = pd.DataFrame({'text':['high school football players charged after video surfaces showing hazing', 'trump accuser pushes new york to pass the adult survivors act plans to sue']})
for text_id, text in enumerate(df['text'].values):
    # Remove commas, full stops and hyphens
    text = text.replace(',', '').replace('.', '').replace('-', '')
    tokens = nltk.word_tokenize(text.lower())
    sentence_tags = POSTAG(tokens)
    words = [item[0] for item in sentence_tags]
    tags = [item[1] for item in sentence_tags]

    # [start, end] character offsets of every token, computed as if the
    # tokens were joined by exactly one space.
    # NOTE(review): this holds only when the tokenizer splits on single
    # spaces; the offsets refer to the cleaned, lower-cased text.
    start_end = []
    temp = 0
    for word in words:
        start_end.append([temp, temp + len(word)])
        temp += len(word) + 1

    words_to_print = []
    tags_to_print = []
    start_end_to_print = []
    # the state machine: collect a run of consecutive nouns and stop at the
    # first verb that directly follows such a run
    for w, t, se in zip(words, tags, start_end):
        if t.startswith('NN'):
            words_to_print.append(w)
            tags_to_print.append(t)
            start_end_to_print.append(se)
        elif words_to_print:
            if t.startswith('VB'):
                break
            # The noun run was followed by something other than a verb: start over.
            words_to_print = []
            tags_to_print = []
            start_end_to_print = []
    else:
        # The sentence ended without a verb following the nouns, so there is
        # no match to report.
        words_to_print = []
        tags_to_print = []
        start_end_to_print = []
    print('> ', ' '.join(words_to_print), ' '.join([str(item[0])+' '+str(item[1]) for item in zip(start_end_to_print, tags_to_print)]))
output:
> school football players [5, 11] NN [12, 20] NN [21, 28] NNS
> trump accuser [0, 5] NN [6, 13] NN
For the adjective:
"The company's customer service was terrible."
{customer service, terrible}
For the verb:
"They kept increasing my phone bill"
{phone bill, increasing}
This is a follow-up question that branches off from this posting.
However I'm trying to find adj and verbs corresponding to multi-token phrases/compound nouns such as "customer service" using spacy.
I'm not sure how to do this with spacy, nltk, or any other prepackaged natural language processing software, and I'd appreciate any help!
For simple examples like this, you can use spaCy's dependency parsing with a few simple rules.
First, to identify multi-word nouns similar to the examples given, you can use the "compound" dependency. After parsing a document (e.g., sentence) with spaCy, use a token's dep_ attribute to find its dependency.
For example, this sentence has two compound nouns:
"The compound dependency identifies compound nouns."
Each token and its dependency is shown below:
import spacy
import pandas as pd
# Load the model by its package name; the bare 'en' shortcut is not
# available in spaCy v3.
nlp = spacy.load('en_core_web_sm')
example_doc = nlp("The compound dependency identifies compound nouns.")
# Print each token's index, text and dependency label
for tok in example_doc:
    print(tok.i, tok, "[", tok.dep_, "]")
>>>0 The [ det ]
>>>1 compound [ compound ]
>>>2 dependency [ nsubj ]
>>>3 identifies [ ROOT ]
>>>4 compound [ compound ]
>>>5 nouns [ dobj ]
>>>6 . [ punct ]
# Get list of compounds in doc
for tok in [tok for tok in example_doc if tok.dep_ == 'compound']:
    # The noun phrase runs from the compound token up to and including its head
    noun = example_doc[tok.i: tok.head.i + 1]
    print(noun)
>>>compound dependency
>>>compound nouns
The below function works for your examples. However, it will likely not work for more complicated sentences.
adj_doc = nlp("The company's customer service was terrible.")
verb_doc = nlp("They kept increasing my phone bill")
def get_compound_pairs(doc, verbose=False):
    """Return tuples of (multi-noun word, adjective or verb) for document.

    For every compound noun in ``doc`` (the span from the first ``compound``
    token of a chain up to and including that token's head):

    * if the noun's root is an ``nsubj``, it is paired with the first ``ADJ``
      among the right children of the root's head;
    * if the noun's root is a ``dobj``, it is paired with the first ``VERB``
      among the root's ancestors.

    Nouns without such a partner are left out.

    Args:
        doc: A parsed document whose tokens expose ``i``, ``dep_``, ``pos_``,
            ``head``, ``rights`` and ``ancestors``.
        verbose: When true, print the intermediate values used by the rules
            (useful for trying different dependency-tree rules).

    Returns:
        A list of ``(noun_span, token)`` tuples in document order.
    """
    # Keep only the first token of each compound chain: drop a compound token
    # whose predecessor is also a compound (the i == 0 test avoids indexing
    # doc[-1]).
    compounds = [
        tok for tok in doc
        if tok.dep_ == 'compound'
        and (tok.i == 0 or doc[tok.i - 1].dep_ != 'compound')
    ]
    tuple_list = []
    for tok in compounds:
        noun = doc[tok.i: tok.head.i + 1]
        root = noun.root
        partner = None
        if root.dep_ == 'nsubj':
            # If noun is in the subject, we may be looking for adjective in predicate.
            # In simple cases, this would mean that the noun shares a head with the adjective.
            adj_list = [r for r in root.head.rights if r.pos_ == 'ADJ']
            if adj_list:
                partner = adj_list[0]
            if verbose:
                print("Noun: ", noun)
                print("Noun root: ", root)
                print("Noun root head: ", root.head)
                print("Noun root head rights: ", adj_list)
        elif root.dep_ == 'dobj':
            verb_ancestor_list = [a for a in root.ancestors if a.pos_ == 'VERB']
            if verb_ancestor_list:
                partner = verb_ancestor_list[0]
            if verbose:
                print("Noun: ", noun)
                print("Noun root: ", root)
                print("Noun root head: ", root.head)
                print("Noun root head verb ancestors: ", verb_ancestor_list)
        if noun and partner:
            tuple_list.append((noun, partner))
    return tuple_list
get_compound_pairs(adj_doc)
>>>[(customer service, terrible)]
get_compound_pairs(verb_doc)
>>>[(phone bill, increasing)]
get_compound_pairs(example_doc, verbose=True)
>>>Noun: compound dependency
>>>Noun root: dependency
>>>Noun root head: identifies
>>>Noun root head rights: []
>>>Noun: compound nouns
>>>Noun root: nouns
>>>Noun root head: identifies
>>>Noun root head verb ancestors: [identifies]
>>>[(compound nouns, identifies)]
I needed to solve a similar problem and I wanted to share my solution as a spaCy custom component.
import spacy
from spacy.tokens import Token, Span
from spacy.language import Language
#Language.component("compound_chainer")
def find_compounds(doc):
    """Mark chains of compound tokens in ``doc`` and add them to ``doc.ents``.

    A chain is a maximal run of consecutive tokens whose ``dep_`` is
    ``"compound"`` plus the single token that follows the run. Every chain
    becomes a ``Span`` labelled ``"compound_chain"``, each token inside it
    gets ``token._.is_compound_chain = True``, and the spans are appended to
    the document's existing entities.

    Args:
        doc: The parsed spaCy document to annotate.

    Returns:
        The same ``doc`` object, annotated in place.
    """
    # Register the extension only once: registering a name that already
    # exists raises, which would break every call after the first.
    if not Token.has_extension("is_compound_chain"):
        Token.set_extension("is_compound_chain", default=False)

    # Collect one (start, end) range per run of consecutive compound tokens,
    # with `end` exclusive and covering the token right after the run.
    # Building whole runs directly keeps the ranges from overlapping, however
    # long a chain is.
    compound_ranges = []
    run_start = None
    for idx, tok in enumerate(doc):
        if tok.dep_ == "compound":
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            compound_ranges.append((run_start, idx + 1))
            run_start = None
    if run_start is not None:
        # The document ends inside a run; stop at the last token.
        compound_ranges.append((run_start, len(doc)))

    spans = []
    for start, end in compound_ranges:
        # Example ranges: (0, 2), (3, 5), (12, 14), (16, 19)
        entity = Span(doc, start, end, label="compound_chain")
        for token in entity:
            token._.set("is_compound_chain", True)
        spans.append(entity)

    # NOTE(review): doc.ents is expected to reject overlapping spans, so a
    # chain overlapping an entity that is already set would still fail here -
    # confirm whether such spans need filtering for your pipeline.
    doc.ents = list(doc.ents) + spans
    return doc
Github link: https://github.com/eboraks/job-description-nlp-analysis/blob/main/src/components/compound_chainer.py
Intent is to capitalize based on POS tags, which I could achieve with the help of the below link.
How can I best determine the correct capitalization for a word?
Trying to achieve similar results using spacy?
def truecase(doc):
truecased_sents = [] # list of truecased sentences
tagged_sent = token.tag_([word.lower() for token in doc])
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
normalized_sent[0] = normalized_sent[0].capitalize()
string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
return string
it throws out this error
tagged_sent = token.tag_([word.lower() for token in doc])
NameError: global name 'token' is not defined
how to declare token as global and solve this issue. Is my approach correct?
import spacy, re
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'autonomous cars shift insurance liability toward manufacturers.')
# (text, fine-grained POS tag) for every token
tagged_sent = [(w.text, w.tag_) for w in doc]
# Capitalize singular and plural common nouns, leave everything else as is
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
# The first word of the sentence is always capitalized
normalized_sent[0] = normalized_sent[0].capitalize()
# Re-join the tokens and drop the space that ends up before punctuation
string = re.sub(r" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
print(string)
Output:
Autonomous Cars shift Insurance Liability toward Manufacturers.