want to add left out string in matched string - python

Below is my example code:
from fuzzywuzzy import fuzz
import json
from itertools import zip_longest
synonyms = open("synonyms.json","r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor","solid state","Corei5 :1135G7 (11th
Generation)","hard
drive","ddr 8gb","something1", "something2",
"something3","HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation","corei7:latest technology"]
vendor = []
buyer = []
for item,value in synonyms.items():
for k,k2 in zip_longest(vendor_data,buyer_data):
for v in value:
if fuzz.token_set_ratio(k,v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item+" "+k)
else:
#didnt get only "something" strings here !
if fuzz.token_set_ratio(k2,v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item+" "+k2)
vendor = list(set(vendor))
buyer = list(set(buyer))
vendor,buyer
Note: "something" string can be anything like "battery" or "display"etc
synonyms json
{
"processor":["corei5","core","corei7","i5","i7","ryzen5","i5 processor","i7
processor","processor i5","processor i7","core generation","core gen"],
"ram":["DDR4","memory","DDR3","DDR","DDR 8gb","DDR 8 gb","DDR 16gb","DDR 16 gb","DDR
32gb","DDR 32 gb","DDR4-"],
"ssd":["solid state drive","solid drive"],
"hdd":["Hard Drive"]
}
what do i need ?
I want to add all "something" string inside vendor list dynamically.
! NOTE -- "something" string can be anything in future.
I want to add "something" string in vendor array which is not a matched value in fuzz>70! I want to basically add left out data also.
for example like below:
current output
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state']
expected output below
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state',
'something1',
'something2'
'something3'] #something string need to be added in vendor list dynamically.
what silly mistake am I doing ? Thank you.

Here's my attempt:
from fuzzywuzzy import process, fuzz
synonyms = {'processor': ['corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5', 'i5 processor', 'i7 processor', 'processor i5', 'processor i7', 'core generation', 'core gen'], 'ram': ['DDR4', 'memory', 'DDR3', 'DDR', 'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb', 'DDR 32gb', 'DDR 32 gb', 'DDR4-'], 'ssd': ['solid state drive', 'solid drive'], 'hdd': ['Hard Drive']}
vendor_data = ['i7 processor', 'solid state', 'Corei5 :1135G7 (11th Generation)', 'hard drive', 'ddr 8gb', 'something1', 'something2', 'something3', 'HT (100W) DDR4-2400']
buyer_data = ['i7 processor 12 generation', 'corei7:latest technology']
def find_synonym(s: str, min_score: int = 60):
results = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
if not results:
return None
return results[0][-1]
def process_data(l: list, min_score: int = 60):
matches = []
no_matches = []
for item in l:
syn = find_synonym(item, min_score=min_score)
if syn is not None:
new_item = f'{syn} {item}' if syn not in item else item
matches.append(new_item)
elif any(fuzz.partial_ratio(s, item) >= min_score for s in synonyms.keys()):
# one of the synonyms is already in the item string
matches.append(item)
else:
no_matches.append(item)
return matches, no_matches
For process_data(vendor_data) we get:
(['i7 processor',
'ssd solid state',
'processor Corei5 :1135G7 (11th Generation)',
'hdd hard drive',
'ram ddr 8gb',
'ram HT (100W) DDR4-2400'],
['something1', 'something2', 'something3'])
And for process_data(buyer_data):
(['i7 processor 12 generation', 'processor corei7:latest technology'], [])
I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns 2 lists: One with matches with words from the synonyms dict and one with items without matches. If you want exactly the output you listed in your question, just concatenate the two lists like this:
matches, no_matches = process_data(vendor_data)
matches + no_matches # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']

I have tried to come up with a decent answer (certainly not the cleanest one)
import json
from itertools import zip_longest
from fuzzywuzzy import fuzz
synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11thGeneration)", "hard drive", "ddr 8gb", "something1",
"something2",
"something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]
vendor = []
buyer = []
for k, k2 in zip_longest(vendor_data, buyer_data):
has_matched = False
for item, value in synonyms.items():
for v in value:
if fuzz.token_set_ratio(k, v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item + " " + k)
if has_matched or k2 is None:
break
else:
has_matched = True
if fuzz.token_set_ratio(k2, v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item + " " + k2)
if has_matched or k is None:
break
else:
has_matched = True
else:
continue # match not found
break # match is found
else: # only evaluates on normal loop end
# Only something strings
# do something with the new input values
continue
vendor = list(set(vendor))
buyer = list(set(buyer))
I hope you can achieve what you want with this code. Check the docs if you don't know what a for else loop does. TLDR: the else clause executes when the loop terminates normally (not with a break). Note that I put the synonyms loop inside the data loop. This is because we can't certainly know in which synonym group the data belongs, also somethimes the vendor data entry is a processor while the buyer data is memory. Also note that I have assumed an item can't match more than 1 time. If this could be the case you would need to make a more advanced check (just make a counter and break when the counter equals 2 for example).
EDIT:
I took another look at the question and came up with maybe a better answer:
v_dict = dict()
for spec in vendor_data[:]:
for item, choices in synonyms.items():
if process.extractOne(spec, choices)[1] > 70: # don't forget to import process from fuzzywuzzy
v_dict[spec] = item
break
else:
v_dict[spec] = "Something new"
This code matches the strings to the correct type. for example {'i7 processor': 'processor', 'solid state': 'ssd', 'Corei5 :1135G7 (11thGeneration)': 'processor', 'hard drive': 'ssd', 'ddr 8gb': 'ram', 'something1': 'Something new', 'something2': 'Something new', 'something3': 'Something new', 'HT (100W) DDR4-2400': 'ram'}. You can change the "Something new" with watherver you like. You could also do: v_dict[spec] = 0 (on a match) and v_dict[spec] = 1 (on no match). You could then sort the dict ->
it = iter(v_dict.values())
print(sorted(v_dict.keys(), key=lambda x: next(it)))
Which would give the wanted results (more or less), all the recognised items will be first, and then all the unrecognised items. You could do some more advanced sorting on this dict if you want. I think this code gives you enough flexibility to reach your goal.

If I understand correctly, what you are trying to do is match keywords specified by a customer and/or vendor against a predefined database of keywords you have.
First, I would highly recommend using a reversed mapping of the synonyms, so it's faster to lookup, especially when the dataset will grow.
Second, considering the fuzzywuzzy API, it looks like you simply want the best match, so extractOne is a solid choice for that.
Now, extractOne returns the best match and a score:
>>> process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
I would split the algorithm into two:
A generic part that simply gets the best match, which should always exist (even if it's not a great one)
A filter, where you could adjust the sensitivity of the algorithm, based on different criteria of your application. This sensitivity threshold should set the minimal match quality. If you're below this threshold, just use "untagged" for the category for example.
Here is the final code, which I think is very simple and easy to understand and expand:
import json
from fuzzywuzzy import process
def load_synonyms():
with open('synonyms.json') as fin:
synonyms = json.load(fin)
# Reversing the map makes it much easier to lookup
reversed_synonyms = {}
for key, values in synonyms.items():
for value in values:
reversed_synonyms[value] = key
return reversed_synonyms
def load_vendor_data():
return [
"i7 processor",
"solid state",
"Corei5 :1135G7 (11thGeneration)",
"hard drive",
"ddr 8gb",
"something1",
"something2",
"something3",
"HT (100W) DDR4-2400"
]
def load_customer_data():
return [
"i7 processor 12 generation",
"corei7:latest technology"
]
def get_tag(keyword, synonyms):
THRESHOLD = 80
DEFAULT = 'general'
tag, score = process.extractOne(keyword, synonyms.keys())
return synonyms[tag] if score > THRESHOLD else DEFAULT
def main():
synonyms = load_synonyms()
customer_data = load_customer_data()
vendor_data = load_vendor_data()
data = customer_data + vendor_data
tags_dict = { keyword: get_tag(keyword, synonyms) for keyword in data }
print(json.dumps(tags_dict, indent=4))
if __name__ == '__main__':
main()
When running with the specified inputs, the output is:
{
"i7 processor 12 generation": "processor",
"corei7:latest technology": "processor",
"i7 processor": "processor",
"solid state": "ssd",
"Corei5 :1135G7 (11thGeneration)": "processor",
"hard drive": "hdd",
"ddr 8gb": "ram",
"something1": "general",
"something2": "general",
"something3": "general",
"HT (100W) DDR4-2400": "ram"
}

Related

Trying to create a streamlit app that uses user-provided URLs to scrape and return a downloadable df

I'm trying to use this create_df() function in Streamlit to gather a list of user-provided URLs called "recipes" and loop through each URL to return a df I've labeled "res" towards the end of the function. I've tried several approaches with the Streamlit syntax but I just cannot get this to work as I'm getting this error message:
recipe_scrapers._exceptions.WebsiteNotImplementedError: recipe-scrapers exception: Website (h) not supported.
Have a look at my entire repo here. The main.py script works just fine once you've installed all requirements locally, but when I try running the same script with Streamlit syntax in the streamlit.py script I get the above error. Once you run streamlit run streamlit.py in your terminal and have a look at the UI I've create it should be quite clear what I'm aiming at, which is providing the user with a csv of all ingredients in the recipe URLs they provided for a convenient grocery shopping list.
Any help would be greatly appreciated!
def create_df(recipes):
"""
Description:
Creates one df with all recipes and their ingredients
Arguments:
* recipes: list of recipe URLs provided by user
Comments:
Note that ingredients with qualitative amounts e.g., "scheutje melk", "snufje zout" have been ommitted from the ingredient list
"""
df_list = []
for recipe in recipes:
scraper = scrape_me(recipe)
recipe_details = replace_measurement_symbols(scraper.ingredients())
recipe_name = recipe.split("https://www.hellofresh.nl/recipes/", 1)[1]
recipe_name = recipe_name.rsplit('-', 1)[0]
print("Processing data for "+ recipe_name +" recipe.")
for ingredient in recipe_details:
try:
df_temp = pd.DataFrame(columns=['Ingredients', 'Measurement'])
df_temp[str(recipe_name)] = recipe_name
ing_1 = ingredient.split("2 * ", 1)[1]
ing_1 = ing_1.split(" ", 2)
item = ing_1[2]
measurement = ing_1[1]
quantity = float(ing_1[0]) * 2
df_temp.loc[len(df_temp)] = [item, measurement, quantity]
df_list.append(df_temp)
except (ValueError, IndexError) as e:
pass
df = pd.concat(df_list)
print("Renaming duplicate ingredients e.g., Kruimige aardappelen, Voorgekookte halve kriel met schil -> Aardappelen")
ingredient_dict = {
'Aardappelen': ('Dunne frieten', 'Half kruimige aardappelen', 'Voorgekookte halve kriel met schil',
'Kruimige aardappelen', 'Roodschillige aardappelen', 'Opperdoezer Ronde aardappelen'),
'Ui': ('Rode ui'),
'Kipfilet': ('Kipfilet met tuinkruiden en knoflook'),
'Kipworst': ('Gekruide kipworst'),
'Kipgehakt': ('Gemengd gekruid gehakt', 'Kipgehakt met Mexicaanse kruiden', 'Half-om-halfgehakt met Italiaanse kruiden',
'Kipgehakt met tuinkruiden'),
'Kipshoarma': ('Kalkoenshoarma')
}
reverse_label_ing = {x:k for k,v in ingredient_dict.items() for x in v}
df["Ingredients"].replace(reverse_label_ing, inplace=True)
print("Assigning ingredient categories")
category_dict = {
'brood': ('Biologisch wit rozenbroodje', 'Bladerdeeg', 'Briochebroodje', 'Wit platbrood'),
'granen': ('Basmatirijst', 'Bulgur', 'Casarecce', 'Cashewstukjes',
'Gesneden snijbonen', 'Jasmijnrijst', 'Linzen', 'Maïs in blik',
'Parelcouscous', 'Penne', 'Rigatoni', 'Rode kidneybonen',
'Spaghetti', 'Witte tortilla'),
'groenten': ('Aardappelen', 'Aubergine', 'Bosui', 'Broccoli',
'Champignons', 'Citroen', 'Gele wortel', 'Gesneden rodekool',
'Groene paprika', 'Groentemix van paprika, prei, gele wortel en courgette',
'IJsbergsla', 'Kumato tomaat', 'Limoen', 'Little gem',
'Paprika', 'Portobello', 'Prei', 'Pruimtomaat',
'Radicchio en ijsbergsla', 'Rode cherrytomaten', 'Rode paprika', 'Rode peper',
'Rode puntpaprika', 'Rode ui', 'Rucola', 'Rucola en veldsla', 'Rucolamelange',
'Semi-gedroogde tomatenmix', 'Sjalot', 'Sperziebonen', 'Spinazie', 'Tomaat',
'Turkse groene peper', 'Veldsla', 'Vers basilicum', 'Verse bieslook',
'Verse bladpeterselie', 'Verse koriander', 'Verse krulpeterselie', 'Wortel', 'Zoete aardappel'),
'kruiden': ('Aïoli', 'Bloem', 'Bruine suiker', 'Cranberrychutney', 'Extra vierge olijfolie',
'Extra vierge olijfolie met truffelaroma', 'Fles olijfolie', 'Gedroogde laos',
'Gedroogde oregano', 'Gemalen kaneel', 'Gemalen komijnzaad', 'Gemalen korianderzaad',
'Gemalen kurkuma', 'Gerookt paprikapoeder', 'Groene currykruiden', 'Groentebouillon',
'Groentebouillonblokje', 'Honing', 'Italiaanse kruiden', 'Kippenbouillonblokje', 'Knoflookteen',
'Kokosmelk', 'Koreaanse kruidenmix', 'Mayonaise', 'Mexicaanse kruiden', 'Midden-Oosterse kruidenmix',
'Mosterd', 'Nootmuskaat', 'Olijfolie', 'Panko paneermeel', 'Paprikapoeder', 'Passata',
'Pikante uienchutney', 'Runderbouillonblokje', 'Sambal', 'Sesamzaad', 'Siciliaanse kruidenmix',
'Sojasaus', 'Suiker', 'Sumak', 'Surinaamse kruiden', 'Tomatenblokjes', 'Tomatenblokjes met ui',
'Truffeltapenade', 'Ui', 'Verse gember', 'Visbouillon', 'Witte balsamicoazijn', 'Wittewijnazijn',
'Zonnebloemolie', 'Zwarte balsamicoazijn'),
'vlees': ('Gekruide runderburger', 'Half-om-half gehaktballetjes met Spaanse kruiden', 'Kipfilethaasjes', 'Kipfiletstukjes',
'Kipgehaktballetjes met Italiaanse kruiden', 'Kippendijreepjes', 'Kipshoarma', 'Kipworst', 'Spekblokjes',
'Vegetarische döner kebab', 'Vegetarische kaasschnitzel', 'Vegetarische schnitzel'),
'zuivel': ('Ei', 'Geraspte belegen kaas', 'Geraspte cheddar', 'Geraspte grana padano', 'Geraspte oude kaas',
'Geraspte pecorino', 'Karnemelk', 'Kruidenroomkaas', 'Labne', 'Melk', 'Mozzarella',
'Parmigiano reggiano', 'Roomboter', 'Slagroom', 'Volle yoghurt')
}
reverse_label_cat = {x:k for k,v in category_dict.items() for x in v}
df["Category"] = df["Ingredients"].map(reverse_label_cat)
col = "Category"
first_col = df.pop(col)
df.insert(0, col, first_col)
df = df.sort_values(['Category', 'Ingredients'], ascending = [True, True])
print("Merging ingredients by row across all recipe columns using justify()")
gp_cols = ['Ingredients', 'Measurement']
oth_cols = df.columns.difference(gp_cols)
arr = np.vstack(df.groupby(gp_cols, sort=False, dropna=False).apply(lambda gp: justify(gp.to_numpy(), invalid_val=np.NaN, axis=0, side='up')))
# Reconstruct DataFrame
# Remove entirely NaN rows based on the non-grouping columns
res = (pd.DataFrame(arr, columns=df.columns)
.dropna(how='all', subset=oth_cols, axis=0))
res = res.fillna(0)
res['Total'] = res.drop(['Ingredients', 'Measurement'], axis=1).sum(axis=1)
res=res[res['Total'] !=0] #To drop rows that are being duplicated with 0 for some reason; will check later
print("Processing complete!")
return res
Your function create_df needs a list as an argument but st.text_input returs always a string.
In your streamlit.py, replace this df_download = create_df(recs) by this df_download = create_df([recs]). But if you need to handle multiple urls, you should use str.split like this :
def create_df(recipes):
recipes = recipes.split(",") # <--- add this line to make a list from the user-input
### rest of the code ###
if download:
df_download = create_df(recs)
# Output :

Is there a way to get the key from a list embedded in a dictionary

I am trying to write a function that, given a dictionary of restaurants, allows you to randomly pick a restaurant based on your choice of one of the values. For example, if you say you want a bakery then it will only give you bakeries.
I have only worked on the code for choosing the type of restaurant so far, and I am struggling with how to generate a random list. So I am checking for a value and, if it has it, would want to add the key to a list. Does anyone have any suggestions on how to do this?
import random
Restaurants ={
"Eureka": ["American", "$$", "Lunch", "Dinner"],
"Le_Pain": ["Bakery", "$$", "Breakfast", "Lunch", "Dinner"],
"Creme_Bakery": ["Bakery", "$", "Snack"]
}
list=[]
def simple_chooser():
print('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
start= input()
if start=="randomize":
return #completely random
elif start=="type":
print("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
type=input()
for lst in Restaurants.values():
for x in lst:
if x==type:
list.append(x)
return(random.choice(list))
To return completely random restaurant suggestions you need to create a list of all the types first and then you can choose one and return the names of the restaurants.
import random
Restaurants ={
"Eureka": ["American", "$$", "Lunch", "Dinner"],
"Le_Pain": ["Bakery", "$$", "Breakfast", "Lunch", "Dinner"],
"Creme_Bakery": ["Bakery", "$", "Snack"]
}
types = ['American', 'Bakery', 'Pie', 'Ice_Cream', 'Bagels', 'Asian', 'Chocolate', 'Italian',
'Pizza', 'Thai', 'Mexican', 'Japanese', 'Acai', 'Mediterranean','Boba/Coffee']
def simple_chooser():
l=[]
print('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
start= input()
if start=="randomize":
type_random = random.choice(types)
for k,v in Restaurants.items():
if v[0] == type_random:
l.append(k)
elif start=="type":
print("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
type_chosen=input()
for k,v in Restaurants.items():
if v[0] == type_chosen:
l.append(k)
return(random.choice(l))
Also, you don't need to return in if-else statements. Once you have your list of Restaurants you can randomly choose a restaurant and return it.
In your loop, you don't have the restaurant name, as you iterate on the values, you would have need something like
for name, props in Restaurants.items():
if props[0] == type:
list.append(name)
return (random.choice(list)) # wait for the whole list to be constructed
With a better naming (don't use type and list that are builtin methods)
def simple_chooser():
start = input('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
if start == "randomize":
return # completely random
elif start == "type":
restaurant_type = input("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, "
"Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
matching_names = [name for name, props in Restaurants.items() if props[0] == restaurant_type]
return random.choice(matching_names)
You make processing difficult because of the design of your data structures.
Here's an idea which should be easily adapted to future needs.
import random
from operator import contains, eq
Restaurants = [
{'name': 'Eureka', 'type': 'American', 'price': '$$', 'meal': ('Dinner',)},
{'name': 'Le_Pain', 'type': 'Bakery', 'price': '$$', 'meal': ('Lunch', 'Dinner')},
{'name': 'Creme_Bakery', 'type': 'Bakery', 'price': '$', 'meal': ('Snack',)}
]
def get_attr(k):
s = set()
for r in Restaurants:
if isinstance(r[k], tuple):
for t in r[k]:
s.add(t)
else:
s.add(r[k])
return s
def choose_restaurant():
categories = ', '.join(Restaurants[0])
while True:
choice = input(f'Select by category ({categories}) or choose random: ')
if choice == 'random':
return random.choice(Restaurants)
if choice in Restaurants[0]:
choices = get_attr(choice)
if (v := input(f'Select value for {choice} from ({", ".join(choices)}): ')) in choices:
op = contains if isinstance(Restaurants[0][choice], tuple) else eq
return [r for r in Restaurants if op(r[choice], v)]
print('Invalid selection\x07')
print(choose_restaurant())
Restaurants is now a list of dictionaries which is easy to extend. You just need to make sure that each new restaurant has the same structure (keys). Also note that the 'meal' value is a tuple even if there's a single value

How to insert values to a Sqlite database from a Python Set or Dict?

I have the following function which produces results;
myNames = ['ULTA', 'CSCO', ...]
def get_from_min_match(var):
temp = []
count_elem = generate_elem_count()
for item in count_elem:
if var <= count_elem[item]:
temp.append(item)
return set(temp) if len(set(temp)) > 0 else "None"
def generate_elem_count():
result_data = []
for val in mapper.values():
if type(val) == list:
result_data += val
elif type(val) == dict:
for key in val:
result_data.append(key)
count_elem = {elem: result_data.count(elem) for elem in result_data}
return count_elem
I call this function like this;
myNames_dict_1 = ['AME', 'IEX', 'PAYC']
myNames_dict_1 = ['ULTA', 'CSCO', 'PAYC']
mapper = {1: myNames_dict_1, 2: myNames_dict_2}
print(" These meet three values ", get_from_min_match(3))
print(" These meet four values ", get_from_min_match(4))
The output I get from these functions are as follows;
These meet three values {'ULTA', 'CSCO', 'SHW', 'MANH', 'TTWO', 'SAM', 'RHI', 'PAYC', 'AME', 'CCOI', 'RMD', 'AMD', 'UNH', 'AZO', 'APH', 'EW', 'FFIV', 'IEX', 'IDXX', 'ANET', 'SWKS', 'HRL', 'ILMN', 'PGR', 'ATVI', 'CNS', 'EA', 'ORLY', 'TSCO'}
These meet four values {'EW', 'PAYC', 'TTWO', 'AME', 'IEX', 'IDXX', 'ANET', 'RMD', 'SWKS', 'HRL', 'UNH', 'CCOI', 'ORLY', 'APH', 'PGR', 'TSCO'}
Now, I want to insert the output, of the get_from_min_match function into a Sqlite database. Its structure looks like this;
dbase.execute("INSERT OR REPLACE INTO min_match (DATE, SYMBOL, NAME, NUMBEROFMETRICSMET) \
VALUES (?,?,?,?)", (datetime.today(), symbol, name, NUMBEROFMETRICSMET?))
dbase.commit()
So, it's basically a new function to calculate the "NUMBEROFMETRICSMET" parameter rather than calling each of these functions many times. And I want the output of the function inserted into the database. How to achieve this? Here 3, 4 would be the number of times the companies matched.
date ULTA name 3
date EW name 4
...
should be the result.
How can I achieve this? Thanks!
I fixed this by just using my already written function;
count_elem = generate_elem_count()
print("Count Elem: " + str(count_elem))
This prints {'AMPY': 1} and so on.

Find successively connected nouns or pronouns in string

I want to find stand-alone or successively connected nouns in a text. I put together below code, but it is neither efficient nor pythonic. Does anybody have a more pythonic way of finding these nouns with spaCy?
Below code builds a dict with all tokens and then runs through them to find stand-alone or connected PROPN or NOUN until the for-loop runs out of range. It returns a list of the collected items.
def extract_unnamed_ents(doc):
"""Takes a string and returns a list of all succesively connected nouns or pronouns"""
nlp_doc = nlp(doc)
token_list = []
for token in nlp_doc:
token_dict = {}
token_dict['lemma'] = token.lemma_
token_dict['pos'] = token.pos_
token_dict['tag'] = token.tag_
token_list.append(token_dict)
ents = []
k = 0
for i in range(len(token_list)):
try:
if token_list[k]['pos'] == 'PROPN' or token_list[k]['pos'] == 'NOUN':
ent = token_list[k]['lemma']
if token_list[k+1]['pos'] == 'PROPN' or token_list[k+1]['pos'] == 'NOUN':
ent = ent + ' ' + token_list[k+1]['lemma']
k += 1
if token_list[k+1]['pos'] == 'PROPN' or token_list[k+1]['pos'] == 'NOUN':
ent = ent + ' ' + token_list[k+1]['lemma']
k += 1
if token_list[k+1]['pos'] == 'PROPN' or token_list[k+1]['pos'] == 'NOUN':
ent = ent + ' ' + token_list[k+1]['lemma']
k += 1
if token_list[k+1]['pos'] == 'PROPN' or token_list[k+1]['pos'] == 'NOUN':
ent = ent + ' ' + token_list[k+1]['lemma']
k += 1
if ent not in ents:
ents.append(ent)
except:
pass
k += 1
return ents
Test:
extract_unnamed_ents('Chancellor Angela Merkel and some of her ministers will discuss at a cabinet '
"retreat next week ways to avert driving bans in major cities after Germany's "
'top administrative court in February allowed local authorities to bar '
'heavily polluting diesel cars.')
Out:
['Chancellor Angela Merkel',
'minister',
'cabinet retreat',
'week way',
'ban',
'city',
'Germany',
'court',
'February',
'authority',
'diesel car']
spacy has a way of doing this but I'm not sure it is giving you exactly what you are after
import spacy
text = """Chancellor Angela Merkel and some of her ministers will discuss
at a cabinet retreat next week ways to avert driving bans in
major cities after Germany's top administrative court
in February allowed local authorities to bar heavily
polluting diesel cars.
""".replace('\n', ' ')
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print([i.text for i in doc.noun_chunks])
gives
['Chancellor Angela Merkel', 'her ministers', 'a cabinet retreat', 'ways', 'driving bans', 'major cities', "Germany's top administrative court", 'February', 'local authorities', 'heavily polluting diesel cars']
Here, however the i.lemma_ line doesn't really give you what you want (I think this might be fixed by this recent PR).
Since it isn't quite what you are after you could use itertools.groupby like so
import itertools
out = []
for i, j in itertools.groupby(doc, key=lambda i: i.pos_):
if i not in ("PROPN", "NOUN"):
continue
out.append(' '.join(k.lemma_ for k in j))
print(out)
gives
['Chancellor Angela Merkel', 'minister', 'cabinet retreat', 'week way', 'ban', 'city', 'Germany', 'court', 'February', 'authority', 'diesel car']
This should give you exactly the same output as your function (the output is slightly different here but I believe this is due to different spacy versions).
If you are feeling really adventurous you could use a list comprehension
out = [' '.join(k.lemma_ for k in j)
for i, j in itertools.groupby(doc, key=lambda i: i.pos_)
if i in ("PROPN", "NOUN")]
Note I see slightly different results with different spacy versions. The output above is from spacy-2.1.8

Categorizing sentence using dictionary

I am using below function for getting categorizing sentence in themes
def theme(x):
output =[]
category = ()
for i in x:
if 'AC' in i:
category = 'AC problem'
elif 'insects' in i:
category = 'Cleanliness'
elif 'clean' in i:
category = 'Cleanliness'
elif 'food' in i:
category = 'Food Problem'
elif 'delay' in i:
category = 'Train Delayed'
else:
category = 'None'
output.append(category)
return output
I don't want to use repeated if statements for every word in a category. Instead I want the i give a list/dictionary e.g. Cleanliness = ['Clean', 'Cleaned', 'spoilt', 'dirty'] for getting category 'Cleanliness' against the sentence if it has any of the words in list. How can i do that
You can use a dict of sets to structure your words with categories, and then generate a word-to-category lookup dict based on the said structure:
categories = {
'Cleanliness': {'insects', 'clean'},
'AC Problem': {'AC'},
'Food Problem': {'food'},
'Train Delayed': {'delay'}
}
lookup = {word: category for category, words in categories.items() for word in words}
def theme(x):
return {lookup.get(word, 'None') for word in x}
so that theme(['AC', 'clean', 'insects']) would return a set of corresponding categories:
{'Cleanliness', 'AC Problem'}
This should do what you're asking. I set all the keys to lowercase and converted i to lowercase when checking if you get a match, but with different capitalization, it still counts.
def theme(x):
output =[]
category = ()
myDict = {"ac":"AC problem", "insects":"Cleanliness", "clean":"Cleanliness", "food":"Food Problem", "delay":"Train Delayed"} #I reccomend coming up with a more suitable name for your dictionary in your actual program
for i in x:
if i.lower() in myDict: #Checks to see if i is in the dictionary before trying to print the result; prevents possible Key Errors
category = (myDict[i.lower()]) #If it is in the dictionary it category will be set to the result of the key
output.append(category)
else:
output.append("None") #If i isn't in the dictionary output will append None instead
return output
Here's some examples:
>>>print(theme(['Clean', 'Cleaned', 'spoilt', 'dirty']))
['Cleanliness', 'None', 'None', 'None']
>>>print(theme(['Delay', 'Ham', 'Cheese', 'Insects']))
['Train Delayed', 'None', 'None', 'Cleanliness']
I have worked out a another way:
def theme(x):
output = []
for i in x:
if set(cleanliness).intersection(i.lower().split()):
category = 'clean'
elif set(ac_problem).intersection(i.lower().split()):
category = 'ac problem'
else:
category = 'none'
output.append(category)
return output
Maybe you can do it like this:
def theme(x):
output = []
name_dic = {"AC": "AC problem",
"clean": "Cleanliness",
"food": "Food Problem"
}
for e in x:
output.append(name_dic.get(e))
return output
Or more exactly like this:
def theme(x):
output = []
name_list = [
("AC", "AC problem"),
("clean", "Cleanliness"),
("insects", "Cleanliness"),
("food", "Food Problem")
]
name_dic = dict(name_list)
for e in x:
output.append(name_dic.get(e))
return output
Hope it helps.

Categories