Categorizing sentence using dictionary - python

I am using below function for getting categorizing sentence in themes
def theme(x):
output =[]
category = ()
for i in x:
if 'AC' in i:
category = 'AC problem'
elif 'insects' in i:
category = 'Cleanliness'
elif 'clean' in i:
category = 'Cleanliness'
elif 'food' in i:
category = 'Food Problem'
elif 'delay' in i:
category = 'Train Delayed'
else:
category = 'None'
output.append(category)
return output
I don't want to use repeated if statements for every word in a category. Instead I want the i give a list/dictionary e.g. Cleanliness = ['Clean', 'Cleaned', 'spoilt', 'dirty'] for getting category 'Cleanliness' against the sentence if it has any of the words in list. How can i do that

You can use a dict of sets to structure your words with categories, and then generate a word-to-category lookup dict based on the said structure:
categories = {
'Cleanliness': {'insects', 'clean'},
'AC Problem': {'AC'},
'Food Problem': {'food'},
'Train Delayed': {'delay'}
}
lookup = {word: category for category, words in categories.items() for word in words}
def theme(x):
return {lookup.get(word, 'None') for word in x}
so that theme(['AC', 'clean', 'insects']) would return a set of corresponding categories:
{'Cleanliness', 'AC Problem'}

This should do what you're asking. I set all the keys to lowercase and converted i to lowercase when checking if you get a match, but with different capitalization, it still counts.
def theme(x):
output =[]
category = ()
myDict = {"ac":"AC problem", "insects":"Cleanliness", "clean":"Cleanliness", "food":"Food Problem", "delay":"Train Delayed"} #I reccomend coming up with a more suitable name for your dictionary in your actual program
for i in x:
if i.lower() in myDict: #Checks to see if i is in the dictionary before trying to print the result; prevents possible Key Errors
category = (myDict[i.lower()]) #If it is in the dictionary it category will be set to the result of the key
output.append(category)
else:
output.append("None") #If i isn't in the dictionary output will append None instead
return output
Here's some examples:
>>>print(theme(['Clean', 'Cleaned', 'spoilt', 'dirty']))
['Cleanliness', 'None', 'None', 'None']
>>>print(theme(['Delay', 'Ham', 'Cheese', 'Insects']))
['Train Delayed', 'None', 'None', 'Cleanliness']

I have worked out a another way:
def theme(x):
output = []
for i in x:
if set(cleanliness).intersection(i.lower().split()):
category = 'clean'
elif set(ac_problem).intersection(i.lower().split()):
category = 'ac problem'
else:
category = 'none'
output.append(category)
return output

Maybe you can do it like this:
def theme(x):
output = []
name_dic = {"AC": "AC problem",
"clean": "Cleanliness",
"food": "Food Problem"
}
for e in x:
output.append(name_dic.get(e))
return output
Or more exactly like this:
def theme(x):
output = []
name_list = [
("AC", "AC problem"),
("clean", "Cleanliness"),
("insects", "Cleanliness"),
("food", "Food Problem")
]
name_dic = dict(name_list)
for e in x:
output.append(name_dic.get(e))
return output
Hope it helps.

Related

Is there a way to get the key from a list embedded in a dictionary

I am trying to write a function that, given a dictionary of restaurants, allows you to randomly pick a restaurant based on your choice of one of the values. For example, if you say you want a bakery then it will only give you bakeries.
I have only worked on the code for choosing the type of restaurant so far, and I am struggling with how to generate a random list. So I am checking for a value and, if it has it, would want to add the key to a list. Does anyone have any suggestions on how to do this?
import random
Restaurants ={
"Eureka": ["American", "$$", "Lunch", "Dinner"],
"Le_Pain": ["Bakery", "$$", "Breakfast", "Lunch", "Dinner"],
"Creme_Bakery": ["Bakery", "$", "Snack"]
}
list=[]
def simple_chooser():
print('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
start= input()
if start=="randomize":
return #completely random
elif start=="type":
print("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
type=input()
for lst in Restaurants.values():
for x in lst:
if x==type:
list.append(x)
return(random.choice(list))
To return completely random restaurant suggestions you need to create a list of all the types first and then you can choose one and return the names of the restaurants.
import random
Restaurants ={
"Eureka": ["American", "$$", "Lunch", "Dinner"],
"Le_Pain": ["Bakery", "$$", "Breakfast", "Lunch", "Dinner"],
"Creme_Bakery": ["Bakery", "$", "Snack"]
}
types = ['American', 'Bakery', 'Pie', 'Ice_Cream', 'Bagels', 'Asian', 'Chocolate', 'Italian',
'Pizza', 'Thai', 'Mexican', 'Japanese', 'Acai', 'Mediterranean','Boba/Coffee']
def simple_chooser():
l=[]
print('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
start= input()
if start=="randomize":
type_random = random.choice(types)
for k,v in Restaurants.items():
if v[0] == type_random:
l.append(k)
elif start=="type":
print("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
type_chosen=input()
for k,v in Restaurants.items():
if v[0] == type_chosen:
l.append(k)
return(random.choice(l))
Also, you don't need to return in if-else statements. Once you have your list of Restaurants you can randomly choose a restaurant and return it.
In your loop, you don't have the restaurant name, as you iterate on the values, you would have need something like
for name, props in Restaurants.items():
if props[0] == type:
list.append(name)
return (random.choice(list)) # wait for the whole list to be constructed
With a better naming (don't use type and list that are builtin methods)
def simple_chooser():
start = input('Would you like to lock a category or randomize? (randomize, type, price, or meal)')
if start == "randomize":
return # completely random
elif start == "type":
restaurant_type = input("American, Bakery, Pie, Ice_Cream, Bagels, Asian, Chocolate, Italian, "
"Pizza, Thai, Mexican, Japanese, Acai, Mediterranean, or Boba/Coffee?")
matching_names = [name for name, props in Restaurants.items() if props[0] == restaurant_type]
return random.choice(matching_names)
You make processing difficult because of the design of your data structures.
Here's an idea which should be easily adapted to future needs.
import random
from operator import contains, eq
Restaurants = [
{'name': 'Eureka', 'type': 'American', 'price': '$$', 'meal': ('Dinner',)},
{'name': 'Le_Pain', 'type': 'Bakery', 'price': '$$', 'meal': ('Lunch', 'Dinner')},
{'name': 'Creme_Bakery', 'type': 'Bakery', 'price': '$', 'meal': ('Snack',)}
]
def get_attr(k):
s = set()
for r in Restaurants:
if isinstance(r[k], tuple):
for t in r[k]:
s.add(t)
else:
s.add(r[k])
return s
def choose_restaurant():
categories = ', '.join(Restaurants[0])
while True:
choice = input(f'Select by category ({categories}) or choose random: ')
if choice == 'random':
return random.choice(Restaurants)
if choice in Restaurants[0]:
choices = get_attr(choice)
if (v := input(f'Select value for {choice} from ({", ".join(choices)}): ')) in choices:
op = contains if isinstance(Restaurants[0][choice], tuple) else eq
return [r for r in Restaurants if op(r[choice], v)]
print('Invalid selection\x07')
print(choose_restaurant())
Restaurants is now a list of dictionaries which is easy to extend. You just need to make sure that each new restaurant has the same structure (keys). Also note that the 'meal' value is a tuple even if there's a single value

want to add left out string in matched string

Below is my example code:
from fuzzywuzzy import fuzz
import json
from itertools import zip_longest
synonyms = open("synonyms.json","r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor","solid state","Corei5 :1135G7 (11th
Generation)","hard
drive","ddr 8gb","something1", "something2",
"something3","HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation","corei7:latest technology"]
vendor = []
buyer = []
for item,value in synonyms.items():
for k,k2 in zip_longest(vendor_data,buyer_data):
for v in value:
if fuzz.token_set_ratio(k,v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item+" "+k)
else:
#didnt get only "something" strings here !
if fuzz.token_set_ratio(k2,v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item+" "+k2)
vendor = list(set(vendor))
buyer = list(set(buyer))
vendor,buyer
Note: "something" string can be anything like "battery" or "display"etc
synonyms json
{
"processor":["corei5","core","corei7","i5","i7","ryzen5","i5 processor","i7
processor","processor i5","processor i7","core generation","core gen"],
"ram":["DDR4","memory","DDR3","DDR","DDR 8gb","DDR 8 gb","DDR 16gb","DDR 16 gb","DDR
32gb","DDR 32 gb","DDR4-"],
"ssd":["solid state drive","solid drive"],
"hdd":["Hard Drive"]
}
what do i need ?
I want to add all "something" string inside vendor list dynamically.
! NOTE -- "something" string can be anything in future.
I want to add "something" string in vendor array which is not a matched value in fuzz>70! I want to basically add left out data also.
for example like below:
current output
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state']
expected output below
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state',
'something1',
'something2'
'something3'] #something string need to be added in vendor list dynamically.
what silly mistake am I doing ? Thank you.
Here's my attempt:
from fuzzywuzzy import process, fuzz
synonyms = {'processor': ['corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5', 'i5 processor', 'i7 processor', 'processor i5', 'processor i7', 'core generation', 'core gen'], 'ram': ['DDR4', 'memory', 'DDR3', 'DDR', 'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb', 'DDR 32gb', 'DDR 32 gb', 'DDR4-'], 'ssd': ['solid state drive', 'solid drive'], 'hdd': ['Hard Drive']}
vendor_data = ['i7 processor', 'solid state', 'Corei5 :1135G7 (11th Generation)', 'hard drive', 'ddr 8gb', 'something1', 'something2', 'something3', 'HT (100W) DDR4-2400']
buyer_data = ['i7 processor 12 generation', 'corei7:latest technology']
def find_synonym(s: str, min_score: int = 60):
results = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
if not results:
return None
return results[0][-1]
def process_data(l: list, min_score: int = 60):
matches = []
no_matches = []
for item in l:
syn = find_synonym(item, min_score=min_score)
if syn is not None:
new_item = f'{syn} {item}' if syn not in item else item
matches.append(new_item)
elif any(fuzz.partial_ratio(s, item) >= min_score for s in synonyms.keys()):
# one of the synonyms is already in the item string
matches.append(item)
else:
no_matches.append(item)
return matches, no_matches
For process_data(vendor_data) we get:
(['i7 processor',
'ssd solid state',
'processor Corei5 :1135G7 (11th Generation)',
'hdd hard drive',
'ram ddr 8gb',
'ram HT (100W) DDR4-2400'],
['something1', 'something2', 'something3'])
And for process_data(buyer_data):
(['i7 processor 12 generation', 'processor corei7:latest technology'], [])
I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns 2 lists: One with matches with words from the synonyms dict and one with items without matches. If you want exactly the output you listed in your question, just concatenate the two lists like this:
matches, no_matches = process_data(vendor_data)
matches + no_matches # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']
I have tried to come up with a decent answer (certainly not the cleanest one)
import json
from itertools import zip_longest
from fuzzywuzzy import fuzz
synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())
vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11thGeneration)", "hard drive", "ddr 8gb", "something1",
"something2",
"something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]
vendor = []
buyer = []
for k, k2 in zip_longest(vendor_data, buyer_data):
has_matched = False
for item, value in synonyms.items():
for v in value:
if fuzz.token_set_ratio(k, v) > 70:
if item in k:
vendor.append(k)
else:
vendor.append(item + " " + k)
if has_matched or k2 is None:
break
else:
has_matched = True
if fuzz.token_set_ratio(k2, v) > 70:
if item in k2:
buyer.append(k2)
else:
buyer.append(item + " " + k2)
if has_matched or k is None:
break
else:
has_matched = True
else:
continue # match not found
break # match is found
else: # only evaluates on normal loop end
# Only something strings
# do something with the new input values
continue
vendor = list(set(vendor))
buyer = list(set(buyer))
I hope you can achieve what you want with this code. Check the docs if you don't know what a for else loop does. TLDR: the else clause executes when the loop terminates normally (not with a break). Note that I put the synonyms loop inside the data loop. This is because we can't certainly know in which synonym group the data belongs, also somethimes the vendor data entry is a processor while the buyer data is memory. Also note that I have assumed an item can't match more than 1 time. If this could be the case you would need to make a more advanced check (just make a counter and break when the counter equals 2 for example).
EDIT:
I took another look at the question and came up with maybe a better answer:
v_dict = dict()
for spec in vendor_data[:]:
for item, choices in synonyms.items():
if process.extractOne(spec, choices)[1] > 70: # don't forget to import process from fuzzywuzzy
v_dict[spec] = item
break
else:
v_dict[spec] = "Something new"
This code matches the strings to the correct type. for example {'i7 processor': 'processor', 'solid state': 'ssd', 'Corei5 :1135G7 (11thGeneration)': 'processor', 'hard drive': 'ssd', 'ddr 8gb': 'ram', 'something1': 'Something new', 'something2': 'Something new', 'something3': 'Something new', 'HT (100W) DDR4-2400': 'ram'}. You can change the "Something new" with watherver you like. You could also do: v_dict[spec] = 0 (on a match) and v_dict[spec] = 1 (on no match). You could then sort the dict ->
it = iter(v_dict.values())
print(sorted(v_dict.keys(), key=lambda x: next(it)))
Which would give the wanted results (more or less), all the recognised items will be first, and then all the unrecognised items. You could do some more advanced sorting on this dict if you want. I think this code gives you enough flexibility to reach your goal.
If I understand correctly, what you are trying to do is match keywords specified by a customer and/or vendor against a predefined database of keywords you have.
First, I would highly recommend using a reversed mapping of the synonyms, so it's faster to lookup, especially when the dataset will grow.
Second, considering the fuzzywuzzy API, it looks like you simply want the best match, so extractOne is a solid choice for that.
Now, extractOne returns the best match and a score:
>>> process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
I would split the algorithm into two:
A generic part that simply gets the best match, which should always exist (even if it's not a great one)
A filter, where you could adjust the sensitivity of the algorithm, based on different criteria of your application. This sensitivity threshold should set the minimal match quality. If you're below this threshold, just use "untagged" for the category for example.
Here is the final code, which I think is very simple and easy to understand and expand:
import json
from fuzzywuzzy import process
def load_synonyms():
with open('synonyms.json') as fin:
synonyms = json.load(fin)
# Reversing the map makes it much easier to lookup
reversed_synonyms = {}
for key, values in synonyms.items():
for value in values:
reversed_synonyms[value] = key
return reversed_synonyms
def load_vendor_data():
return [
"i7 processor",
"solid state",
"Corei5 :1135G7 (11thGeneration)",
"hard drive",
"ddr 8gb",
"something1",
"something2",
"something3",
"HT (100W) DDR4-2400"
]
def load_customer_data():
return [
"i7 processor 12 generation",
"corei7:latest technology"
]
def get_tag(keyword, synonyms):
THRESHOLD = 80
DEFAULT = 'general'
tag, score = process.extractOne(keyword, synonyms.keys())
return synonyms[tag] if score > THRESHOLD else DEFAULT
def main():
synonyms = load_synonyms()
customer_data = load_customer_data()
vendor_data = load_vendor_data()
data = customer_data + vendor_data
tags_dict = { keyword: get_tag(keyword, synonyms) for keyword in data }
print(json.dumps(tags_dict, indent=4))
if __name__ == '__main__':
main()
When running with the specified inputs, the output is:
{
"i7 processor 12 generation": "processor",
"corei7:latest technology": "processor",
"i7 processor": "processor",
"solid state": "ssd",
"Corei5 :1135G7 (11thGeneration)": "processor",
"hard drive": "hdd",
"ddr 8gb": "ram",
"something1": "general",
"something2": "general",
"something3": "general",
"HT (100W) DDR4-2400": "ram"
}

How to avoid very long if-elif-elif-else statements in Python function

Is there a smart way to shorten very long if-elif-elif-elif... statements?
Let's say I have a function like this:
def very_long_func():
something = 'Audi'
car = ['VW', 'Audi', 'BMW']
drinks = ['Cola', 'Fanta', 'Pepsi']
countries = ['France', 'Germany', 'Italy']
if something in car:
return {'type':'car brand'}
elif something in drinks:
return {'type':'lemonade brand'}
elif something in countries:
return {'type':'country'}
else:
return {'type':'nothing found'}
very_long_func()
>>>> {'type': 'car brand'}
The actual function is much longer than the example. What would be the best way to write this function (not in terms of speed but in readability)
I was reading this, but I have trouble to apply it to my problem.
You can't hash lists as dictionary values. So go other way round. Create a mapping of type -> list. And initialize your output with the default type. This allows you to keep on adding new types to your mapping without changing any code.
def very_long_func():
something = 'Audi'
car = ['VW', 'Audi', 'BMW']
drinks = ['Cola', 'Fanta', 'Pepsi']
countries = ['France', 'Germany', 'Italy']
out = {'type': 'nothing found'} # If nothing matches
mapping = {
'car brand': car,
'lemonade brand': drinks,
'country': countries
}
for k,v in mapping.items() :
if something in v:
out['type'] = k # update if match found
break
return out # returns matched or default value
you can create dictionary like this and then use map_dict.
from functools import reduce
car = ['VW', 'Audi', 'BMW']
drinks = ['Cola', 'Fanta', 'Pepsi']
countries = ['France', 'Germany', 'Italy']
li = [car, drinks, countries]
types = ['car brand', 'lemonade brand', 'country', 'nothing found']
dl = [dict(zip(l, [types[idx]]*len(l))) for idx, l in enumerate(li)]
map_dict = reduce(lambda a, b: dict(a, **b), dl)
Try this:
def create_dct(lst, flag):
return {k:flag for k in lst}
car = ['VW', 'Audi', 'BMW']
drinks = ['Cola', 'Fanta', 'Pepsi']
countries = ['France', 'Germany', 'Italy']
merge_dcts = {}
merge_dcts.update(create_dct(car, 'car brand'))
merge_dcts.update(create_dct(drinks, 'lemonade brand'))
merge_dcts.update(create_dct(countries, 'country'))
something = 'Audi'
try:
print("type: ", merge_dcts[something])
except:
print("type: nothing found")
You can simulate a switch statement with a helper function like this:
def switch(v): yield lambda *c: v in c
The your code could be written like this:
something = 'Audi'
for case in switch(something):
if case('VW', 'Audi', 'BMW'): name = 'car brand' ; break
if case('Cola', 'Fanta', 'Pepsi'): name = 'lemonade brand' ; break
if case('France', 'Germany', 'Italy'): name = 'country' ; break
else: name = 'nothing found'
return {'type':name}
If you don't have specific code to do for each value, then a simple mapping dictionary would probably suffice. For ease of maintenance, you can start with a category-list:type-name mapping and expand it before use:
mapping = { ('VW', 'Audi', 'BMW'):'car brand',
('Cola', 'Fanta', 'Pepsi'):'lemonade brand',
('France', 'Germany', 'Italy'):'country' }
mapping = { categ:name for categs,name in mapping.items() for categ in categs }
Then your code will look like this:
something = 'Audi'
return {'type':mapping.get(something,'nothing found')}
using a defaultdict would make this even simpler to use by providing the 'nothing found' value automatically so you could write: return {'type':mapping[something]}

multiple separator in a string python

text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
I have this kind of string. I am facing the problem which splits it to 2 lists. Output will be approximately like this :
name = ['Brand','Color','Type','Power Source']
value = ['Smart Plane','Yellow','Sandwich Maker','Electrical']
Is there any solution for this.
name = []
value = []
text = text.split('.#/')
for i in text:
i = i.split('.*/')
name.append(i[0])
value.append(i[1])
This is one approach using re.split and list slicing.
Ex:
import re
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
data = [i for i in re.split("[^A-Za-z\s]+", text) if i]
name = data[::2]
value = data[1::2]
print(name)
print(value)
Output:
['Brand', 'Color', 'Type', 'Power Source']
['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical']
You can use regex to split the text, and populate the lists in a loop.
Using regex you protect your code from invalid input.
import re
name, value = [], []
for ele in re.split(r'\.#\/', text):
k, v = ele.split('.*/')
name.append(k)
value.append(v)
>>> print(name, val)
['Brand', 'Color', 'Type', 'Power Source'] ['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical.']
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
name=[]
value=[]
word=''
for i in range(len(text)):
temp=i
if text[i]!='.' and text[i]!='/' and text[i]!='*' and text[i]!='#':
word=word+''.join(text[i])
elif temp+1<len(text) and temp+2<=len(text):
if text[i]=='.' and text[temp+1]=='*' and text[temp+2]=='/':
name.append(word)
word=''
elif text[i]=='.' and text[temp+1]=='#' and text[temp+2]=='/':
value.append(word)
word=''
else:
value.append(word)
print(name)
print(value)
this will be work...

Trouble getting right values against each item

I'm trying to parse the item names and it's corresponding values from the below snippet. dt tag holds names and dd containing values. There are few dt tags which do not have corresponding values. So, all the names do not have values. What I wish to do is keep the values blank against any name if the latter doesn't have any values.
These are the elements I would like to scrape data from:
content="""
<div class="movie_middle">
<dl>
<dt>Genres:</dt>
<dt>Resolution:</dt>
<dd>1920*1080</dd>
<dt>Size:</dt>
<dd>1.60G</dd>
<dt>Quality:</dt>
<dd>1080p</dd>
<dt>Frame Rate:</dt>
<dd>23.976 fps</dd>
<dt>Language:</dt>
</dl>
</div>
"""
I've tried like below:
soup = BeautifulSoup(content,"lxml")
title = [item.text for item in soup.select(".movie_middle dt")]
result = [item.text for item in soup.select(".movie_middle dd")]
vault = dict(zip(title,result))
print(vault)
It gives me messy results (wrong pairs):
{'Genres:': '1920*1080', 'Resolution:': '1.60G', 'Size:': '1080p', 'Quality:': '23.976 fps'}
My expected result:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p','Frame Rate:':'23.976 fps','Language:':''}
Any help on fixing the issue will be highly appreciated.
You can loop through the elements inside dl. If the current element is dt and the next element is dd, then store the value as the next element, else set the value as empty string.
dl = soup.select('.movie_middle dl')[0]
elems = dl.find_all() # Returns the list of dt and dd
data = {}
for i, el in enumerate(elems):
if el.name == 'dt':
key = el.text.replace(':', '')
# check if the next element is a `dd`
if i < len(elems) - 1 and elems[i+1].name == 'dd':
data[key] = elems[i+1].text
else:
data[key] = ''
You can use BeautifulSoup to parse the dl structure, and then write a function to create the dictionary:
from bs4 import BeautifulSoup as soup
import re
def parse_result(d):
while d:
a, *_d = d
if _d:
if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
yield [a[4:-5], _d[0][4:-5]]
d = _d[1:]
else:
yield [a[4:-5], '']
d = _d
else:
yield [a[4:-5], '']
d = []
print(dict(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1])))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
For a slightly longer, although cleaner solution, you can create a decorator to strip the HTML tags of the output, thus removing the need for the extra string slicing in the main parse_result function:
def strip_tags(f):
def wrapper(data):
return {a[4:-5]:b[4:-5] for a, b in f(data)}
return wrapper
#strip_tags
def parse_result(d):
while d:
a, *_d = d
if _d:
if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
yield [a, _d[0]]
d = _d[1:]
else:
yield [a, '']
d = _d
else:
yield [a, '']
d = []
print(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1]))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
from collections import defaultdict
test = soup.text.split('\n')
d = defaultdict(list)
for i in range(len(test)):
if (':' in test[i]) and (':' not in test[i+1]):
d[test[i]] = test[i+1]
elif ':' in test[i]:
d[test[i]] = ''
d
defaultdict(list,
{'Frame Rate:': '23.976 fps',
'Genres:': '',
'Language:': '',
'Quality:': '1080p',
'Resolution:': '1920*1080',
'Size:': '1.60G'})
The logic here is that you know that every key will have a colon. Knowing this, you can write an if else statement to capture the unique combinations, whether that is key followed by key or key followed by value
Edit:
In case you wanted to clean your keys, below replaces the : in each one:
d1 = { x.replace(':', ''): d[x] for x in d.keys() }
d1
{'Frame Rate': '23.976 fps',
'Genres': '',
'Language': '',
'Quality': '1080p',
'Resolution': '1920*1080',
'Size': '1.60G'}
The problem is that empty elements are not present. Since there is no hierarchy between the <dt> and the <dd>, I'm afraid you'll have to craft the dictionary yourself.
vault = {}
category = ""
for item in soup.find("dl").findChildren():
if item.name == "dt":
if category == "":
category = item.text
else:
vault[category] = ""
category = ""
elif item.name == "dd":
vault[category] = item.text
category = ""
Basically this code iterates over the child elements of the <dl> and fills the vault dictionary with the values.

Categories