Trouble getting right values against each item - python

I'm trying to parse the item names and it's corresponding values from the below snippet. dt tag holds names and dd containing values. There are few dt tags which do not have corresponding values. So, all the names do not have values. What I wish to do is keep the values blank against any name if the latter doesn't have any values.
These are the elements I would like to scrape data from:
content="""
<div class="movie_middle">
<dl>
<dt>Genres:</dt>
<dt>Resolution:</dt>
<dd>1920*1080</dd>
<dt>Size:</dt>
<dd>1.60G</dd>
<dt>Quality:</dt>
<dd>1080p</dd>
<dt>Frame Rate:</dt>
<dd>23.976 fps</dd>
<dt>Language:</dt>
</dl>
</div>
"""
I've tried like below:
soup = BeautifulSoup(content,"lxml")
title = [item.text for item in soup.select(".movie_middle dt")]
result = [item.text for item in soup.select(".movie_middle dd")]
vault = dict(zip(title,result))
print(vault)
It gives me messy results (wrong pairs):
{'Genres:': '1920*1080', 'Resolution:': '1.60G', 'Size:': '1080p', 'Quality:': '23.976 fps'}
My expected result:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p','Frame Rate:':'23.976 fps','Language:':''}
Any help on fixing the issue will be highly appreciated.

You can loop through the elements inside dl. If the current element is dt and the next element is dd, then store the value as the next element, else set the value as empty string.
dl = soup.select('.movie_middle dl')[0]
elems = dl.find_all() # Returns the list of dt and dd
data = {}
for i, el in enumerate(elems):
if el.name == 'dt':
key = el.text.replace(':', '')
# check if the next element is a `dd`
if i < len(elems) - 1 and elems[i+1].name == 'dd':
data[key] = elems[i+1].text
else:
data[key] = ''

You can use BeautifulSoup to parse the dl structure, and then write a function to create the dictionary:
from bs4 import BeautifulSoup as soup
import re
def parse_result(d):
while d:
a, *_d = d
if _d:
if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
yield [a[4:-5], _d[0][4:-5]]
d = _d[1:]
else:
yield [a[4:-5], '']
d = _d
else:
yield [a[4:-5], '']
d = []
print(dict(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1])))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
For a slightly longer, although cleaner solution, you can create a decorator to strip the HTML tags of the output, thus removing the need for the extra string slicing in the main parse_result function:
def strip_tags(f):
def wrapper(data):
return {a[4:-5]:b[4:-5] for a, b in f(data)}
return wrapper
#strip_tags
def parse_result(d):
while d:
a, *_d = d
if _d:
if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
yield [a, _d[0]]
d = _d[1:]
else:
yield [a, '']
d = _d
else:
yield [a, '']
d = []
print(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1]))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}

from collections import defaultdict
test = soup.text.split('\n')
d = defaultdict(list)
for i in range(len(test)):
if (':' in test[i]) and (':' not in test[i+1]):
d[test[i]] = test[i+1]
elif ':' in test[i]:
d[test[i]] = ''
d
defaultdict(list,
{'Frame Rate:': '23.976 fps',
'Genres:': '',
'Language:': '',
'Quality:': '1080p',
'Resolution:': '1920*1080',
'Size:': '1.60G'})
The logic here is that you know that every key will have a colon. Knowing this, you can write an if else statement to capture the unique combinations, whether that is key followed by key or key followed by value
Edit:
In case you wanted to clean your keys, below replaces the : in each one:
d1 = { x.replace(':', ''): d[x] for x in d.keys() }
d1
{'Frame Rate': '23.976 fps',
'Genres': '',
'Language': '',
'Quality': '1080p',
'Resolution': '1920*1080',
'Size': '1.60G'}

The problem is that empty elements are not present. Since there is no hierarchy between the <dt> and the <dd>, I'm afraid you'll have to craft the dictionary yourself.
vault = {}
category = ""
for item in soup.find("dl").findChildren():
if item.name == "dt":
if category == "":
category = item.text
else:
vault[category] = ""
category = ""
elif item.name == "dd":
vault[category] = item.text
category = ""
Basically this code iterates over the child elements of the <dl> and fills the vault dictionary with the values.

Related

Get all the keys of a nested dict

With xmltodict I managed to get my code from xml in a dict and now I want to create an excel.
In this excel the header of a value is going to be all the parents (keys in the dict).
For example:
dict = {"name":"Pete", "last-name": "Pencil", "adres":{"street": "example1street", "number":"5", "roommate":{"gender":"male"}}}
The value male will have the header: adres/roommate/gender.
Here's a way to orgainze the data in the way your question asks:
d = {"name":"Pete", "last-name": "Pencil", "adres":{"street": "example1street", "number":"5", "roommate":{"gender":"male"}}}
print(d)
stack = [('', d)]
headerByValue = {}
while stack:
name, top = stack.pop()
if isinstance(top, dict):
stack += (((name + '/' if name else '') + k, v) for k, v in top.items())
else:
headerByValue[name] = top
print(headerByValue)
Output:
{'adres/roommate/gender': 'male',
'adres/number': '5',
'adres/street': 'example1street',
'last-name': 'Pencil',
'name': 'Pete'}

How to map nested list to flat values

I`m trying to parse a spreadsheet with a header that looks something like this:
My problem is those nested keys below "Контрагент". I decided to parse it like this:
['Дата',
'Номер документа',
'Дебет',
'Кредит',
['Контрагент',
['Наименование', 'ИНН', 'КПП', 'Счет', 'БИК', 'Наименование банка']],
'Назначение платежа',
'Код дебитора',
'Тип документа']
But now, I don`t really have an idea as how to map it to a flat list of values:
['21.05.2021',
'591324565436',
'0.00',
'526345428.99',
'asdasd',
'234525460140679',
'77130100123412341',
'302328105423534200000000280',
'0445252345234974',
'asdfsadfsd',
'sdfghsfgdhfdghdfgh',
'',
'dfghfgdhfdgh']
Given these variables, I want a function to return following dict:
{
"Дата": "21.05.2021",
"Номер документа": "591324565436",
"Дебет": "0.00",
"Кредит": "526345428.99",
"Контрагент": {
"Наименование": "asdasd",
"ИНН": "234525460140679",
"КПП": "77130100123412341",
"Счет": "302328105423534200000000280",
"БИК": "0445252345234974",
"Наименование банка": "asdfsadfsd"
},
"Назначение платежа": "sdfghsfgdhfdghdfgh",
"Код дебитора": "",
"Тип документа": "dfghfgdhfdgh"
}
I've gone this far without realizing it'd be raising IndexError on the 3rd line:
def map_to_schema(schema, data):
for i, elem in enumerate(data):
key = schema[i]
if isinstance(key, list):
if key[0] not in result:
result[key[0]] = {}
result[key[0]] |= {
key[1][i-len(key)]: elem
}
else:
result[key] = elem
What should I do? Maybe the structure for the schema isn't good enough? I really have no idea...
You could use a dictionary comprehension and an iterator:
headers = ['Дата', 'Номер документа', 'Дебет', 'Кредит',
['Контрагент', ['Наименование', 'ИНН', 'КПП', 'Счет', 'БИК', 'Наименование банка']],
'Назначение платежа', 'Код дебитора', 'Тип документа']
values = ['21.05.2021', '591324565436', '0.00', '526345428.99', 'asdasd', '234525460140679', '77130100123412341',
'302328105423534200000000280', '0445252345234974', 'asdfsadfsd', 'sdfghsfgdhfdghdfgh', '',
'dfghfgdhfdgh']
it = iter(values)
out = {k[0] if (islist := isinstance(k, list)) else k:
{k2: next(it) for k2 in k[1]} if islist else next(it)
for k in headers}
output:
{'Дата': '21.05.2021',
'Номер документа': '591324565436',
'Дебет': '0.00',
'Кредит': '526345428.99',
'Контрагент': {'Наименование': 'asdasd',
'ИНН': '234525460140679',
'КПП': '77130100123412341',
'Счет': '302328105423534200000000280',
'БИК': '0445252345234974',
'Наименование банка': 'asdfsadfsd'},
'Назначение платежа': 'sdfghsfgdhfdghdfgh',
'Код дебитора': '',
'Тип документа': 'dfghfgdhfdgh'}
Thanks #mozway for this solution! This is essentially the same algorithm, using a for loop.
def map(schema, s_length, row: list):
# If len(row) was less then *true* schema length, it would have thrown StopIteration.
# I ended up just extending row list by delta elements.
if (delta := s_length - len(row)) > 0:
row.extend([""] * delta)
iter_row = iter(row)
result = {}
for key in schema:
if isinstance(key, list):
result[key[0]] = {}
for sub_key in key[1]:
result[key[0]][sub_key] = next(iter_row)
else:
result[key] = next(iter_row)
return result

Python asside function returns none [duplicate]

This question already has answers here:
Recursive function does not return specified value
(2 answers)
Closed 1 year ago.
I have the orders of a platform in a csv file and I want to organize it to put in a database (probably mongodb). So the final data structure may be:
{'user1':
{'meals': {'hamburger': 2}, {'pizza': 1},
{'weekdays': {'monday': 1}},{'tuesday':1}, {'friday': 1}
}
So, I am trying to do a very simple code that organize the number of ocurrencies, but before I want to structure the dictionary keys dinamically. The data is in a csv file
import csv
import pprint
def addKeysNestedDict(dictionary, keys):
# print(dictionary)
if len(keys) > 1:
if keys[0] not in dictionary.keys():
dictionary[keys[0]] = {'meals': '', 'weekdays': ''}
print('inside addKeys IF: ')
print(dictionary)
addKeysNestedDict(dictionary, keys[1:])
else:
dictionary[keys[0]] = {'meals': '', 'weekdays': ''}
print('inside addKeys: ')
print(dictionary) # PRINTS EXPECTED VALUES
return dictionary
def organize_orders(file):
pp = pprint.PrettyPrinter(indent=4)
dict_read = csv.DictReader(file, fieldnames=['name', 'food', 'day'])
list_dict = list(dict_read)
set_names = set()
set_food = set()
set_day = set()
# people = {} # DOESN'T MATTER IF IT IS DECLARED BOFORE
for n in list_dict:
set_names.add(n['name'])
set_food.add(n['food'])
set_day.add(n['day'])
print(addKeysNestedDict({}, list(set_names))) # PRINTS NONE
people = (addKeysNestedDict({}, list(set_names)))
print('people: ')
print(people) # PRINTS NONE
# for s_n in set_names:
# for s_f in set_food:
# for s_d in set_day:
# people[s_n][s_f] = list_dict.count(s_f)
# people[s_n][s_d] = list_dict.count(s_d)
def analyze_log(path_to_file):
with open(path_to_file) as csvfile:
return(organize_orders(csvfile))
analyze_log('some csv file with path')
I culd not figure out why I get none returned from addKeysNestedDict() method, since it prints exactly what I want one line before method return
Change the function to:
def addKeysNestedDict(dictionary, keys):
# print(dictionary)
if len(keys) > 1:
if keys[0] not in dictionary.keys():
dictionary[keys[0]] = {'meals': '', 'weekdays': ''}
print('inside addKeys IF: ')
print(dictionary)
addKeysNestedDict(dictionary, keys[1:])
else:
dictionary[keys[0]] = {'meals': '', 'weekdays': ''}
print('inside addKeys: ')
print(dictionary) # PRINTS EXPECTED VALUES
return dictionary
Notice that I moved the return dictionary statement out of the specific branch, which causes both of your branches to return the dictionary you're building in your function.

multiple separator in a string python

text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
I have this kind of string. I am facing the problem which splits it to 2 lists. Output will be approximately like this :
name = ['Brand','Color','Type','Power Source']
value = ['Smart Plane','Yellow','Sandwich Maker','Electrical']
Is there any solution for this.
name = []
value = []
text = text.split('.#/')
for i in text:
i = i.split('.*/')
name.append(i[0])
value.append(i[1])
This is one approach using re.split and list slicing.
Ex:
import re
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
data = [i for i in re.split("[^A-Za-z\s]+", text) if i]
name = data[::2]
value = data[1::2]
print(name)
print(value)
Output:
['Brand', 'Color', 'Type', 'Power Source']
['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical']
You can use regex to split the text, and populate the lists in a loop.
Using regex you protect your code from invalid input.
import re
name, value = [], []
for ele in re.split(r'\.#\/', text):
k, v = ele.split('.*/')
name.append(k)
value.append(v)
>>> print(name, val)
['Brand', 'Color', 'Type', 'Power Source'] ['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical.']
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical."
name=[]
value=[]
word=''
for i in range(len(text)):
temp=i
if text[i]!='.' and text[i]!='/' and text[i]!='*' and text[i]!='#':
word=word+''.join(text[i])
elif temp+1<len(text) and temp+2<=len(text):
if text[i]=='.' and text[temp+1]=='*' and text[temp+2]=='/':
name.append(word)
word=''
elif text[i]=='.' and text[temp+1]=='#' and text[temp+2]=='/':
value.append(word)
word=''
else:
value.append(word)
print(name)
print(value)
this will be work...

Categorizing sentence using dictionary

I am using below function for getting categorizing sentence in themes
def theme(x):
output =[]
category = ()
for i in x:
if 'AC' in i:
category = 'AC problem'
elif 'insects' in i:
category = 'Cleanliness'
elif 'clean' in i:
category = 'Cleanliness'
elif 'food' in i:
category = 'Food Problem'
elif 'delay' in i:
category = 'Train Delayed'
else:
category = 'None'
output.append(category)
return output
I don't want to use repeated if statements for every word in a category. Instead I want the i give a list/dictionary e.g. Cleanliness = ['Clean', 'Cleaned', 'spoilt', 'dirty'] for getting category 'Cleanliness' against the sentence if it has any of the words in list. How can i do that
You can use a dict of sets to structure your words with categories, and then generate a word-to-category lookup dict based on the said structure:
categories = {
'Cleanliness': {'insects', 'clean'},
'AC Problem': {'AC'},
'Food Problem': {'food'},
'Train Delayed': {'delay'}
}
lookup = {word: category for category, words in categories.items() for word in words}
def theme(x):
return {lookup.get(word, 'None') for word in x}
so that theme(['AC', 'clean', 'insects']) would return a set of corresponding categories:
{'Cleanliness', 'AC Problem'}
This should do what you're asking. I set all the keys to lowercase and converted i to lowercase when checking if you get a match, but with different capitalization, it still counts.
def theme(x):
output =[]
category = ()
myDict = {"ac":"AC problem", "insects":"Cleanliness", "clean":"Cleanliness", "food":"Food Problem", "delay":"Train Delayed"} #I reccomend coming up with a more suitable name for your dictionary in your actual program
for i in x:
if i.lower() in myDict: #Checks to see if i is in the dictionary before trying to print the result; prevents possible Key Errors
category = (myDict[i.lower()]) #If it is in the dictionary it category will be set to the result of the key
output.append(category)
else:
output.append("None") #If i isn't in the dictionary output will append None instead
return output
Here's some examples:
>>>print(theme(['Clean', 'Cleaned', 'spoilt', 'dirty']))
['Cleanliness', 'None', 'None', 'None']
>>>print(theme(['Delay', 'Ham', 'Cheese', 'Insects']))
['Train Delayed', 'None', 'None', 'Cleanliness']
I have worked out a another way:
def theme(x):
output = []
for i in x:
if set(cleanliness).intersection(i.lower().split()):
category = 'clean'
elif set(ac_problem).intersection(i.lower().split()):
category = 'ac problem'
else:
category = 'none'
output.append(category)
return output
Maybe you can do it like this:
def theme(x):
output = []
name_dic = {"AC": "AC problem",
"clean": "Cleanliness",
"food": "Food Problem"
}
for e in x:
output.append(name_dic.get(e))
return output
Or more exactly like this:
def theme(x):
output = []
name_list = [
("AC", "AC problem"),
("clean", "Cleanliness"),
("insects", "Cleanliness"),
("food", "Food Problem")
]
name_dic = dict(name_list)
for e in x:
output.append(name_dic.get(e))
return output
Hope it helps.

Categories