ParseError parsing empty valued xml doc - python

There are lots of articles about parsing XML with ElementTree. I've gone through a bunch of them and read through the docs, but I can't come up with a solution that works for me. I'm trying to supplement info that's created by another app in an nfo file, but I need to preserve the conventions in the file.
Here is an example of how the file is laid out:
<title>
  <name>Test Name</name>
  <alt name />
  <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>
  <file local="" type="excel">http://filestore/file2.xls</file>
  <file local="C:\file\file3.xls" type="excel" />
  <file local="" type="ppt" />
</title>
Note: elements are not closed properly, e.g.
<alt name /> should be <alt name></alt name>
This is what I'm running...
import xml.etree.ElementTree as ET
tree = ET.parse('file.nfo')
root = tree.getroot()
The error I'm getting is...
xml.etree.ElementTree.ParseError: not well-formed (invalid token):
I've tried...
myparser = ET.XMLParser(encoding='UTF-8')
tree = ET.parse('file.nfo', myparser)
I also tried xmlparser and opening the file with codecs, but I'm pretty sure it's the formatting. I'm guessing the immediate issue is the non-escaped >, but I suspect ElementTree also needs proper opening/closing tags?
I'm sure I could open this file and go through it with regex, but I was hoping to use ElementTree.
The end goal is to have the details from the nfo as a dictionary that looks like...
dict = {'title': [{'name': 'Test Name',
                   'alt name': '',
                   'file': [{'local': 'C:\file\file1.doc', 'type': 'word', 'url': 'http://filestore/file1.doc'},
                            {'local': '', 'type': 'excel', 'url': 'http://filestore/file2.xls'},
                            {'local': 'C:\file\file3.xls', 'type': 'excel', 'url': ''},
                            {'local': '', 'type': 'ppt', 'url': ''}]
                   }]}
I'm sure there is a better (more Pythonic) way to do this, but I'm pretty new to Python.
Any help would be appreciated.
EDIT: I'm also trying to avoid using 3rd party libraries if possible

So I ended up creating a custom parser of sorts; it's not ideal, but it works. It was suggested to me that lxml and html.parser may handle malformed XML better, but I just went with this.
I'm still very interested in any feedback, whether on this approach or on any other method.
import re

def merge_dicts(*dict_args):
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

def make_dict(str_arg, op):
    result = {}
    result = dict(s.split(op) for s in str_arg.split(","))
    return result
'''
Samples
lst = r' <name>Test Name</name>'
lst = r' <alt name />'
lst = r' <file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
lst = r' <file local="" type="excel">http://filestore/file2.xls</file>'
lst = r' <file local="C:\file\file3.xls" type="excel" />'
lst = r' <file local="" type="ppt" />'
'''
def match_pattern(file_str):
    #<description>desc blah</description>'
    pattern1 = r'''(?x)
        ^
        \s*                                      # cut leading whitespace
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b  # word boundary, so we can
          >                                      # skip attributes
          (?P<tag_body> .+? )                    # insides
          </ (?P<tag_close> (\w+?|\w*\s\w+?)+) > # closing tag, nothing interesting
        )
        $'''
    #<alt name />
    pattern2 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+?)+) \b
          \s/>
        )
        $'''
    #<file local="C:\file\file1.doc" type="word">http://filestore/file1.doc</file>'
    pattern3 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
          \s
          (?P<tag_attrib1> (\w*\=.*?))           # 1st attribute
          \s
          (?P<tag_attrib2> (\w*\=.*))            # 2nd attribute
          .*? >
          (?P<tag_body> .+? )
          </ (?P<tag_close> (\w+?|\w*\s\w+?)+) >
        )
        $'''
    #<file local="" type="ppt" />
    pattern4 = r'''(?x)
        ^
        \s*
        (?P<whole_thing>
          < (?P<tag_open> (\w+?|\w*\s\w+!=?)+) \b
          \s
          (?P<tag_attrib1> (\w*\=.*?))           # 1st attribute
          \s
          (?P<tag_attrib2> (\w*\=.*))            # 2nd attribute
          \s/>
        )
        $'''
    pat_str = 'pattern'
    pat_val = 1
    return_dict = {}
    while (pat_val <= 4):
        pattern = pat_str + str(pat_val)
        matchObj = re.match(eval(pattern), file_str, re.L|re.M)
        if matchObj:
            #for k, v in matchObj.groupdict().items():
            #    print('matchObj.group({!r}) == {!r}'.format(k, v))
            if pat_val == 1:
                body = matchObj.group('tag_body')
                return_dict = {matchObj.group('tag_open'): body}
            elif pat_val == 2:
                return_dict = {matchObj.group('tag_open'): ''}
            elif pat_val == 3:
                attr1 = make_dict(matchObj.group('tag_attrib1'), '=')
                attr2 = make_dict(matchObj.group('tag_attrib2'), '=')
                body = {'url': matchObj.group('tag_body')}
                attrib = merge_dicts(attr1, attr2, body)
                return_dict = {matchObj.group('tag_open'): attrib}
            elif pat_val == 4:
                attr1 = make_dict(matchObj.group('tag_attrib1'), '=')
                attr2 = make_dict(matchObj.group('tag_attrib2'), '=')
                body = {'url': ''}
                attrib = merge_dicts(attr1, attr2, body)
                return_dict = {matchObj.group('tag_open'): attrib}
            return return_dict
        else:
            pat_val = pat_val + 1
            if pat_val > 4:
                print("No match!!")
#print(match_pattern(lst))
def in_file(file):
    result = {}
    with open(file, "r") as file:
        data = (file.read().splitlines())
        for d in data:
            if data.index(d) == 0 or data.index(d) == len(data)-1:
                if data.index(d) == 0:
                    print(re.sub('<|/|>', '', d))
            elif d:
                lst = []
                dct = {}
                if 'file' in match_pattern(d).keys():
                    for i in match_pattern(d).items():
                        if 'file' in result.keys():
                            lst = result['file']
                            lst.append(i[1])
                            dct = {i[0]: lst}
                            result = merge_dicts(result, dct)
                            #print(result['file'])
                        else:
                            dct = {i[0]: [i[1]]}
                            result = merge_dicts(result, dct)
                else:
                    result = merge_dicts(result, match_pattern(d))
                    print('else', match_pattern(d))
    return result

print(in_file('C:\\test.nfo'))
NOTE: I dropped the top most dictionary from the original post
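For anyone who can accept a third-party dependency after all, the lxml suggestion mentioned above might look roughly like the sketch below. This is untested against the real file: lxml's recover=True parser is generally forgiving of malformed markup, but whether it copes with the space inside <alt name /> would need checking.
from lxml import etree

# recover=True asks lxml to keep parsing past well-formedness errors
parser = etree.XMLParser(recover=True)
tree = etree.parse('file.nfo', parser)
root = tree.getroot()
for child in root:
    print(child.tag, child.attrib, child.text)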

Related

Transform Nested XML

I am currently looking to parse a nested XML file into a pandas DataFrame so I can generate a CSV where each column is an element name and its value is the element text, but I am having some issues parsing the information out. Below is an example of the nested XML and what I have tried.
The XML can be quite large, with hundreds of different records. This is what I tried:
##Import modules
import xml.etree.ElementTree as ET
import pandas as pd
from lxml import etree

tree = ET.parse("File.xml")
root = tree.getroot()

for subelement in root:
    for subsub in subelement:
        print(subsub.tag, ",", subsub.text, subsub.attrib, subsub.items())

for subelement in root:
    for subsub in subelement:
        for subsubsub in subsub:
            print(subsubsub.tag, ",", subsubsub.text, subsubsub.attrib)
<?xml version="1.0" encoding="utf-16"?>
<test1 xmlns="test.xsd">
  <test2 ID="123123123" test3="123123">
    <test3>Separate</test3>
    <test4>AA</test4>
    <Comments>BB</Comments>
    <test5>
      <test6 ID="123123">
        <test3>today</test3>
        <test7>123 street</test7>
      </test6>
    </test5>
    <test8>
      <test10 ID="434234">
        <test3>type of work</test3>
        <test9>test work</test9>
      </test10>
    </test8>
    <test11>
      <test12 ID="234234234">
        <test3>Social</test3>
        <test14>test</test14>
      </test12>
      <test12 ID="123123">
        <test3>Something Here</test3>
        <test13>Some date</test13>
        <test14>123123124433</test14>
      </test12>
    </test11>
    <test15>
      <test16 ID="6456456456">
        <test3>Something Something</test3>
        <test14>746745636</test14>
      </test16>
    </test15>
  </test2>
  <test2 ID="353453245" test3="list of something">
    <test3>Somewhere</test3>
    <test4>Someone</test4>
    <Comments>Some comment</Comments>
    <test5>
      <test6 ID="567456756">
        <test3>Not today</test3>
        <test7>5634643643</test7>
        <test17>Some Info</test17>
        <test19>Somewhere</test19>
        <test18>63243333</test18>
      </test6>
    </test5>
    <test11>
      <test12 ID="456436346">
        <test3>Pattern</test3>
        <test14>436346346</test14>
      </test12>
      <test12 ID="4364356">
        <test3> ID</test3>
        <test14>5674567457</test14>
      </test12>
      <test12 ID="123123123443">
        <test3>Other ID</test3>
        <test13>54234532452345</test13>
        <test14>231423532452345</test14>
      </test12>
    </test11>
    <test15>
      <test16 ID="34252345">
        <test3>None test</test3>
        <test14>456436436346</test14>
      </test16>
    </test15>
  </test2>
</test1>
Update: So would the full code look something like this?
###TEST USING EXAMPLE HOTLIST
with open("file.csv", "w", newline='') as fout:
    header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
    csvout = csv.DictWriter(fout, fieldnames=header)
    csvout.writeheader()
    row = {}
    for _, elem in ET.iterparse('file.xml'):
        # strip the namespace from the element tag name; e.g. {Test.xsd}test14 -> test14
        tag = re.sub("^{.*?}", "", elem.tag)
        if tag == 'test2':
            if len(row) != 0:
                print(row)
                csvout.writerow(row)
                row = {}
        if len(elem) == 0:
            text = elem.text
            old = row.get(tag)
            if old is None:
                # first occurrence of the tag
                row[tag] = text
            elif isinstance(old, str):
                # second occurrence of the tag
                row[tag] = [old, text]
            else:
                # already a list
                old.append(text)
For nested XML you can use the iterparse() function to iterate over all elements in the XML. You then need logic to handle each element, depending on which tag is being looked at, to add values to a dictionary object that can be exported as a row.
for _, elem in ET.iterparse('file.xml'):
    if len(elem) == 0:
        print(f'{elem.tag} {elem.attrib} text={elem.text}')
    else:
        print(f'{elem.tag} {elem.attrib}')
To create a row in a CSV file from the element text, you can do something like this. If, for example, "test2" marks the beginning of a new record, then that can be used to write the record to a new row and clear the dictionary for the next record.
If you want to output all or some attributes, you need to add a few lines of code for that (a sketch is shown after the CSV output below). If an attribute has the same name as an element, or multiple elements share the same attribute name (e.g. ID), then you need to address that in your code.
import xml.etree.ElementTree as ET
import re
import csv

with open("out.csv", "w", newline='') as fout:
    header = ['test3','test4','test7','test9','test13','test14','test17','test18','test19','Comments']
    csvout = csv.DictWriter(fout, fieldnames=header)
    csvout.writeheader()
    row = {}
    for _, elem in ET.iterparse('test.xml'):
        # strip the namespace from the element tag name; e.g. {Test.xsd}test14 -> test14
        tag = re.sub("^{.*?}", "", elem.tag)
        if tag == 'test2':
            if len(row) != 0:
                print(row)
                csvout.writerow(row)
                row = {}
        if len(elem) == 0:
            row[tag] = elem.text
Output:
{'test3': 'Something Something', 'test4': 'AA', 'Comments': 'BB', 'test7': '123 street', 'test9': 'test work', 'test14': '746745636', 'test13': 'Some date'}
{'test3': 'None test', 'test4': 'Someone', 'Comments': 'Some comment', 'test7': '5634643643', 'test17': 'Some Info', 'test19': 'Somewhere', 'test18': '63243333', 'test14': '456436436346', 'test13': '54234532452345'}
CSV Output:
test3,test4,test7,test9,test13,test14,test17,test18,test19,Comments
Something Something,AA,123 street,test work,Some date,746745636,,,,BB
None test,Someone,5634643643,,54234532452345,456436436346,Some Info,63243333,Somewhere,Some comment
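As mentioned above, pulling attributes in only takes a few extra lines. Because the ID attributes sit on container elements (test2, test6, test12, ...) whose end events fire only after their children, one hedged approach is to also request start events and read the attributes there. The derived column names (e.g. test2_ID) are made up for illustration and would also need adding to header:
# inside the same `with` block as above, replacing the for loop:
for event, elem in ET.iterparse('test.xml', events=('start', 'end')):
    tag = re.sub("^{.*?}", "", elem.tag)
    if event == 'start':
        # attributes are already available when the start tag is seen
        for name, value in elem.attrib.items():
            row[tag + '_' + name] = value   # e.g. 'test2_ID'
        continue
    # ... the existing 'end' handling goes here unchanged ...
Keep in mind that repeated elements (the two test12 blocks, for instance) would overwrite each other's ID column unless they get the same duplicate handling as the text values.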
Update:
If you want to handle duplicate tags and create a list of values, then try something like this:
if len(elem) == 0:
    text = elem.text
    old = row.get(tag)
    if old is None:
        # first occurrence
        row[tag] = text
    elif isinstance(old, str):
        # second occurrence -> create list
        row[tag] = [old, text]
    else:
        old.append(text)
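One thing to keep in mind with the list variant: csv.DictWriter converts each cell with str(), so a list would land in the CSV as its Python repr. The lists would typically be joined into a single string just before writerow(); a minimal sketch (the "; " separator is an arbitrary choice):
# just before csvout.writerow(row):
flat = {k: "; ".join(v) if isinstance(v, list) else v for k, v in row.items()}
csvout.writerow(flat)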

error with regex matching over 2 source files, expected string or buffer

So, from an input.txt file, I would like to create two dictionaries.
For example, here is a sample of the input.txt file:
#. VAR #first=Billy
#. VAR #last=Bob
#. PRINT VARS
#. VAR #petName=Gato
#. VAR #street="1234 Home Street"
#. VAR #city="New York"
#. VAR #state=NY
#. VAR #zip=21236
#. VAR #title=Dr.
#. PRINT VARS
#. FORMAT LM=5 JUST=LEFT
#. PRINT FORMAT
So the pattern is VAR #varName=value; i.e., in the case of #first=Billy you would get something like varDict = {"first": "Billy"}, right?
Now I want to know how to do that through the entire file.
There are two dictionaries that I need to populate: one for the variables, and one for FORMAT, which just holds values and doesn't actually do anything for now.
As far as the desired output: in the input file there are commands that, when read, either add variables to the variable dictionary, print that dictionary, or add to the format dictionary. I would use the pprint function like this, pprint.pprint(varDict, width=30), and it would output something like this:
{'first': 'Billy',
'last': 'Bob'}
{'city': 'New York',
'first': 'Billy',
'last': 'Bob',
'petName': 'Gato',
'state': 'NY',
'street': '1234 Home Street',
'title': 'Dr.',
'zip': '21236'}
{'BULLET': 'o',
'FLOW': 'YES',
'JUST': 'LEFT',
'LM': '5',
'RM': '80'}
Unfortunately I keep getting errors all over the place in the driver and source file:
AttributeError: 'list' object has no attribute 'groups'
TypeError: expected string or buffer
Driver.py
import sys
import re
import pprint
from importFileIUse import setFormat, setVariable

input = (sys.argv[1])

# Group 1. VAR
# Group 2. #first=Mae or JUST=RIGHT FLOW=NO
# pass Group 2 as atString
regexSearch = re.compile(r'^#. ([A-Z]+) (.*)', re.MULTILINE)
regexPrintVAR = re.compile(r'^#\.\s*PRINT\s(VARS)', re.MULTILINE)
regexPrintFORMAT = re.compile(r'^#\.\s*PRINT\s(FORMAT)', re.MULTILINE)
regexERRCheck = re.compile(r'^#\.\s*FORMAT\s+BAD', re.MULTILINE)

varDictionary = dict()
formatDictionary = {"FLOW":"YES", "LM":"1", "RM":"80","JUST":"LEFT","BULLET":"o"}

file = open(input, "r")
while True:
    inputLine = file.readline()
    matchObj = regexSearch.search(inputLine)
    command, atString = matchObj.groups()
    if command == "VAR":
        setVariable(atString, varDictionary)
    if command == "FORMAT":
        formatListERR = regexERRCheck.search(inputLine)
        if formatListERR != None:
            print("*** Not a recognizable command")
            line = file.readline()
        setFormat(atString, formatDictionary)
    if command == "PRINT":
        printVARObj = regexPrintVAR.search(inputLine)
        printFormatObj = regexPrintFORMAT.search(inputLine)
        if printVARObj != None:
            pprint.pprint(varDictionary, width=30)
        elif printFormatObj != None:
            pprint.pprint(formatDict, width=30)
    inputLine = file.readline()
file.close()
importFileIUse.py
import re

# The atString is the remainder of the string after the VAR or FORMAT key word.
varDictionary = dict()
formatDictionary = {"FLOW":"YES", "LM":"1", "RM":"80","JUST":"LEFT","BULLET":"o"}

def setFormat(atString, formatDictionary):
    regexFormat = re.compile(r'((?:(?:\w+)=(?:\w+)\s*)*)$')
    line = re.split(" +", atString)
    formatList = regexFormat.search(line)
    if formatList:
        for param in formatList[0].split():
            splitParam = param.split('=')
            formatDictionary[splitParam[0]] = splitParam[1]

def setVariable(atString, varDictionary):
    regexVAR = re.compile(r'#(\w+)=(\w+|.*)\s*$', re.MULTILINE)
    # file = open(input)
    # line = file.readline()
    # line = re.split(" +", atString)
    # while line:
    varList = regexVAR.findall(atString)
    for key, value in varList:
        varDictionary[key] = value
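For what it's worth, the "expected string or buffer" error most likely comes from setFormat(): re.split(" +", atString) returns a list, and that list is then passed to regexFormat.search(), which expects a string. A minimal, hedged sketch of the two helpers working directly on atString (names kept from the question, untested against the full driver):
import re

def setFormat(atString, formatDictionary):
    # atString looks like 'LM=5 JUST=LEFT'
    for param in atString.split():
        if '=' in param:
            key, value = param.split('=', 1)
            formatDictionary[key] = value

def setVariable(atString, varDictionary):
    # atString looks like '#first=Billy' or '#street="1234 Home Street"'
    match = re.match(r'#(\w+)=(.*)$', atString)
    if match:
        key, value = match.groups()
        varDictionary[key] = value.strip('"')
The driver would also need to skip lines where regexSearch.search(inputLine) returns None (blank lines or end of file) before calling .groups() on the result.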

Converting an xml doc into a specific dot-expanded json structure

I have the following XML document:
<Item ID="288917">
  <Main>
    <Platform>iTunes</Platform>
    <PlatformID>353736518</PlatformID>
  </Main>
  <Genres>
    <Genre FacebookID="6003161475030">Comedy</Genre>
    <Genre FacebookID="6003172932634">TV-Show</Genre>
  </Genres>
  <Products>
    <Product Country="CA">
      <URL>https://itunes.apple.com/ca/tv-season/id353187108?i=353736518</URL>
      <Offers>
        <Offer Type="HDBUY">
          <Price>3.49</Price>
          <Currency>CAD</Currency>
        </Offer>
        <Offer Type="SDBUY">
          <Price>2.49</Price>
          <Currency>CAD</Currency>
        </Offer>
      </Offers>
    </Product>
    <Product Country="FR">
      <URL>https://itunes.apple.com/fr/tv-season/id353187108?i=353736518</URL>
      <Rating>Tout public</Rating>
      <Offers>
        <Offer Type="HDBUY">
          <Price>2.49</Price>
          <Currency>EUR</Currency>
        </Offer>
        <Offer Type="SDBUY">
          <Price>1.99</Price>
          <Currency>EUR</Currency>
        </Offer>
      </Offers>
    </Product>
  </Products>
</Item>
Currently, to get it into json format I'm doing the following:
from lxml import etree
import xmltodict

parser = etree.XMLParser(recover=True)
node = etree.fromstring(s, parser=parser)
data = xmltodict.parse(etree.tostring(node))
Of course the xmltodict is doing the heavy lifting. However, it gives me a format that is not ideal for what I'm trying to accomplish. Here is what I'd like the end data to look like:
{
    "Item[#ID]": 288917,  # if no preceding element, use the root node tag
    "Main.Platform": "iTunes",
    "Main.PlatformID": "353736518",
    "Genres.Genre": ["Comedy", "TV-Show"],  # list of elements if repeated
    "Genres.Genre[#FacebookID]": ["6003161475030", "6003172932634"],
    "Products.Product[#Country]": ["CA", "FR"],
    "Products.Product.URL": ["https://itunes.apple.com/ca/tv-season/id353187108?i=353736518", "https://itunes.apple.com/fr/tv-season/id353187108?i=353736518"],
    "Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"],
    "Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"],
    "Products.Product.Offers.Offer.Currency": "EUR"
}
This is a bit verbose, but it wasn't too hard to format this as a flat dict. Here is an example:
from collections import OrderedDict

node = etree.fromstring(file_data.encode('utf-8'), parser=parser)
data = OrderedDict()
nodes = [(node, ''),]  # format is (node, prefix)
while nodes:
    for sub, prefix in nodes:
        # remove the prefix tag unless its for the first attribute
        tag_prefix = '.'.join(prefix.split('.')[1:]) if ('.' in prefix) else ''
        atr_prefix = sub.tag if (sub == node) else tag_prefix
        # tag
        if sub.text.strip():
            _prefix = tag_prefix + '.' + sub.tag
            _value = sub.text.strip()
            if data.get(_prefix):  # convert it to a list if multiple values
                if not isinstance(data[_prefix], list): data[_prefix] = [data[_prefix],]
                data[_prefix].append(_value)
            else:
                data[_prefix] = _value
        # atr
        for k, v in sub.attrib.items():
            _prefix = atr_prefix + '[#%s]' % k
            _value = v
            if data.get(_prefix):  # convert it to a list if multiple values
                if not isinstance(data[_prefix], list): data[_prefix] = [data[_prefix],]
                data[_prefix].append(_value)
            else:
                data[_prefix] = _value
        nodes.remove((sub, prefix))
        for s in sub.getchildren():
            _prefix = (prefix + '.' + sub.tag).strip('.')
            nodes.append((s, _prefix))
        if not nodes: break
You can use recursion here. One way is to store the paths progressively as you recurse the XML document, and return a result dictionary at the end, which can be serialized to JSON.
The below demo uses the standard library xml.etree.ElementTree for parsing XML documents.
Demo:
from xml.etree.ElementTree import ElementTree
from pprint import pprint

# Setup XML tree for parsing
tree = ElementTree()
tree.parse("sample.xml")
root = tree.getroot()

def collect_xml_paths(root, path=[], result={}):
    """Collect XML paths into a dictionary"""
    # First collect root items
    if not result:
        root_id, root_value = tuple(root.attrib.items())[0]
        root_key = root.tag + "[#%s]" % root_id
        result[root_key] = root_value
    # Go through each child from root
    for child in root:
        # Extract text
        text = child.text.strip()
        # Update path
        new_path = path[:]
        new_path.append(child.tag)
        # Create dot separated key
        key = ".".join(new_path)
        # Get child attributes
        attributes = child.attrib
        # Ensure we have attributes
        if attributes:
            # Add each attribute to result
            for k, v in attributes.items():
                attrib_key = key + "[#%s]" % k
                result.setdefault(attrib_key, []).append(v)
        # Add text if it exists
        if text:
            result.setdefault(key, []).append(text)
        # Recurse through paths once done iteration
        collect_xml_paths(child, new_path)
    # Separate single values from list values
    return {k: v[0] if len(v) == 1 else v for k, v in result.items()}

pprint(collect_xml_paths(root))
Output:
{'Genres.Genre': ['Comedy', 'TV-Show'],
'Genres.Genre[#FacebookID]': ['6003161475030', '6003172932634'],
'Item[#ID]': '288917',
'Main.Platform': 'iTunes',
'Main.PlatformID': '353736518',
'Products.Product.Offers.Offer.Currency': ['CAD', 'CAD', 'EUR', 'EUR'],
'Products.Product.Offers.Offer.Price': ['3.49', '2.49', '2.49', '1.99'],
'Products.Product.Offers.Offer[#Type]': ['HDBUY', 'SDBUY', 'HDBUY', 'SDBUY'],
'Products.Product.Rating': 'Tout public',
'Products.Product.URL': ['https://itunes.apple.com/ca/tv-season/id353187108?i=353736518',
'https://itunes.apple.com/fr/tv-season/id353187108?i=353736518'],
'Products.Product[#Country]': ['CA', 'FR']}
If you want to serialize this dictionary to JSON, you can use json.dumps():
from json import dumps
print(dumps(collect_xml_paths(root)))
# {"Item[#ID]": "288917", "Main.Platform": "iTunes", "Main.PlatformID": "353736518", "Genres.Genre[#FacebookID]": ["6003161475030", "6003172932634"], "Genres.Genre": ["Comedy", "TV-Show"], "Products.Product[#Country]": ["CA", "FR"], "Products.Product.URL": ["https://itunes.apple.com/ca/tv-season/id353187108?i=353736518", "https://itunes.apple.com/fr/tv-season/id353187108?i=353736518"], "Products.Product.Offers.Offer[#Type]": ["HDBUY", "SDBUY", "HDBUY", "SDBUY"], "Products.Product.Offers.Offer.Price": ["3.49", "2.49", "2.49", "1.99"], "Products.Product.Offers.Offer.Currency": ["CAD", "CAD", "EUR", "EUR"], "Products.Product.Rating": "Tout public"}

Trouble getting right values against each item

I'm trying to parse the item names and their corresponding values from the snippet below. The dt tags hold the names and the dd tags contain the values. A few dt tags do not have corresponding values, so not all of the names have values. What I wish to do is keep the value blank for any name that doesn't have one.
These are the elements I would like to scrape data from:
content="""
<div class="movie_middle">
<dl>
<dt>Genres:</dt>
<dt>Resolution:</dt>
<dd>1920*1080</dd>
<dt>Size:</dt>
<dd>1.60G</dd>
<dt>Quality:</dt>
<dd>1080p</dd>
<dt>Frame Rate:</dt>
<dd>23.976 fps</dd>
<dt>Language:</dt>
</dl>
</div>
"""
I've tried it like below:
soup = BeautifulSoup(content,"lxml")
title = [item.text for item in soup.select(".movie_middle dt")]
result = [item.text for item in soup.select(".movie_middle dd")]
vault = dict(zip(title,result))
print(vault)
It gives me messy results (wrong pairs):
{'Genres:': '1920*1080', 'Resolution:': '1.60G', 'Size:': '1080p', 'Quality:': '23.976 fps'}
My expected result:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p','Frame Rate:':'23.976 fps','Language:':''}
Any help on fixing the issue will be highly appreciated.
You can loop through the elements inside the dl. If the current element is a dt and the next element is a dd, then store the next element's text as the value; otherwise set the value to an empty string.
dl = soup.select('.movie_middle dl')[0]
elems = dl.find_all()  # Returns the list of dt and dd
data = {}
for i, el in enumerate(elems):
    if el.name == 'dt':
        key = el.text.replace(':', '')
        # check if the next element is a `dd`
        if i < len(elems) - 1 and elems[i+1].name == 'dd':
            data[key] = elems[i+1].text
        else:
            data[key] = ''
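Running this against the sample content should print something like the following (note that this version drops the trailing colons from the keys):
print(data)
# {'Genres': '', 'Resolution': '1920*1080', 'Size': '1.60G', 'Quality': '1080p', 'Frame Rate': '23.976 fps', 'Language': ''}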
You can use BeautifulSoup to parse the dl structure, and then write a function to create the dictionary:
from bs4 import BeautifulSoup as soup
import re
def parse_result(d):
    while d:
        a, *_d = d
        if _d:
            if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
                yield [a[4:-5], _d[0][4:-5]]
                d = _d[1:]
            else:
                yield [a[4:-5], '']
                d = _d
        else:
            yield [a[4:-5], '']
            d = []

print(dict(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1])))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
For a slightly longer, although cleaner solution, you can create a decorator to strip the HTML tags of the output, thus removing the need for the extra string slicing in the main parse_result function:
def strip_tags(f):
    def wrapper(data):
        return {a[4:-5]: b[4:-5] for a, b in f(data)}
    return wrapper

@strip_tags
def parse_result(d):
    while d:
        a, *_d = d
        if _d:
            if re.findall('\<dt', a) and re.findall('\<dd', _d[0]):
                yield [a, _d[0]]
                d = _d[1:]
            else:
                yield [a, '']
                d = _d
        else:
            yield [a, '']
            d = []

print(parse_result(list(filter(None, str(soup(content, 'html.parser').find('dl')).split('\n')))[1:-1]))
Output:
{'Genres:': '', 'Resolution:': '1920*1080', 'Size:': '1.60G', 'Quality:': '1080p', 'Frame Rate:': '23.976 fps', 'Language:': ''}
from collections import defaultdict

test = soup.text.split('\n')
d = defaultdict(list)
for i in range(len(test)):
    if (':' in test[i]) and (':' not in test[i+1]):
        d[test[i]] = test[i+1]
    elif ':' in test[i]:
        d[test[i]] = ''
d
defaultdict(list,
{'Frame Rate:': '23.976 fps',
'Genres:': '',
'Language:': '',
'Quality:': '1080p',
'Resolution:': '1920*1080',
'Size:': '1.60G'})
The logic here is that you know every key will have a colon. Knowing this, you can write an if/else statement to capture the unique combinations, whether that is a key followed by a key or a key followed by a value.
Edit:
In case you wanted to clean your keys, below replaces the : in each one:
d1 = { x.replace(':', ''): d[x] for x in d.keys() }
d1
{'Frame Rate': '23.976 fps',
'Genres': '',
'Language': '',
'Quality': '1080p',
'Resolution': '1920*1080',
'Size': '1.60G'}
The problem is that empty elements are not present. Since there is no hierarchy between the <dt> and the <dd>, I'm afraid you'll have to craft the dictionary yourself.
vault = {}
category = ""
for item in soup.find("dl").findChildren():
    if item.name == "dt":
        if category == "":
            category = item.text
        else:
            vault[category] = ""
            category = item.text
    elif item.name == "dd":
        vault[category] = item.text
        category = ""
Basically this code iterates over the child elements of the <dl> and fills the vault dictionary with the values.
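One small addition: if the last child is a <dt> with no <dd> after it (as with Language: in the sample), category is still pending when the loop finishes, so a final check after the loop is needed to record it as empty:
if category != "":
    vault[category] = ""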

Python regex sub confusion

There are four keywords: title, blog, tags, state
Excess keyword occurrences are being removed from their respective matches.
Example: the line 'blog: blog state title tags and' returns 'state title tags and' instead of
'blog state title tags and'.
The sub function should be matching .+ after it sees blog:, so I don't know why it treats blog as an exception to .+
Regex:
re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
Code:
def n15():
    import re
    a = """blog: blog: fooblog
state: private
title: this is atitle bun
and text"""
    kwargs = {}

    def matcher(string):
        v = string.group(1).replace(string.group(2), '').replace(string.group(3), '').replace(string.group(4), '').replace(string.group(5), '')
        if string.group(3) == 'title':
            kwargs['title'] = v
        elif string.group(3) == 'blog':
            kwargs['blog_url'] = v
        elif string.group(3) == 'tags':
            kwargs['comma_separated_tags'] = v
        elif string.group(3) == 'state':
            kwargs['post_state'] = v
        return ''

    a = re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
    a = a.replace('\n', '<br />')
    a = a.replace('\r', '')
    a = a.replace('"', r'\"')
    a = '<p>' + a + '</p>'
    kwargs['body'] = a
    print kwargs
Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'foo', 'title': 'this is a bun'}
Edit:
Desired Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'fooblog', 'title': 'this is atitle bun'}
replace(string.group(3), '')
is replacing all occurrences of 'blog' with ''.
Rather than try to replace all the other parts of the matched string, which will be hard to get right, I suggest capturing the string you actually want in the original match.
r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s)(.+)(\n|$))'
which has () around the .+ to capture that part of the string, then
v = match.group(5)
at the start of matcher.
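Putting that together, a hedged sketch of how matcher might use the extra capture group (the group numbers after the keyword shift by one because of the added parentheses):
def matcher(string):
    key = string.group(3)   # title / blog / tags / state
    v = string.group(5)     # the captured value text
    if key == 'title':
        kwargs['title'] = v
    elif key == 'blog':
        kwargs['blog_url'] = v
    elif key == 'tags':
        kwargs['comma_separated_tags'] = v
    elif key == 'state':
        kwargs['post_state'] = v
    return ''

a = re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s)(.+)(\n|$))', matcher, a)
For the doubled 'blog: blog: fooblog' line in the sample, group(5) would capture 'blog: fooblog', so that particular input may still need its own handling.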
