I want to use following codes to replace strings like "/xxxxx/" with "/xxxxx.html" in the page_data, but doesn't work. page_data is bytes type which is downloaded by a crawler.
page_data.replace(each, neweach)
Only when I change them to:
page_data = page_data.replace(each, neweach)
the strings (each) in page_data are actually replaced.
The whole code is below:
import os
import sys
import re
import urllib
import urllib2
class WebGet(object):
base_url = ""
urls_list = []
history_list = []
replace_ch={}
def __init__(self, base_url):
self.base_url = base_url[:-1]
self.urls_list.append('/')
self.replace_ch[">>"] = "%3E%3E"
self.replace_ch["<<"] = "%3C%3C"
self.replace_ch["::"] = "%3A%3A"
def recurseGet(self):
'''Get page data recursively'''
while(len(self.urls_list) != 0):
url_suffix = self.urls_list[0]
self.urls_list.remove(url_suffix)
self.history_list.append(url_suffix)
url_to_get = self.base_url + url_suffix
"Get page data with url"
print "To get",url_to_get
page_data = urllib2.urlopen(url_to_get).read()
page_data_done = self.pageHandle(page_data)
"Write the page data into file"
if url_suffix[-1] == '/':
url_suffix = url_suffix[:-1]
if url_suffix == '':
url_suffix = "index"
elif url_suffix[0] == '/':
url_suffix = url_suffix[1:]
url_suffix.replace('/','\\')
url_suffix.replace('>>','%3E%3E')
url_suffix.replace('<<','%3C%3C')
url_suffix.replace('::','%3A%3A')
file_str = "e:\\reference\\"+url_suffix
if file_str.rfind("\\") != 12:
new_dir = file_str[:file_str.rfind("\\")]
if os.path.isdir(file_str) == False:
os.mkdir(file_str)
file_str = file_str.strip()+".html"
print "write file",file_str
f_page = open(file_str, "wb")
f_page.write(page_data_done)
f_page.close
def pageHandle(self, page_data):
page_data.replace("http://www.cplusplus.com/","/") #here the replace works
re_rule = '<a href="/reference(/\S{2,40}/)\">'
list_page_urls = re.findall(re_rule, page_data)
for each in list_page_urls:
neweach = each
neweach = neweach[:-1]+".html"
#page_data = page_data.replace(each, neweach)
page_data.replace(each, neweach)
if each in page_data:
print "fail replace"
if each in self.history_list:
continue
elif each in self.urls_list:
continue
elif each == '/':
continue
self.urls_list.append(each)
return page_data
def main():
    """Crawl the cplusplus.com reference section."""
    start_url = "http://www.cplusplus.com/reference/"
    crawler = WebGet(start_url)
    crawler.recurseGet()

if __name__ == "__main__":
    main()
Why could this be?
Because that's what the replace method does: returns a copy of the string with the relevant characters replaced.
Apart from anything else, strings are immutable in Python, so it couldn't work any other way.
Related
I am doing a thing while following a tutorial. I think I did everything correct but when starting the program I am getting an error.
Here are my files codes:
1) the main file - frs.py
from parser import Parser
from lexer import Lexer
def main():
    """Tokenize hello.frs, build its AST, and print both."""
    filename = 'hello.frs'
    file = open(filename, 'r')
    lexer = Lexer(file)
    # Parser shares the lexer's token list, so tokens added by
    # tokenizer() below are visible to it.
    parser = Parser(lexer.tokens)
    # tokenizer() iterates the open file, so it must run before close().
    lexer.tokenizer()
    # BUG FIX: the file handle was never closed.
    file.close()
    print ("TOKENS:")
    print (lexer.tokens, "\n")
    parser.build_AST()
    print ("AST:")
    # BUG FIX: 'parset' was a typo for 'parser' (NameError at runtime).
    print (parser.AST, "\n")

if __name__ == "__main__":
    main()
2) the Lexer class - lexer.py
class Lexer:
    """Splits the source lines into label / keyword / char tokens."""

    def __init__(self, data):
        self.data = data          # iterable of source lines
        self.tokens = []          # list of {'id': ..., 'value': ...} dicts
        self.keywords = [
            'tosay'
        ]

    def tokenizer(self):
        """Walk every line character by character, emitting tokens."""
        for line in self.data:
            buffer = []
            mode = ''
            for ch in line:
                if ch == '"' and mode == '':
                    # Opening quote: start collecting a string literal.
                    mode = 'char'
                    buffer = []
                elif ch == '"' and mode == 'char':
                    # Closing quote: emit the collected literal.
                    self.tokens.append({'id': mode, 'value': ''.join(buffer)})
                    mode = ''
                    buffer = []
                elif ch == ':':
                    # A colon ends a label.
                    self.tokens.append({'id': 'label', 'value': ''.join(buffer)})
                    buffer = []
                elif ''.join(buffer) in self.keywords:
                    # The buffered text spells a keyword.
                    self.tokens.append({'id': 'keyword', 'value': ''.join(buffer)})
                    buffer = []
                elif ch == ' ' and mode != 'char':
                    # Whitespace outside string literals is skipped.
                    continue
                else:
                    buffer.append(ch)
3) the Parser class - parser.py
class Parser:
    """Builds a nested AST (a list of {label: [nodes]} dicts) from tokens."""

    def __init__(self, tokens):
        self.tokens = tokens
        self.AST = []

    def add_node(self, parent, node):
        """Append node to the child list of the AST entry named parent."""
        for a in self.AST:
            if parent in a:
                a[parent].append(node)

    def build_AST(self):
        """Consume the token stream, pairing keywords/chars into AST nodes."""
        saved = {}
        parent = {}
        collect = False
        for token in self.tokens:
            if token['id'] == 'label':
                t = {token['value']: []}
                if parent != t:
                    parent = token['value']
                self.AST.append(t)
            elif token['id'] == 'keyword':
                if token['value'] == 'stop':
                    # 'stop' stands alone, no argument follows.
                    t = {token['value']: 0}
                    self.add_node(parent, t)
                else:
                    if collect == False:
                        # Remember the keyword; its argument comes next.
                        saved = token
                        collect = True
                    else:
                        # BUG FIX: 'token[:value]' sliced with an undefined
                        # name; the intended lookup is token['value'].
                        t = {saved['value']: token['value']}
                        self.add_node(parent, t)
                        collect = False
            elif token['id'] == 'char':
                # BUG FIX: 'if collect = False:' was an assignment inside
                # the condition (SyntaxError); '==' is the comparison meant.
                if collect == False:
                    saved = token
                    collect = True
                else:
                    t = {saved['value']: token['value']}
                    self.add_node(parent, t)
                    collect = False
4) the file with my own language and is a goal of the tutorial - hello.frs:
commence:
tosay "Hello World"
stop
Basically, until I added the from parser import Parser, everything worked. But after adding, I am getting this error message:
Traceback (most recent call last):
File "frs.py", line 1, in <module>
from parser import Parser
ImportError: cannot import name 'Parser'
I tried renaming the class, but it still doesn't work.
Please help me!
Thank you in advance.
Two errors in your files.
1) File parser.py:
Change:
if collect = False:
To
if collect == False:
2) File frs.py
Change:
print (parset.AST, "\n")
To:
print (parser.AST, "\n")
After Above Corrections My Output
TOKENS:
[{'id': 'label', 'value': 'commence'}, {'id': 'keyword', 'value': 'tosay'}, {'id': 'char', 'value': 'Hello World'}]
AST:
[{'commence': [{'tosay': 'Hello World'}]}]
I have a list of ts files inside a folder. I try to extract the content id from the XML which is the filename without extension. I need to search for a ts file that matches the content id. For some reason, it's failing. I am attaching the code below. I am also attaching the screenshot for the ts files.
import glob
import lxml.etree as et
import os, csv
ASSET_METADATA_PATH = '/Users/roradhak/eVision/failed_assets/'
TS_PATH = '/Users/roradhak/eVision/ts_check/'
def parse_file(path):
    """Parse one asset-metadata XML file.

    Returns (content_id, content_name, has_trailer, trailer_id); any
    element may be None/"" when the corresponding node is missing.
    """
    tree = et.parse(path)
    root = tree.getroot()
    trailer_id = ""
    # BUG FIX: XPath attribute predicates use '@title', not '#title'.
    programs = root.xpath('Program[@title="Program"]')
    if len(programs) == 0:
        # BUG FIX: callers unpack four values, so the early return must
        # also yield a 4-tuple (it previously returned only three).
        return None, None, None, None
    program = programs[0]  # TODO - Are multiple programs expected? If so, the function should return a list of tuples
    # Get the Content ID
    c_id = program.xpath('props/*[@title="Content ID"]')
    if len(c_id) == 0:
        content_id = None
    else:
        content_id = c_id[0].text
    # Get the has_trailer attribute
    # BUG FIX: guard against a missing Has_Trailer node before indexing.
    has_t = program.xpath('props/*[@title="Has_Trailer"]')
    has_trailer = has_t[0].text if has_t else None
    if has_trailer == "Y":
        # Trailer ids mirror the content id with the leading 'M' -> 'T'.
        trailer_id = content_id.replace('M', 'T', 1)
    # Get the content name
    n = program.xpath('props/*[@title="Name"]')
    if len(n) == 0:
        content_name = None
    else:
        content_name = n[0].text
    return content_id, content_name, has_trailer, trailer_id
def main():
asset_metadata = glob.glob(os.path.join(ASSET_METADATA_PATH, u'*.xml'))
movies = glob.glob(os.path.join(TS_PATH, u'*.ts'))
for p in asset_metadata:
print(u'Processing: {p}'.format(p=p).encode('utf-8'))
print content_id, content_name, has_trailer, trailer_id
content_id, content_name, has_trailer, trailer_id= parse_file(p)
if u'{c}.ts'.format(c=content_id) not in TS_PATH:
print "No Movie"
if has_trailer =="Y":
if u'{c}.ts'.format(c=trailer_id) not in movies:
print "No trailer"
if __name__ == '__main__':
main()
Output as below
/Users/roradhak/IVPGET_Local/venv/bin/python /Users/roradhak/Downloads/validate_xml.py
Processing: /Users/roradhak/eVision/failed_assets/E30000001557115265_2019_08_29T11_20_08Z.xml
MD009232 Ep 143 - Cool look Hair style N
No Movie
Processing: /Users/roradhak/eVision/failed_assets/10000000717960000_2019_10_09T15_04_20Z.xml
MZ008931 Aan: Men At Work Y TZ008931
No Movie
No trailer
Processing: /Users/roradhak/eVision/failed_assets/E30000001557537308_2019_08_09T19_15_22Z.xml
MZ010564 EP29 - Episode 29 - Raheem S1 Y TZ010564
No Movie
No trailer
Process finished with exit code 0
Here is how I would do it with pathlib and Python 3.4+:
from pathlib import Path
failed_assets_folder = Path('/Users/roradhak/eVision/failed_assets')
ts_folder = Path('/Users/roradhak/eVision/ts_check')
def main():
    """Report which failed assets lack a movie and/or trailer .ts file."""
    for failed_asset in failed_assets_folder.glob('*.xml'):
        print(f'Processing: {failed_asset.name}')
        # BUG FIX: parse_file needs the full path, not just the file name;
        # passing .name only works when the CWD happens to be the folder.
        content_id, content_name, has_trailer, trailer_id = parse_file(failed_asset)
        print(f'{content_id}, {content_name}, {has_trailer}, {trailer_id}')
        # ts_folder / name already yields a Path; no extra Path() needed.
        if not (ts_folder / f'{content_id}.ts').exists():
            print('No Movie')
        if has_trailer == 'Y':
            if not (ts_folder / f'{trailer_id}.ts').exists():
                print('No trailer')
It just implements the file search portion and it is not tested though.
Does anyone know how to read the event log files in C:\Windows\System32\winevt\Logs with the .evtx extension?
I have already tried to open it using notepad and read using python but notepad says access is denied...
Do anyone know how to do it? Thanks in advance..
This is how you would read the file "Forwarded Events" from the event viewer. You need admin access so I would run it as admin but I it will prompt you for a password if you don't.
import win32evtlog
import xml.etree.ElementTree as ET
import ctypes
import sys
def is_admin():
    """Best-effort check for Windows admin rights; False when undeterminable."""
    try:
        result = ctypes.windll.shell32.IsUserAnAdmin()
    except:
        # Non-Windows platforms (no ctypes.windll) land here as well.
        result = False
    return result
# Only an elevated process may open the exported log; otherwise relaunch
# this script through UAC.
if is_admin():
    # open event file
    query_handle = win32evtlog.EvtQuery(
        'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx',
        win32evtlog.EvtQueryFilePath)
    read_count = 0
    a = 1
    # NOTE(review): this loop body runs only once — 'a' becomes 2 after the
    # first pass, so at most one EvtNext batch is ever read. Presumably the
    # intent was an unbounded loop relying on the break below; confirm.
    while a == 1:
        a += 1
        # read 1 record(s)
        events = win32evtlog.EvtNext(query_handle, 1)
        read_count += len(events)
        # if there is no record break the loop
        if len(events) == 0:
            break
        for event in events:
            # Render the raw event handle to an XML string, then parse it.
            xml_content = win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml)
            # parse xml content
            xml = ET.fromstring(xml_content)
            # xml namespace, root element has a xmlns definition, so we have to use the namespace
            ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
            # NOTE(review): positional lookup xml[1][9] assumes a fixed event
            # layout; it will raise IndexError on events with fewer children.
            substatus = xml[1][9].text
            event_id = xml.find(f'.//{ns}EventID').text
            computer = xml.find(f'.//{ns}Computer').text
            channel = xml.find(f'.//{ns}Channel').text
            execution = xml.find(f'.//{ns}Execution')
            process_id = execution.get('ProcessID')
            thread_id = execution.get('ThreadID')
            time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
            #data_name = xml.findall('.//EventData')
            #substatus = data_name.get('Data')
            #print(substatus)
            event_data = f'Time: {time_created}, Computer: {computer}, Substatus: {substatus}, Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}'
            print(event_data)
            user_data = xml.find(f'.//{ns}UserData')
            # user_data has possible any data
else:
    # Not elevated: relaunch this same script with a UAC "runas" prompt.
    ctypes.windll.shell32.ShellExecuteW(None, "runas", sys.executable, " ".join(sys.argv), None, 1)
    input()
.evtx is the extension for Windows Eventlog files. It contains data in a special binary format designed by Microsoft so you cannot simply open it in a text editor.
There are open-source tools to read .evtx files, and the NXLog EE can also read .evtx files. (Disclaimer: I'm affiliated with the latter).
I modified the accepted answer a bit as following, so it becomes reusable:
import xml.etree.ElementTree as Et
import win32evtlog
from collections import namedtuple
class EventLogParser:
    """Reads every record from an exported .evtx file via win32evtlog."""

    def __init__(self, exported_log_file):
        self.exported_log_file = exported_log_file

    def get_all_events(self):
        """Return all records as WindowsEvent tuples, newest first."""
        collected = []
        handle = win32evtlog.EvtQuery(str(self.exported_log_file),
                                      win32evtlog.EvtQueryFilePath | win32evtlog.EvtQueryReverseDirection)
        while True:
            batch = win32evtlog.EvtNext(handle, 1)
            if not batch:
                # No records left.
                break
            collected.extend(self.parse_raw_event(item) for item in batch)
        return collected

    def parse_raw_event(self, raw_event):
        """Render one raw event to XML and extract the core System fields."""
        xml_text = win32evtlog.EvtRender(raw_event, win32evtlog.EvtRenderEventXml)
        root = Et.fromstring(xml_text)
        # Recover the document's namespace from the root tag itself.
        ns = "{" + root.tag.split('}')[0].strip('{') + "}"
        system = root.find(f'{ns}System')
        WindowsEvent = namedtuple('WindowsEvent',
                                  'event_id, level, time_created, computer')
        return WindowsEvent(system.find(f'{ns}EventID').text,
                            system.find(f'{ns}Level').text,
                            system.find(f'{ns}TimeCreated').get('SystemTime'),
                            system.find(f'{ns}Computer').text)
I use the "python-evtx" library, you can install it using this command:
pip install python-evtx
In my case, I'm not interested in reading records with the "Information" level.
import os
import codecs
from lxml import etree
import Evtx.Evtx as evtx
def evtxFile(absolutePath, filenameWithExt, ext, _fromDate, _toDate):
    """Extract non-Information records from one .evtx file into a filtered text file.

    Records with Level 0 or 4 ("Information") are skipped; the rest are
    written pipe-separated to a temp file, then de-duplicated, sorted and
    appended to .\\filtered_data_files\\windows_<channel>_event_viewer_logs<ext>.

    NOTE(review): tempFilePath, parseXMLtoString, createFolder and datetime
    are assumed to be defined/imported elsewhere in this module — confirm.
    """
    print("Reading: " + filenameWithExt)
    outText = ""
    channel = ""
    #read the windows event viewer log and convert its contents to XML
    with codecs.open(tempFilePath, "a+", "utf-8", "ignore") as tempFile:
        with evtx.Evtx(absolutePath) as log:
            for record in log.records():
                xmlLine = record.xml()
                # Strip the default namespace so plain XPath expressions work.
                xmlLine = xmlLine.replace(" xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"", "")
                xmlParse = etree.XML(xmlLine)
                level = parseXMLtoString(xmlParse, ".//Level/text()")
                # Skip "Information" records (levels 0 and 4).
                if not level == "0" and not level == "4":
                    # NOTE(review): the '#' in these XPaths looks like a
                    # mangled '@' (attribute axis) — confirm against source.
                    providerName = parseXMLtoString(xmlParse, ".//Provider/#Name")
                    qualifiers = parseXMLtoString(xmlParse, ".//EventID/#Qualifiers")
                    timestamp = parseXMLtoString(xmlParse, ".//TimeCreated/#SystemTime")
                    eventID = parseXMLtoString(xmlParse, ".//EventID/text()")
                    task = parseXMLtoString(xmlParse, ".//Task/text()")
                    keywords = parseXMLtoString(xmlParse, ".//Keywords/text()")
                    eventRecordID = parseXMLtoString(xmlParse, ".//EventRecordID/text()")
                    channel = parseXMLtoString(xmlParse, ".//Channel/text()")
                    computer = parseXMLtoString(xmlParse, ".//Computer/text()")
                    message = parseXMLtoString(xmlParse, ".//Data/text()")
                    # Translate the numeric severity to a readable label.
                    if level == "1":
                        level = "Critical"
                    elif level == "2":
                        level = "Error"
                    elif level == "3":
                        level = "Warning"
                    # SystemTime looks like YYYY-MM-DDThh:mm:ss.ffffff...
                    date = timestamp[0:10]
                    time = timestamp[11:19]
                    time = time.replace(".", "")
                    _date = datetime.strptime(date, "%Y-%m-%d").date()
                    if _fromDate <= _date <= _toDate:
                        # Flatten the message to a single line.
                        message = message.replace("<string>", "")
                        message = message.replace("</string>", "")
                        message = message.replace("\r\n", " ")
                        message = message.replace("\n\r", " ")
                        message = message.replace("\n", " ")
                        message = message.replace("\r", " ")
                        outText = date + " " + time + "|" + level + "|" + message.strip() + "|" + task + "|" + computer + "|" + providerName + "|" + qualifiers + "|" + eventID + "|" + eventRecordID + "|" + keywords + "\n"
                        tempFile.writelines(outText)
    with codecs.open(tempFilePath, "r", "utf-8", "ignore") as tempFile2:
        myLinesFromDateRange = tempFile2.readlines()
    #delete the temporary file that was created
    os.remove(tempFilePath)
    if len(myLinesFromDateRange) > 0:
        createFolder("\\filtered_data_files\\")
        outFilename = "windows_" + channel.lower() + "_event_viewer_logs" + ext
        myLinesFromDateRange.sort()
        #remove duplicate records from the list
        myFinalLinesFromDateRange = list(set(myLinesFromDateRange))
        myFinalLinesFromDateRange.sort()
        with codecs.open(os.getcwd() + "\\filtered_data_files\\" + outFilename, "a+", "utf-8", "ignore") as linesFromDateRange:
            linesFromDateRange.seek(0)
            # Separate from any previously appended content.
            if len(linesFromDateRange.read(100)) > 0:
                linesFromDateRange.writelines("\n")
            linesFromDateRange.writelines(myFinalLinesFromDateRange)
        del myLinesFromDateRange[:]
        del myFinalLinesFromDateRange[:]
    else:
        print("No data was found within the specified date range.")
    print("Closing: " + filenameWithExt)
I hope it helps you or someone else in the future.
EDIT:
The "tempFilePath" can be anything you want, for example:
tempFilePath = os.getcwd() + "\\tempFile.txt"
I collected some information first before calling the "evtxFile" function:
The "From" and the "To" dates are in the following format: YYYY-MM-DD
Converted the dates to "date" data type:
_fromDate = datetime.strptime(fromDate, "%Y-%m-%d").date()
_toDate = datetime.strptime(toDate, "%Y-%m-%d").date()
Divided the directory where the .evtx files are located into different parts:
def splitDirectory(root, file):
    """Decompose root+file into (abs path, directory, name.ext, name, ext)."""
    full_path = os.path.join(root, file)
    directory, base_name = os.path.split(full_path)
    stem, extension = os.path.splitext(base_name)
    return full_path, directory, base_name, stem, extension
# Walk the target directory tree and feed every .evtx file to evtxFile().
# NOTE(review): directoryPath, _fromDate and _toDate are assumed to be
# defined earlier in the calling script — confirm.
for root, subFolders, files in os.walk(directoryPath):
    for f in files:
        absolutePathOfFile, filePathWithoutFilename, filenameWithExt, filenameWithoutExt, extension = splitDirectory(root, f)
        if extension == ".evtx":
            evtxFile(absolutePathOfFile, filenameWithExt, ".txt", _fromDate, _toDate)
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code :
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
    """Download the page at the module-level `uri` and return its body.

    Returns False (not None) on any error so callers can test `if not data`.
    """
    try:
        req = urllib2.Request(uri)
        # Generous timeout: the stats pages can be slow to respond.
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False
def extract(data,rk):
    """Scrape one page of ESPN's stats table into data.csv.

    data: raw HTML of the page; rk: rank number of the first row.
    Cells are located positionally with str.find — each search starts at
    the end of the previous match — so the variable names (gp, mpg, pts,
    mf, fg, ...) merely track COLUMN ORDER; they are abbreviations chosen
    by the author and do not have to match the on-page column headers.
    """
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        # 'nba/player/' anchors the next player row; none left means done.
        if data.find('nba/player/',start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/',start)
            # Player name is the text of the anchor tag.
            name_start = data.find('>',main) + 1
            name_end = data.find('<',name_start)
            store.append(data[name_start:name_end])
            # Team cell.
            team_start = data.find('">',name_end) + 2
            team_end = data.find('<',team_start)
            store.append(data[team_start:team_end])
            # Games played.
            gp_start = data.find(' >',team_end) + 2
            gp_end = data.find('<',gp_start)
            store.append(data[gp_start:gp_end])
            # Minutes per game.
            mpg_start = data.find(' >',gp_end) + 2
            mpg_end = data.find('<',mpg_start)
            store.append(data[mpg_start:mpg_end])
            # Points.
            pts_start = data.find('">',mpg_end) + 2
            pts_end = data.find('<',pts_start)
            store.append(data[pts_start:pts_end])
            # FGM-FGA ("made field goals").
            mf_start = data.find(' >',pts_end) + 2
            mf_end = data.find('<',mf_start)
            store.append(data[mf_start:mf_end])
            # FG%.
            fg_start = data.find(' >',mf_end) + 2
            fg_end = data.find('<',fg_start)
            store.append(data[fg_start:fg_end])
            # 3PM-3PA.
            m3_start = data.find(' >',fg_end) + 2
            m3_end = data.find('<',m3_start)
            store.append(data[m3_start:m3_end])
            # 3P%.
            p3_start = data.find(' >',m3_end) + 2
            p3_end = data.find('<',p3_start)
            store.append(data[p3_start:p3_end])
            # FTM-FTA.
            ft_start = data.find(' >',p3_end) + 2
            ft_end = data.find('<',ft_start)
            store.append(data[ft_start:ft_end])
            # FT%.
            ftp_start = data.find(' >',ft_end) + 2
            ftp_end = data.find('<',ftp_start)
            store.append(data[ftp_start:ftp_end])
            # Resume the row search after this player's name anchor.
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
    # NOTE(review): the with-block already closed the handle; this extra
    # close() on the last handle is harmless but redundant.
    fcsv.close()
def main():
    """Page through the ranking 40 rows at a time, up to rank 300."""
    print "\n[+] Initializing..."
    # First run only: create data.csv with its header row.
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])
        # NOTE(review): redundant — the with-block already closed fcsv.
        fcsv.close()
    rk = 1
    # uri is module-level and rewritten below to address the next page.
    global uri
    while True:
        time.sleep(1)
        start = 0  # NOTE(review): unused in this function
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data,rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        # Each page holds 40 rows.
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        # The next page is addressed by the rank offset in the URL.
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines. Like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%
So the script doesn't need to be changed besides things like pts or mpg in pts_start = data.find('">',mpg_end) + 2
I don't understand why I can't just input the name of the headline in the table has shown for certain ones. Like instead of FTM-FTA, the script puts ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example gives you the idea but is not a complete solution to your problem. However, you can easily extend it.
from bs4 import BeautifulSoup
import urllib2
def get_html_page_dom(url):
    """Fetch url and return its parsed BeautifulSoup DOM (html5lib parser)."""
    html_doc = urllib2.urlopen(url).read()
    return BeautifulSoup(html_doc, 'html5lib')
def extract_rows(dom):
    """Yield one dict per data row of the stats table, skipping header rows."""
    for row in dom.select('.mod-content tbody tr'):
        # Header rows carry the 'colhead' CSS class.
        css_classes = row.get('class')
        if css_classes is not None and 'colhead' in css_classes:
            continue
        cells = row.select('td')
        yield {'RK': cells[0].string,
               'PLAYER': cells[1].select('a')[0].string,
               'TEAM': cells[2].string,
               'GP': cells[3].string
               # you can fetch rest of the indexs for corresponding headers
               }
if __name__ == '__main__':
    # Demo: fetch the first stats page and print each parsed row.
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run and see the result ;).
I'd like to make an QAbstractItemModel that gets its data from a series of Xml files, all situated in the same directory. Since PyQt5 no longer supports QDomDocument (or atleast i couldn't find a way to make it work), i've had to resort to a QXmlStreamReader. I'm putting the data itself in a giant python dictionary (well... not exactly giant by computer science standards) that contains other dictionaries under various keys to create a tree-like structure.
this is my code so far:
class DataModel(QtCore.QAbstractItemModel):
    """Model that parses every *.xml file in settingsDirectory into self.data.

    NOTE(review): this is the question's original code — the flat (non-nested)
    self.data result and the tokenText bookkeeping below are the issues being
    asked about, so they are annotated here rather than fixed.
    """
    def __init__(self, settingsDirectory, parent = None):
        super(DataModel, self).__init__(parent)
        settingsDirectory.setNameFilters(["*.xml"])
        files = settingsDirectory.entryList()
        print(files)
        self.data = {}
        for i in range(len(files)):
            filePath = str(files[i])
            file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
            fileOpens = file.open(file.ReadOnly | file.Text)
            if fileOpens:
                parser = QtCore.QXmlStreamReader(file)
                print("--------Beginning parsing----------")
                print("Reading file: "+str(filePath))
                while not parser.atEnd():
                    parser.readNext()
                    token = parser.tokenType()
                    print("Reading tag: " + str(parser.name()))
                    print("Tag type is: " + str(token))
                    if token == parser.StartDocument:
                        self.data["XML Version"] = str(parser.documentVersion())
                        self.data["XML Encoding"] = str(parser.documentEncoding())
                    if token == parser.StartElement:
                        tokenName = parser.name()
                        # NOTE(review): tokenType() has not advanced here, so
                        # this Characters check never fires at this point.
                        if parser.tokenType() == parser.Characters:
                            tokenText = parser.text()
                            print("This tag has a text value: " + str(tokenText))
                    print("current data: " + str(self.data))
                    if token == parser.EndElement:
                        # NOTE(review): entries are written flat into self.data
                        # keyed by the closing tag — no nesting is built, which
                        # is the behavior the question reports as issue 2.
                        if tokenText != None:
                            self.data[tokenName] = tokenText
                        else:
                            self.data[tokenName] = {}
                        tokenName = None
                        tokenText = None
            else:
                print(self.tr("xml file did not open properly"))
        print(self.data)
While this code doesn't crash or anything, it does have a few issues that i have no idea why they're happening or how to fix:
1.the tokenName never changes from None for some reason - solved
2.the structure of the self.data dictionary does not turn into a tree-like one, no idea why :|
example data:
<?xml version="1.0" encoding="UTF-8"?>
<tag>
<description>This is a text</description>
<types>
<typesAllowed></typesAllowed>
<typesEnabled></typesEnabled>
</types>
</tag>
yields the final result:
{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'typesAllowed': '\n\t\t', None: '\n', 'typesEnabled': '\n\t\t', 'description': 'This is a text'}
instead of the wanted:
{'XML Encoding': 'UTF-8', 'XML Version': '1.0', 'tag': {'description': 'this is a text', typesAllowed': '\n\t\t', 'typesEnabled': '\n\t\t'}}
I know these issues are most likely a result of my poor understanding of how a StreamReader works, so any and all tips would be welcome :)
edit 1:
the tokenName change was a silly positioning error, silly me. the code reflects the fix.
edit 2:
added an example and example output
This question is now solved; I took a different approach to the problem.
I basically took a list into which i appended tuples (name, {}) if the StartElement token had the attribute parseAs == "element" and put an evaluated string (parseText function) into the last tuple's dictionary. When it meets an EndElement token, it finds the tuple with name == tokenName, which is the name of the current token, puts it into the previous tuple's dictionary as an entry with key name.
There's a few more details as to how it works, but I'd probably just overly complicate my explanation if I included them (how it knows when to submit currData to self.data etc.)
class DataModel(QtCore.QAbstractItemModel):
    """Parses every *.xml file in settingsDirectory into a nested self.data dict.

    Each XML element must carry a parseAs attribute: "element" opens a nested
    dict, "text" stores a (type-converted) leaf value via parseText().
    """
    def __init__(self, settingsDirectory, parent = None):
        super(DataModel, self).__init__(parent)
        settingsDirectory.setNameFilters(["*.xml"])
        files = settingsDirectory.entryList()
        print(files)
        self.data = {}
        # Per-file XML version/encoding info, keyed by "File: <name>".
        self.parsingLog = {}
        for i in range(len(files)):
            filePath = str(files[i])
            file = QtCore.QFile(settingsDirectory.absolutePath() + "/" + str(filePath))
            fileOpens = file.open(file.ReadOnly | file.Text)
            if fileOpens:
                parser = QtCore.QXmlStreamReader(file)
                # Stack of (name, dict) tuples for currently-open elements.
                currData = []
                haveStartToken = False
                print(self.tr("--------Beginning parsing--------"))
                print(self.tr("Reading file: "+str(filePath)))
                print(self.tr("---------------------------------"))
                while not parser.atEnd():
                    if not parser.hasError():
                        parser.readNext()
                        token = parser.tokenType()
                        print(self.tr("--------------------"))
                        print(self.tr("Token type: " + str(self.printTokenType(token))))
                        if token == parser.StartElement:
                            tokenName = parser.name()
                            attributes = parser.attributes()
                            parseAs = attributes.value("parseAs")
                            print(self.tr("Reading StartElement: " + str(tokenName)))
                            print(self.tr("parseAs: " + str(parseAs)))
                            if parseAs == "text":
                                textValue = self.parseText(parser.readElementText())
                                print(self.tr("Text Value: " + str(textValue)))
                                if len(currData) != 0:
                                    # Leaf goes into the innermost open element.
                                    currData[len(currData)-1][1][tokenName] = textValue
                                else:
                                    # A text leaf with no enclosing element is fatal.
                                    print(self.tr("*******Terminating application*******"))
                                    print(self.tr("Reason: currData is empty"))
                                    print(self.tr("*******Terminating application*******"))
                                    sys.exit()
                            elif parseAs == "element":
                                # Open a new nested container on the stack.
                                currData.append((tokenName, {}))
                            else:
                                print(self.tr("******WARNING******"))
                                print(self.tr("parseAs attribute is not given correctly"))
                                print(self.tr("******WARNING******"))
                            print(self.tr("--------------------"))
                        elif token == parser.EndElement:
                            tokenName = parser.name()
                            print(self.tr("Reading EndElement: " + str(tokenName)))
                            print(self.tr("currData before: " + str(currData)))
                            # Remember the root element's name the first time
                            # any element closes.
                            if not haveStartToken:
                                startToken = currData[0][0]
                                haveStartToken = True
                            # NOTE(review): 'i' is reused by the inner
                            # range(5) loop below, clobbering this iterator
                            # variable — works only because of the breaks in
                            # control flow; confirm intended.
                            for i in currData:
                                if i[0] == tokenName:
                                    print(self.tr("Closing token: " + str(tokenName)))
                                    if i[0] != startToken:
                                        # Fold the closed element into its parent
                                        # and pop it off the stack.
                                        currData[len(currData)-2][1][tokenName] = currData[len(currData)-1][1]
                                        del currData[len(currData)-1]
                                        print(self.tr("currData after: " + str(currData)))
                                        print(self.tr("--------------------"))
                                    elif i[0] == startToken:
                                        # Root closed: commit the whole tree.
                                        print(self.tr("This is the final token, writing to self.data"), end = "")
                                        self.data[startToken] = currData[0][1]
                                        for i in range(5):
                                            time.sleep(0.25)
                                            print(self.tr("."), end = "")
                                        print(self.tr("done."))
                                        print(self.tr("--------------------"))
                        elif token == parser.Characters:
                            print(self.tr("Characters value: " + str(parser.text())))
                            print(self.tr("--------------------"))
                        elif token == parser.StartDocument:
                            self.parsingLog["File: "+str(filePath)] = {}
                            self.parsingLog["File: "+str(filePath)]["XML Version"] = str(parser.documentVersion())
                            self.parsingLog["File: "+str(filePath)]["XML Encoding"] = str(parser.documentEncoding())
                            print(self.tr("File Version: " + str(self.parsingLog["File: "+str(filePath)]["XML Version"])))
                            print(self.tr("File Encoding: " + str(self.parsingLog["File: "+str(filePath)]["XML Encoding"])))
                        elif token == parser.EndDocument:
                            print(self.tr("Cleaning up"), end = "")
                            for i in range(5):
                                time.sleep(0.25)
                                print(self.tr("."), end = "")
                            time.sleep(0.1)
                            print(self.tr("done."))
                            print(self.tr("self.data: " + str(self.data)))
                            # NOTE(review): this debug dump hard-codes keys from
                            # the author's test file; it will raise KeyError on
                            # any other data set.
                            print(self.tr("types of data: yesNo (should be str) - " +
                                          str(type(self.data["building"]["specialSlot"]["yesNo"])) +
                                          " - id - should be int - " + str(type(self.data["building"]["specialSlot"]["id"])) +
                                          " - isItFloat - should be float - " + str(type(self.data["building"]["specialSlot"]["isItFloat"]))))
                            print(self.tr("--------------------"))
                    else:
                        print(self.tr("XML file is not well-formatted"))
            else:
                print(self.tr("xml file did not open properly"))
def parseText(self, text):
if isinstance(text, str):
if text == "":
return str(text)
for i in text:
if i not in ("0123456789."):
return str(text)
for j in text:
if j not in ("0123456789"):
return float(text)
return int(text)
else:
return ValueError
    def printTokenType(self, token):
        """Map a QXmlStreamReader token-type constant to its printable name."""
        if token == QtCore.QXmlStreamReader.NoToken:
            return "NoToken"
        elif token == 1:
            # NOTE(review): 1 is presumably QXmlStreamReader.Invalid — the
            # only branch using a bare literal; confirm against the Qt docs.
            return "Invalid"
        elif token == QtCore.QXmlStreamReader.StartDocument:
            return "StartDocument"
        elif token == QtCore.QXmlStreamReader.EndDocument:
            return "EndDocument"
        elif token == QtCore.QXmlStreamReader.StartElement:
            return "StartElement"
        elif token == QtCore.QXmlStreamReader.EndElement:
            return "EndElement"
        elif token == QtCore.QXmlStreamReader.Characters:
            return "Characters"
        elif token == QtCore.QXmlStreamReader.Comment:
            return "Comment"
        elif token == QtCore.QXmlStreamReader.DTD:
            return "DTD"
        elif token == QtCore.QXmlStreamReader.EntityReference:
            return "EntityReference"
        elif token == QtCore.QXmlStreamReader.ProcessingInstruction:
            return "ProcessingInstruction"