Python XML Sax Truncates String with no Special Characters - python

I downloaded some US census area file in KML format. You can download the file here. I am trying to grab the area name and the coordinate boundaries. For some reason, some of the coordinate fields are truncated and not read correctly. For example, the coordinates for "Bloomsburg-Berwick-Sunbury, PA" appears in the KML file as
<coordinates>-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.008418,40.659912,0.0 -76.996995,40.635778,0.0 -76.965528,40.647149,0.0 -76.944828,40.650209,0.0 -76.939883,40.638142,0.0 -76.949148,40.628167,0.0 -76.918672,40.603466,0.0 -76.886411,40.617758,0.0 -76.864254,40.627585,0.0 -76.840104,40.625439,0.0 -76.810269,40.634526,0.0 -76.810044,40.640102,0.0 -76.804867,40.646839,0.0 -76.793851,40.640514,0.0 -76.745894,40.654464,0.0 -76.701624,40.658082,0.0 -76.700546,40.663114,0.0 -76.662137,40.674013,0.0 -76.562175,40.709007,0.0 -76.469523,40.743188,0.0 -76.380334,40.775445,0.0 -76.30717,40.801809,0.0 -76.2991,40.831191,0.0 -76.284611,40.883588,0.0 -76.207827,40.94974,0.0 -76.231194,41.050168,0.0 -76.228975,41.138466,0.0 -76.277639,41.131804,0.0 -76.317953,41.205453,0.0 -76.319957,41.211255,0.0 -76.310261,41.310198,0.0 -76.407934,41.308418,0.0 -76.447597,41.275629,0.0 -76.592607,41.157765,0.0 -76.640767,41.155718,0.0 -76.678776,41.154172,0.0 -76.732672,41.17204,0.0 -76.790807,41.175732,0.0 -76.828168,41.16578,0.0 -76.880963,41.158044,0.0 -76.884245,41.157099,0.0 -76.885228,41.155973,0.0 -76.888145,41.153807,0.0 -76.889338,41.151988,0.0 -76.889669,41.150791,0.0 -76.896114,41.13907,0.0 -76.960229,41.148801,0.0 -76.977939,41.087883,0.0 -77.058088,41.085575,0.0 -77.113839,41.069032,0.0 -77.144111,41.06884,0.0 -77.14416,41.044338,0.0 -77.204027,40.99271,0.0 -77.279236,40.90971,0.0 -77.36418,40.846937,0.0</coordinates>
But is truncated at character 297 out of 1664. This happens seemingly randomly for others as well. Size doesn't seem to be an issue.
['-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.00841']
I tried on two different ec2 machines so I don't think it's a memory/hardware issue. Any idea what is going on?
from xml.sax.handler import ContentHandler
from xml.sax import parse
# SAX handler from the question: collects place names and coordinate text for
# each Placemark element of a KML file.
# NOTE: indentation was lost in this listing; the code is otherwise unchanged.
# NOTE: this is the version that exhibits the truncation being asked about.
# characters() stores only the first chunk it receives for a <coordinates>
# element and then clears its capture flag, so later chunks are dropped.
class KMLHandler(ContentHandler):
def __init__(self):
super().__init__()
# Every non-blank NAME text chunk seen so far.
self.place_names = []
# Name recorded for the Placemark currently being read.
self.current_name = None
# One entry (a list of coordinate strings) per finished Placemark.
self.coordinates = []
# Coordinate strings gathered for the current Placemark.
self.temp_coordinates = []
self.start_placemark = False
# Flags set in startElement and cleared in characters after one chunk.
self.capture_place_name = False
self.capture_cordinates = False
# Maps place name -> coordinate strings, filled when a Placemark ends.
self.mapping_dict = {}
def startElement(self, name, attrs):
if name == 'Placemark':
# first_placemark is only ever assigned here; nothing visible reads it.
self.first_placemark = True
self.start_placemark = True
self.temp_coordinates = []
self.current_name = None
else:
pass
if name == "SimpleData":
# attrs['name'] raises KeyError if the element has no "name" attribute.
if attrs['name'] == "NAME":
self.capture_place_name = True
if name == "coordinates":
self.capture_cordinates = True
def endElement(self, name):
if name == "Placemark":
self.start_placemark = False
self.coordinates.append(self.temp_coordinates)
self.mapping_dict[self.current_name] = self.temp_coordinates
def characters(self, content):
# Whitespace-only chunks are ignored.
if content.strip() != "":
if self.capture_place_name == True:
self.place_names.append(content)
self.current_name = content
self.capture_place_name = False
if self.capture_cordinates == True:
# str_vals is computed but never used.
str_vals = [x.split(',')[0:2] for x in content.split(' ')]
self.temp_coordinates.append(content)
# Clearing the flag here is what discards the second and later chunks.
self.capture_cordinates = False
fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'
handler = KMLHandler()
parse(fname, handler)

As indicated in the comments, each characters event returns a chunk, which may or may not be the entire tag contents. It's similar to reading from a network; you might not get everything at once.
I reworked your code below, and it seems to report the right answer for Berwick. On my machine, the first chunk is 283 characters and the 2nd chunk is 1353 characters. 283 + 1353 = 1636, which matches the size of the data in the file.
Instead of a set of Booleans, I think it's simpler to capture the tag name, and then test for that when you're processing characters. There's only one controlling value, and it's set & reset in one place.
I didn't see a need for temp_coordinates. It wasn't clear to me whether you want coordinates to be a list or what, exactly, so I just grab the string.
from xml.sax import parse
from xml.sax.handler import ContentHandler
class KMLHandler(ContentHandler):
    """Collect place names and boundary coordinates from a KML document.

    A SAX parser may deliver the text of one element through several
    ``characters`` calls, so text is buffered and only recorded once the
    element that owns it is finished.

    Attributes:
        place_names: every NAME value seen, in document order.
        current_name: NAME of the Placemark currently being read.
        coordinates: complete ``<coordinates>`` strings of the current
            Placemark (emptied when the Placemark ends).
        mapping_dict: place name -> list of complete coordinate strings.
        capture: tag name whose text is being buffered, or '' for none.
    """

    def __init__(self):
        super().__init__()
        self.place_names = []
        self.current_name = None
        self.coordinates = []
        self.first_placemark = False
        self.start_placemark = False
        self.capture_place_name = False
        self.mapping_dict = {}
        self.capture = ''
        self._chunks = []

    def _flush(self):
        """Record the buffered text for the captured element and stop capturing."""
        text = ''.join(self._chunks)
        self._chunks = []
        if text.strip():
            if self.capture == 'SimpleData':
                self.place_names.append(text)
                self.current_name = text
            elif self.capture == 'coordinates':
                self.coordinates.append(text)
        self.capture = ''

    def startElement(self, name, attrs):
        # A new element ends any capture in progress.
        self._flush()
        if name == 'Placemark':
            self.first_placemark = True
            self.start_placemark = True
            self.current_name = None
        elif name == 'SimpleData':
            # .get() tolerates SimpleData elements without a "name" attribute.
            if attrs.get('name') == 'NAME':
                self.capture = name
        elif name == 'coordinates':
            self.capture = name

    def endElement(self, name):
        # Text that follows a closing tag must not be attributed to that tag.
        self._flush()
        if name == 'Placemark':
            self.start_placemark = False
            self.mapping_dict[self.current_name] = self.coordinates
            self.coordinates = []

    def characters(self, content):
        # Only buffer here; the chunks are joined in _flush().
        if self.capture:
            self._chunks.append(content)
fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'
handler = KMLHandler()
parse(fname, handler)

Related

python OOP functions into classes/method

I am coding a huffman coding tree in python, I have used one class for tree nodes, but I want the whole program to be object oriented. I just cant seem to be able to turn my functions into classes and run the whole thing as OOP. Is it possible to convert functions into classes/methods or does it involve rewriting the entire code in OOP style. The code works ok, im just trying to get my head around OOP and how to implement it. Any help would be great! Code below.
'''
import heapq
class TreeNode(object):
    """Node of a Huffman tree.

    Leaves carry a character; internal nodes have ``char`` set to None and
    hold the combined frequency of their children.
    """

    def __init__(self, freq, char=None, left=None, right=None):
        self.char = char
        self.freq = freq
        self.left = left
        self.right = right

    def __lt__(self, other):
        # Ordering by frequency lets heapq compare nodes directly.
        return self.freq < other.freq

    def isLeaf(self):
        """Return True when the node has no children."""
        return self.left is None and self.right is None
def createTree(freqData):
    """Build a Huffman tree from a ``{character: frequency}`` mapping.

    Returns the root node, or None when ``freqData`` is empty.
    """
    heap = [TreeNode(freqData[symbol], symbol) for symbol in freqData]
    heapq.heapify(heap)
    while len(heap) > 1:
        # Merge the two least frequent nodes under a new parent.
        first = heapq.heappop(heap)
        second = heapq.heappop(heap)
        merged = TreeNode(first.freq + second.freq, left=first, right=second)
        heapq.heappush(heap, merged)
    if not heap:
        return None
    return heapq.heappop(heap)
def hTreeToHCode(hTree):
    """Return a ``{character: bit string}`` table for the given Huffman tree.

    Also prints one line per visited node: a description for nodes that carry
    a character, an empty line for the others.
    """
    table = {}

    def walk(node, prefix=""):
        if node is None:
            return
        if node.left is None and node.right is None:
            table[node.char] = prefix
        # Children are visited before the node itself is reported.
        walk(node.left, prefix + "0")
        walk(node.right, prefix + "1")
        if node.char is None:
            print("")
        else:
            print('Character = {} : Freq = {} --- Encoded into {}'.format(node.char, node.freq, prefix))

    walk(hTree)
    return table
def encode(s, freqData):
    """Encode ``s`` as a string of '0'/'1' using the tree built from ``freqData``."""
    codes = hTreeToHCode(createTree(freqData))
    bits = "".join(codes[symbol] for symbol in s)
    return bits.strip()
def decode(s, freqData):
    """Decode a bit string produced by ``encode`` with the same ``freqData``."""
    root = createTree(freqData)
    pieces = []
    node = root
    for bit in s:
        # "0" goes left; any other character goes right.
        node = node.left if bit == "0" else node.right
        if node.isLeaf():
            pieces.append(node.char)
            node = root
    return "".join(pieces)
words = "hello welcome to my huffman algorithm code"
charlst = {}
for char in words:
charlst[char] = charlst.get(char,0) + 1
freqData = charlst
encodedStr = encode(words, freqData)
print("encodedStr", encodedStr)
decodedStr = decode(encodedStr, freqData)
print("decodedStr", decodedStr)
'''
You can move the functions that currently live outside the TreeNode class into a Main class, add a run method that does the variable initialisation and the encode/decode calls, and put the following at the end of your program:
if __name__=='__main__':
Main.run()

Why inputbox is cleaning when i write something?

I have a curses program in Python, and this is a fragment of one of its classes. I call def control_chat(self, sel_chat) from another class. Whenever the messages are updated (self.messages is a list of items like ["message", id_of_msg]), the input box's window clears itself. However, I only clear the screen by writing empty lines, and I do not touch the last 3 lines, which is where my input box is.
def input_validator(self, key):
    """Translate a keystroke for the text box.

    Returns 7 when ``key`` is 10 or ``self.new_msg`` is truthy; otherwise the
    key is returned unchanged.
    """
    # NOTE(review): 10 is presumably Enter and 7 the Ctrl-G code that ends
    # Textbox.edit() -- confirm against the curses.textpad documentation.
    if key != 10 and not self.new_msg:
        return key
    return 7
# Input loop, started as a Thread target by control_chat. While self.chatting
# is true it reads text from a curses Textbox and sends any non-empty result
# through self.vk.send_message.
# NOTE(review): indentation was lost in this listing; the window and Textbox
# are presumably created anew on every pass of the loop -- confirm.
def get_msg(self):
while self.chatting:
# 2 rows high, self.cols wide, starting at row self.lines-2, column 0.
win = curses.newwin(2, self.cols, self.lines-2, 0)
inp = curses.textpad.Textbox(win)
# input_validator decides which keystrokes are passed on or translated.
text = inp.edit(validate=self.input_validator)
if text != "":
self.vk.send_message(self.peer_id, text)
def chat_clear(self):
    """Overwrite every row except the last three with ``self.empity_line``."""
    for row in range(self.lines - 3):
        self.wprint(self.empity_line, y=row, x=0)
# Entry point for a chat session: remembers the selected chat, resolves its
# peer id, starts get_msg on a separate Thread, then keeps fetching and
# drawing the message list while self.chatting is true.
def control_chat(self, sel_chat):
self.sel_chat = sel_chat
self.peer_id = self.vk.id_to_peer_id(self.sel_chat)
# The input loop runs concurrently with the drawing loop below.
self.input_thread = Thread(target=self.get_msg)
self.input_thread.start()
# NOTE(review): this loop has no visible delay between iterations, and it
# draws on the screen while the input thread is using its own window.
while self.chatting:
self.messages = self.vk.list_chat(self.peer_id)
self.draw_chat()
# Redraws the chat area: clears it, draws the top bar, then writes each
# message as "<message[1]>: <message[0]>" with message[1] in colour pair 1.
# NOTE(review): indentation was lost in this listing; the wprint call and
# the reset of self.messages presumably run once after the loop -- confirm.
def draw_chat(self):
self.chat_clear()
self.top_bar_draw()
for message in self.messages:
name_line = str(message[1])
self.stdscr.attron(color_pair(1))
self.stdscr.addstr(name_line)
self.stdscr.attroff(color_pair(1))
self.stdscr.addstr(": "+message[0]+"\n")
# Separator line drawn three rows above the bottom of the screen.
self.wprint(self.screen_line, y=self.lines-3, x=0)
self.messages = []
# time.sleep()
# Writes str followed by a newline to self.stdscr and refreshes the screen.
# When both y and x are left at -1 the text goes to the current cursor
# position; otherwise it is written at (y, x).
# NOTE: the parameter name "str" shadows the built-in type inside this method.
# NOTE(review): indentation was lost in this listing; refresh() presumably
# runs on both branches -- confirm.
def wprint(self, str, y=-1, x=-1):
if (y == -1) and (x == -1):
self.stdscr.addstr(str+"\n")
else:
self.stdscr.addstr(y, x, str+"\n")
self.stdscr.refresh()

Class inheritance type checking after pickling in Python

Is there a sure-fire way to check that the class of an object is a sub-class of the desired super?
For Example, in a migration script that I'm writing, I have to convert objects of a given type to dictionaries in a given manner to ensure two-way compatability of the data.
This is best summed up like so:
Serializable
User
Status
Issue
Test
Set
Step
Cycle
However, when I'm recursively checking objects after depickling, I receive a Test object that yields the following results:
Testing data object type:
type(data)
{type}< class'__main.Test' >
Testing Class type:
type(Test())
{type}< class'__main.Test' >
Testing object type against class type:
type(Test()) == type(data)
{bool}False
Testing if object isinstance() of Class:
isinstance(data, Test)
{bool}False
Testing if Class isinstance() of Super Class:
isinstance(Test(), Serializable)
{bool}True
Testing isinstance() of Super Class::
isinstance(data, Serializable)
{bool}False
Interestingly, it doesn't appear to have any such problem prior to pickling as it handles the creation of dictionary and integrity hash just fine.
This only crops up with depickled objects in both Pickle and Dill.
For Context, here's the code in it's native environment - the DataCache object that is pickled:
class DataCache(object):
    """Container that pairs cached data with a SHA-256 integrity hash."""

    _hash = ""
    _data = None

    @staticmethod
    def genHash(data):
        """Return the SHA-256 digest of the sorted-key JSON form of ``data``."""
        dataDict = DataCache.dictify(data)
        datahash = json.dumps(dataDict, sort_keys=True)
        # hashlib requires bytes; json.dumps emits ASCII-only text by default.
        return hashlib.sha256(datahash.encode("utf-8")).digest()

    @staticmethod
    def dictify(data):
        """Recursively convert lists, dicts and Serializable objects to plain data."""
        if isinstance(data, list):
            return [DataCache.dictify(item) for item in data]
        if isinstance(data, dict):  # also covers collections.OrderedDict
            converted = collections.OrderedDict()
            # Iterate the input mapping, not the (still empty) result.
            for key, value in data.items():
                converted[key] = DataCache.dictify(value)
            return converted
        if isinstance(data, Serializable):
            return data.toDict()
        return data

    def __init__(self, restoreDict=None):
        if restoreDict:
            self.__dict__.update(restoreDict)

    def __getinitargs__(self):
        # pickle expects a tuple of constructor arguments.
        return (self.__dict__,)

    def set(self, data):
        """Store ``data`` together with its hash."""
        self._hash = DataCache.genHash(data)
        self._data = data

    def verify(self):
        """Return True when the stored data still matches the stored hash."""
        dataHash = DataCache.genHash(self._data)
        return self._hash == dataHash

    def get(self):
        """Return the stored data."""
        return self._data
Finally, I know there's arguments for using JSON for readability in storage, I needed Pickle's ability to convert straight to and from Objects without specifying the object type myself. (thanks to the nesting, it's not really feasible)
Am I going mad here or does pickling do something to the class definitions?
EDIT:
Minimal Implementation:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from aenum import Enum
import json # _tricks
import base64
import argparse
import os
import sys
import datetime
import dill
import hashlib
import collections
class Serializable(object):
    """Base class for objects that convert to and from plain dictionaries."""

    def __init__(self, initDict=None):
        if initDict:
            self.__dict__.update(initDict)

    def __str__(self):
        return str(self.sortSelf())

    def sortSelf(self):
        """Return the instance attributes as an OrderedDict sorted by name."""
        return collections.OrderedDict(sorted(self.__dict__.items()))

    def toDict(self):
        """Return the instance's attribute dictionary (the live one, not a copy)."""
        return self.__dict__

    def fromDict(self, dict):
        """Copy values from ``dict`` for the attributes this object already has.

        Keys that are not existing attributes are ignored, so excess data does
        not pollute the object. Returns ``self``, or None when ``dict`` is
        empty or the object has no attributes yet.
        """
        attributes = self.__dict__
        if not (dict and attributes):
            return None
        # Only values change, never the key set, so iterating here is safe.
        for key in attributes:
            if key in dict:
                attributes[key] = dict[key]
        return self
class Issue(Serializable):
    """Issue with an id, a key and a ``fields`` dictionary."""

    def __init__(self, initDict=None):
        self.id = 0
        self.key = ""
        self.fields = {}
        if initDict:
            self.__dict__.update(initDict)
        Serializable.__init__(self)

    def fieldToDict(self, obj, key, type):
        """Return ``obj[key]`` as a dict when it is an instance of ``type``.

        Returns None when the key is missing or its value is None; values of
        any other type are returned unchanged.
        """
        if key not in obj:
            return None
        result = obj[key]
        if result is None:
            return None
        if isinstance(result, type):
            return result.toDict()
        return result

    def fromDict(self, jsonDict):
        """Populate the issue and wrap the well-known fields in their classes."""
        super(Issue, self).fromDict(jsonDict)
        # Copy so that wrapping the fields does not rewrite the caller's dict.
        self.fields = dict(self.fields)
        # .get() tolerates input without these keys (the field becomes None).
        self.fields["issuetype"] = IssueType().fromDict(self.fields.get("issuetype"))
        self.fields["assignee"] = User().fromDict(self.fields.get("assignee"))
        self.fields["creator"] = User().fromDict(self.fields.get("creator"))
        self.fields["reporter"] = User().fromDict(self.fields.get("reporter"))
        return self

    def toDict(self):
        """Return a dictionary form of the issue without blank fields.

        Works on shallow copies so that serialising never rewrites the
        instance itself.
        """
        result = dict(super(Issue, self).toDict())
        fields = dict(
            (name, value) for name, value in self.fields.items()
            if value is not None
        )
        for name, cls in (("issuetype", IssueType), ("creator", User),
                          ("reporter", User), ("assignee", User)):
            fields[name] = self.fieldToDict(fields, name, cls)
        result["fields"] = fields
        return result
class IssueType(Serializable):
    """Issue type identified by an id and a name."""

    def __init__(self):
        self.id, self.name = 0, ""

    def toDict(self):
        """Return only the id, rendered as a string."""
        return dict(id=str(self.id))
class Project(Serializable):
def __init__(self):
Serializable.__init__(self)
self.id = 0
self.name = ""
self.key = ""
class Cycle(Serializable):
    """Test cycle record; every attribute starts at an empty default."""

    def __init__(self):
        self.id = 0
        self.name = ""
        self.totalExecutions = 0
        self.endDate = ""
        self.description = ""
        self.totalExecuted = 0
        self.started = ""
        self.versionName = ""
        self.projectKey = ""
        self.versionId = 0
        self.environment = ""
        self.totalCycleExecutions = 0
        self.build = ""
        self.ended = ""
        self.modifiedBy = ""
        self.projectId = 0
        self.startDate = ""
        self.executionSummaries = {'executionSummary': []}
class Step(Serializable):
    """Single test step."""

    def __init__(self):
        self.id = ""
        self.orderId = 0
        self.step = self.data = self.result = ""
        self.attachmentsMap = {}

    def toDict(self):
        """Return the step text, data and result with an empty attachment list."""
        return {
            "step": self.step,
            "data": self.data,
            "result": self.result,
            "attachments": [],
        }
class Status(Serializable):
def __init__(self):
self.id = 0
self.name = ""
self.description = ""
self.isFinal = True
self.color = ""
self.isNative = True
self.statusCount = 0
self.statusPercent = 0.0
class User(Serializable):
def __init__(self):
self.displayName = ""
self.name = ""
self.emailAddress = ""
self.key = ""
self.active = False
self.timeZone = ""
class Execution(Serializable):
    """Execution record of a test within a cycle."""

    def __init__(self):
        self.id = 0
        self.orderId = 0
        self.cycleId = -1
        self.cycleName = ""
        self.issueId = 0
        self.issueKey = 0
        self.projectKey = ""
        self.comment = ""
        # No trailing commas here: they would turn these values into tuples.
        self.versionId = 0
        self.versionName = ""
        self.executedOn = ""
        self.creationDate = ""
        self.executedByUserName = ""
        self.assigneeUserName = ""
        self.status = {}
        self.executionStatus = ""

    def fromDict(self, jsonDict):
        """Populate the execution and wrap ``status`` in a Status object."""
        super(Execution, self).fromDict(jsonDict)
        self.status = Status().fromDict(self.status)
        # This is already listed as Execution Status, need to associate and convert!
        return self

    def toDict(self):
        """Return a dictionary form with ``status`` converted as well.

        Works on a shallow copy so the instance keeps its Status object, and
        leaves ``status`` untouched when it has no ``toDict`` method (for
        example the default ``{}`` or None).
        """
        result = dict(super(Execution, self).toDict())
        status = result.get('status')
        if hasattr(status, 'toDict'):
            result['status'] = status.toDict()
        return result
class ExecutionContainer(Serializable):
    """Holds a list of Execution objects."""

    def __init__(self):
        self.executions = []

    def fromDict(self, jsonDict):
        """Rebuild ``executions`` from the "executions" list of ``jsonDict``."""
        super(ExecutionContainer, self).fromDict(jsonDict)
        parsed = self.executions = []
        parsed.extend(
            Execution().fromDict(entry) for entry in jsonDict["executions"]
        )
        return self
# Issue subclass whose "CustomField_0001" field carries a list of steps.
# NOTE: indentation was lost in this listing; the code is otherwise unchanged.
class Test(Issue):
# NOTE: the default argument is one shared dict object; it is only read here.
def __init__(self, initDict={}):
if initDict:
self.__dict__.update(initDict)
# Issue.__init__ assigns id, key and fields again after the update above,
# so values for those three taken from initDict do not survive.
Issue.__init__(self)
def toDict(self):
result = super(Test, self).toDict()
stepField = "CustomField_0001"
# Raises KeyError when the step field is absent from result["fields"].
if result["fields"][stepField]:
steps = []
for step in result["fields"][stepField]["steps"]:
steps.append(step.toDict())
# The field is replaced by the plain list of step dictionaries.
result["fields"][stepField] = steps
return result
def fromDict(self, jsonDict):
super(Test, self).fromDict(jsonDict)
stepField = "CustomField_0001"
steps = []
if stepField in self.fields:
for step in self.fields[stepField]["steps"]:
steps.append(Step().fromDict(step))
# NOTE(review): without indentation it is unclear whether this assignment
# belongs to the "if" above or always runs -- confirm against the original.
self.fields[stepField] = {"steps": steps}
return self
class Set(Issue):
    """Issue subclass representing a test set."""

    def __init__(self, initDict=None):
        # Let Issue apply its defaults first and initDict second, so that
        # id, key and fields supplied by the caller are not overwritten.
        Issue.__init__(self, initDict)
class DataCache(object):
    """Container that pairs cached data with a SHA-256 integrity hash."""

    _hash = ""
    _data = None

    @staticmethod
    def genHash(data):
        """Return the SHA-256 digest of the sorted-key JSON form of ``data``."""
        dataDict = DataCache.dictify(data)
        datahash = json.dumps(dataDict, sort_keys=True)
        # hashlib requires bytes; json.dumps emits ASCII-only text by default.
        return hashlib.sha256(datahash.encode("utf-8")).digest()

    @staticmethod
    def dictify(data):
        """Recursively convert lists, dicts and Serializable objects to plain data."""
        if isinstance(data, list):
            return [DataCache.dictify(item) for item in data]
        if isinstance(data, dict):  # also covers collections.OrderedDict
            converted = collections.OrderedDict()
            # Iterate the input mapping, not the (still empty) result.
            for key, value in data.items():
                converted[key] = DataCache.dictify(value)
            return converted
        if isinstance(data, Serializable):
            return data.toDict()
        return data

    def __init__(self, restoreDict=None):
        if restoreDict:
            self.__dict__.update(restoreDict)

    def __getinitargs__(self):
        # pickle expects a tuple of constructor arguments.
        return (self.__dict__,)

    def set(self, data):
        """Store ``data`` together with its hash."""
        self._hash = DataCache.genHash(data)
        self._data = data

    def verify(self):
        """Return True when the stored data still matches the stored hash."""
        dataHash = DataCache.genHash(self._data)
        return self._hash == dataHash

    def get(self):
        """Return the stored data."""
        return self._data
def saveCache(name, projectKey, object):
    """Wrap ``object`` in a DataCache and write it to the project's cache folder.

    The file is ``migration_caches/<projectKey>/<name>``; the folder is
    created when it does not exist yet.
    """
    filePath = "migration_caches/{projectKey}".format(projectKey=projectKey)
    if not os.path.exists(filePath):
        os.makedirs(filePath)
    cache = DataCache()
    cache.set(object)
    # "with" closes the file even when serialisation fails.
    with open("{path}/{name}".format(name=name, path=filePath), 'wb') as targetFile:
        dill.dump(obj=cache, file=targetFile)
def loadCache(name, projectKey):
    """Load and verify a cache written by ``saveCache``.

    Returns the cached data, or False when the file cannot be opened, cannot
    be read completely, does not hold a DataCache, or fails verification.
    A cached value that is itself falsy cannot be told apart from a failure.
    """
    filePath = "migration_caches/{projectKey}/{name}".format(name=name, projectKey=projectKey)
    result = False
    try:
        # "with" closes the file on every path; nothing is closed when
        # open() itself fails.
        with open(filePath, 'rb') as targetFile:
            try:
                # NOTE: unpickling runs code chosen by the file's author;
                # only load cache files from a trusted location.
                cache = dill.load(targetFile)
            except EOFError:
                print("Failed to load cache from file: {filePath}\n".format(filePath=filePath))
            else:
                if isinstance(cache, DataCache) and cache.verify():
                    result = cache.get()
    except IOError:
        print("Failed to load cache file at: {filePath}\n".format(filePath=filePath))
    return result
testIssue = Test().fromDict({"id": 1000,
"key": "TEST",
"fields": {
"issuetype": {
"id": 1,
"name": "TestIssue"
},
"assignee": "Minothor",
"reporter": "Minothor",
"creator": "Minothor",
}
})
saveCache("Test", "TestProj", testIssue)
result = loadCache("Test", "TestProj")
EDIT 2
The script in it's current form, now seems to work correctly with vanilla Pickle, (initially switched to Dill due to a similar issue, which was solved by the switch).
However, if you are here with this issue and require Dill's features, then as Mike noted in the comments - it's possible to change the settings in dill.settings to have Dill behave pickle referenced items only with joblib mode, effectively mirroring pickle's standard pickling behaviour.

Why does this character ▯ appear?

So this character ▯ appears when I run my code which I think means there is a missing character therefor it can't be displayed. (Not sure correct me if I am wrong) And well basically I want to be able to get rid of that character. Here is what it looks like when I run my code:
However in the back-end in the idle when I click on one of the boxes for it to be displayed up top it doesn't register and looks like this in idle:
Why does it appear on screen if it isn't going to appear in idle?
Also how can I get rid of the ▯ character from the main screen?
Here is my full code.
Here are segments in which I think the problem lies. (However I have not been able to solve the problem)
My classes for Tree comparison to find the sentences and their frequent use:
class Branch():
    """Tree node holding a value and how many times it has been seen."""

    def __init__(self, value):
        self.left = self.right = None
        self.value = value
        self.frequency = 1

    def incFreq(self):
        """Count one more occurrence of the value."""
        self.frequency += 1

    def freq(self):
        """Return the number of occurrences counted so far."""
        return self.frequency
class Tree():
    """Binary tree of Branch nodes that counts how often each value is inserted."""

    # Class-level default kept for callers that read Tree.highest; every
    # instance gets its own list in __init__ so results are not shared.
    highest = []

    def __init__(self):
        self.root = None
        self.found = False
        self.highest = []

    def findHighest(self):
        """Return ``(value, frequency)`` pairs, most frequent first."""
        from operator import itemgetter
        self.highest = []
        self.inorder(self.root)
        self.highest = sorted(self.highest, key=itemgetter(1), reverse=True)
        return self.highest

    def lessThan(self, a, b):
        """Return False when ``a`` exceeds ``b`` at any shared position.

        Only the first ``min(len(a), len(b))`` characters are compared. This
        is not a lexicographic comparison; it only decides on which side a
        new value is inserted.
        """
        return not any(x > y for x, y in zip(a, b))

    def outputTree(self):
        """Traverse the tree, appending every pair to ``self.highest``."""
        self.inorder(self.root)

    def insert(self, value):
        """Insert ``value``, or increase its frequency when already present."""
        # exists() increments the frequency as a side effect when it finds it.
        if not self.exists(value):
            self.root = self.insertAtBranch(self.root, value)

    def exists(self, value):
        """Return True when ``value`` is in the tree (and count the hit)."""
        self.found = False
        self.findAtBranch(self.root, value)
        return self.found

    def findAtBranch(self, branch, value):
        """Search the subtree at ``branch``; set ``self.found`` on a match."""
        if branch is None:
            return
        if branch.value == value:
            self.found = True
            branch.incFreq()
            return
        # Both sides are searched; the insertion order is not relied upon.
        self.findAtBranch(branch.left, value)
        self.findAtBranch(branch.right, value)

    def insertAtBranch(self, branch, value):
        """Insert ``value`` below ``branch`` and return the subtree root."""
        if branch is None:
            return Branch(value)
        if self.lessThan(branch.value, value):
            branch.right = self.insertAtBranch(branch.right, value)
        else:
            branch.left = self.insertAtBranch(branch.left, value)
        return branch

    def inorder(self, branch):
        """Append ``(value, frequency)`` for ``branch``, then its left and right subtrees."""
        if branch is None:
            return
        self.highest.append((branch.value, branch.freq()))
        self.inorder(branch.left)
        self.inorder(branch.right)
This is where I use the tree and pass sentences to be used on a different function:
# Returns up to numToReturn (phrase, frequency) pairs, most frequent first,
# built from the lines of setPhrases.txt.
# NOTE: indentation was lost in this listing; the code is otherwise unchanged.
def getPhrases(self, numToReturn):
topPhrases = []
phrasesTree = Tree()
#load tree with phrases from phrase text file
# NOTE: the file is never closed, and each line is inserted as read, i.e.
# including its trailing newline character.
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line)
#create a list of the top n of phrases to return
val = 0
for phrase in phrasesTree.findHighest():
if val < numToReturn:
topPhrases.append(phrase)
val = val + 1
return topPhrases
This is where I use the sentences to be able to display them on the screen:
# Rebuilds the panes for the phrases view: removes the existing panes, sets
# the view flags, adds one pane per phrase returned by getPhrases(10), adds
# the "Boxes", "Keyboard" and "OK" panes, then draws everything.
# NOTE(review): indentation was lost in this listing; the three fixed panes
# are presumably added once after the loop -- confirm.
def createPhrases(self):
print("createPhrases")
self.deletePanes()
self.show_keyboard = False
self.show_words = False
self.show_phrases = True
self.show_terminal = True
words = self.getPhrases(10)
for word, count in words:
# The format string has a single placeholder, so count is not displayed.
self.addPane("{}".format(word, count), WORDS)
self.addPane("Boxes", PHRASE)
self.addPane("Keyboard", PHRASE)
self.addPane("OK", PHRASE)
self.drawPanes()
When you read lines from file, newline characters are at the end. pygame's documentation states that:
The text can only be a single line: newline characters are not rendered.
So, you should change this fragment:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line)
to this:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line.strip())

Python generate sorted list

I want to compress my movies automatically. So I've written a mediainfo wrapper class in python, to generate a xml output, which I then parse to a movieinfo class, with a list of audio and subtitle tracks.
__author__ = 'dominik'
class Error(Exception):
""" Error class
"""
class ValidationError(Error):
""" Invalid or missing xml items
"""
class MovieInfo(object):
    """ Description of movie file

    Sorts the tracks of ``media_info`` into one video track, a list of audio
    tracks and a list of subtitle tracks. Other track types are ignored.
    """

    def __init__(self, media_info):
        self._video_track = None
        self._audio_tracks = []
        self._subtitle_tracks = []
        self.valid_movie = True
        for track in media_info.tracks:
            if track.track_type == "Audio":
                self._audio_tracks.append(AudioTrack(track))
            elif track.track_type == "Text":
                self._subtitle_tracks.append(SubtitleTrack(track))
            elif track.track_type == "Video":
                # A later video track replaces an earlier one.
                self._video_track = VideoTrack(track)

    @property
    def audio_tracks(self):
        """List of audio tracks, or None when there are none."""
        if not hasattr(self, "_audio_tracks"):
            self._audio_tracks = []
        return self._audio_tracks or None

    @property
    def subtitle_tracks(self):
        """List of subtitle tracks, or None when there are none."""
        if not hasattr(self, "_subtitle_tracks"):
            self._subtitle_tracks = []
        return self._subtitle_tracks or None
class Track(object):
    """ Abstract track class for audio and subtitle tracks
    """
    __KNOWN_LANGUAGE_CODES = {"en": "ENG", "de": "DE"}

    def __init__(self, track, valid_codecs):
        self._valid = True
        # Resolve everything first, then store it in one step.
        parsed = (
            int(track.id),
            self._determine_codec(track.codec_id, valid_codecs),
            self._determine_language(track.language),
        )
        self._id, self._codec_id, self._language = parsed

    def _resolve(self, key, table):
        """Look ``key`` up in ``table``; mark the track invalid when unknown."""
        found = table.get(key, None)
        if found is None:
            self._valid = False
        return found

    def _determine_codec(self, track_codec, types):
        """Return the codec name for ``track_codec``, or None when unknown."""
        return self._resolve(track_codec, types)

    def _determine_language(self, track_language, types=__KNOWN_LANGUAGE_CODES):
        """Return the language code for ``track_language``, or None when unknown."""
        return self._resolve(track_language, types)
class AudioTrack(Track):
    """ Audio track class (track type 1)
    """
    __KNOWN_AUDIO_CODECS = {"A_DTS": "DTS", "A_AC3": "AC3"}

    def __init__(self, track):
        self._type = 1
        super(AudioTrack, self).__init__(track, self.__KNOWN_AUDIO_CODECS)
class SubtitleTrack(Track):
    """ Subtitle track class (track type 2)
    """
    __KNOWN_SUBTITLE_CODECS = {"S_VOBSUB": "VOBSUB"}

    def __init__(self, track):
        self._type = 2
        # Forced only when the source says exactly "Yes".
        self._forced = True if track.forced == "Yes" else False
        super(SubtitleTrack, self).__init__(track, self.__KNOWN_SUBTITLE_CODECS)
class VideoTrack(object):
    """ Video track class (only one video track in movie info!)

    The frame rate is stored as a float; width and height are stored as given.
    """

    def __init__(self, track):
        self._type = 0
        self._framerate = float(track.frame_rate)
        self._width, self._height = track.width, track.height
Here is the mediainfo class (it's the pymediainfo class):
from subprocess import Popen
import os
from tempfile import mkstemp
from bs4 import BeautifulSoup, NavigableString
from setuptools.compat import unicode
# Wraps one <track> element of mediainfo's XML output and exposes its child
# elements as attributes named after the lower-cased tag.
# NOTE: indentation was lost in this listing; the code is otherwise unchanged.
class Track(object):
""" Hold the track information
"""
# Called only when normal attribute lookup fails; makes unknown attributes
# read as None instead of raising AttributeError.
def __getattr__(self, item):
try:
return object.__getattribute__(self, item)
except:
pass
return None
def __init__(self, xml_track):
self.xml_track = xml_track
self.track_type = xml_track.attrs["type"]
for child in self.xml_track.children:
# Plain text nodes between the child elements are skipped.
if not isinstance(child, NavigableString):
node_name = child.name.lower().strip()
node_value = unicode(child.string)
node_other_name = "other_%s" % node_name
# The first value seen for a tag becomes the attribute itself; repeated
# tags are collected in a list stored under "other_<tag>".
if getattr(self, node_name) is None:
setattr(self, node_name, node_value)
else:
if getattr(self, node_other_name) is None:
setattr(self, node_other_name, [node_value, ])
else:
getattr(self, node_other_name).append(node_value)
# For every tag that had repeats, try to make the primary value an int.
# NOTE(review): presumably this loop runs once after the loop over the
# children, not inside it -- confirm against the original indentation.
for key in [c for c in self.__dict__.keys() if c.startswith("other_")]:
try:
primary = key.replace("other_", "")
setattr(self, primary, int(getattr(self, primary)))
except:
# The primary value is not an integer: promote the first alternative
# that is, and keep the old primary value among the alternatives.
for value in getattr(self, key):
try:
actual = getattr(self, primary)
setattr(self, primary, int(value))
getattr(self, key).append(actual)
break
except:
pass
def __repr__(self):
return("<Track id='{0}', type='{1}'>".format(self.id, self.track_type))
# Returns all attributes except the raw XML element as a dictionary.
def to_data(self):
data = {}
for k, v in self.__dict__.items():
if k != 'xml_track':
data[k] = v
return data
class Mediainfo(object):
    """ MediaInfo wrapper

    Holds the parsed XML output of the ``mediainfo`` command line tool and
    exposes its tracks.
    """

    def __init__(self, xml):
        self.xml_dom = xml
        if isinstance(xml, str):
            self.xml_dom = BeautifulSoup(xml, "xml")

    def _populate_tracks(self):
        """Fill ``self._tracks`` from the XML; does nothing without a document."""
        if self.xml_dom is None:
            return
        for xml_track in self.xml_dom.Mediainfo.File.find_all("track"):
            self._tracks.append(Track(xml_track))

    @property
    def tracks(self):
        """List of Track objects, built on first access."""
        if not hasattr(self, "_tracks"):
            self._tracks = []
        if len(self._tracks) == 0:
            self._populate_tracks()
        return self._tracks

    @staticmethod
    def parse(filename):
        """Run ``mediainfo`` on ``filename`` and return a Mediainfo object."""
        filehandler_out, filename_out = mkstemp(".xml", "mediainfo-")
        filehandler_err, filename_err = mkstemp(".error", "mediainfo-")
        try:
            with os.fdopen(filehandler_out, "r+b") as filepointer_out, \
                    os.fdopen(filehandler_err, "r+b") as filepointer_err:
                mediainfo_command = ["mediainfo", "-f", "--Output=XML", filename]
                p = Popen(mediainfo_command, stdout=filepointer_out, stderr=filepointer_err)
                p.wait()
                filepointer_out.seek(0)
                xml_dom = BeautifulSoup(filepointer_out.read(), "xml")
        finally:
            # mkstemp leaves deleting the files to the caller.
            for leftover in (filename_out, filename_err):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
        print(xml_dom)
        return Mediainfo(xml_dom)

    def to_data(self):
        """Return ``{'tracks': [...]}`` with one dictionary per track."""
        data = {'tracks': []}
        for track in self.tracks:
            data['tracks'].append(track.to_data())
        return data
This class gives me every track in the xml and then I parse the relevant info in movieinfo.
Ok now I have a list of audiotracks e.g. 3 tracks one in german language and DTS, one in german and AC3 and one in english and AC3. Now I want to get the ids from the tracks in the format "1,2,3" to give it to handbrake cli.
My problem is the order of the tracks. If there is a german DTS track this schould be the first track, the second track should be also the first, but compressed to aac and the third track should be one english track in AAC. If there is only a german AC3 track then the first track should be this track but compressed to AAC, and the second track should englisch and AAC.
I don't know exactly how I can achive that, can you help me? I'm new to python, and come from C, C++ and C#. In C# this is very easy to get with lambda.
Assuming you know how to define a comparator that, given two items, decides which one is bigger, Python can do this just as well as C or C++.
Start here -
1. https://wiki.python.org/moin/HowTo/Sorting/
https://developers.google.com/edu/python/sorting
http://docs.python.org/2/library/functions.html#sorted
Using sorted method and define the key you want.

Categories