I have two distinct programs that processed the same input files. During processing, both programs generated files whose names are based on their original counterparts. The possible changes, ignoring the modified paths and file extensions, are, in regex terms, as follows:
Program1 can append (_[[:alnum:]]+)* to the original filenames.
Program2 can append (_[[:alnum:]]+)* and prepend ([[:alnum:]]+_)* to the original filenames.
Now, given a list of several million file paths from Program2, I need to match each one of them to a unique file path from Program1.
What would be a sensible way to do it?
Here's where I'm stuck:
#!/usr/bin/env python3
import os
##### DATA #####
program1_files = [
    # Around 10 thousand unique items (parsed from a file).
    'path1/jobxxx/n1_file_001_xxx.tiff',
    'path1/jobxxx/n1_file_002_yyy_xxx.tiff',
    'path1/jobxxx/n2_file_001_yyy.tiff'
]
program2_files = [
    # Around 10 million items (parsed from a file),
    # with only around 10 thousand unique items.
    # IMO the items should be processed/matched directly while parsing the file.
    'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
    'path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc',
    'path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc',
    'path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc',
    'path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc',
    'path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc'
]
################
def get_parts(path):
    return os.path.splitext(os.path.basename(path))[0].split('_')

program1_fileparts = dict(zip(program1_files, map(get_parts, program1_files)))

file2_to_file1 = {}
for file2 in program2_files:
    if file2 in file2_to_file1:
        continue  # skip already processed file2
    file2_parts = get_parts(file2)
    for file1 in program1_files:
        file1_parts = program1_fileparts[file1]
        # here I'm a little lost on what to do
Here's a solution that uses a prefix tree to quickly match against the program 1 output files:
#!/usr/bin/env python3
import os
from collections import defaultdict
from typing import Any, Optional
##### DATA #####
program1_files = [
    "path1/jobxxx/n1_file_001_xxx.tiff",
    "path1/jobxxx/n1_file_002_yyy_xxx.tiff",
    "path1/jobxxx/n2_file_001_yyy.tiff",
]
program2_files = [
    "path2/JXX/00001_n1_file_001_XXX_yyy_zz.mrc",
    "path2/JXX/00002_n1_file_002_XXX_yyy_zz.mrc",
    "path2/jXX/00003_n2_file_001_XXX_yyy_zz.mrc",
    "path2/JZZ/00101_YYY_n1_file_001_xx_zzz.mrc",
    "path2/JZZ/00102_YYY_n2_file_001_xx_zzz.mrc",
]
################
def get_parts(s: str) -> list[str]:
    return os.path.splitext(os.path.basename(s))[0].split("_")

def build_prefix_tree(files: list[str]) -> dict[str, Any]:
    """Build a prefix tree of file path parts from program 1."""
    # Node = dict[str, Node | tuple[list[str], str]]
    def insert(node: dict[str, Any], parts: list[str], path: str) -> None:
        if parts[0] not in node:
            # create leaf entry
            node[parts[0]] = (parts[1:], path)
            return
        child = node[parts[0]]
        if isinstance(child, tuple):
            # found a collision with an existing leaf: expand it to a full node
            child_parts, child_path = child
            child = {child_parts[0]: (child_parts[1:], child_path)}
            node[parts[0]] = child
        # recurse into next entry
        insert(child, parts[1:], path)

    prefix_tree: dict[str, Any] = {}
    for file in files:
        insert(prefix_tree, get_parts(file), file)
    return prefix_tree

def recursive_match(node: dict[str, Any], parts: list[str]) -> Optional[str]:
    """Lookup a prefix match in `node`."""
    if parts[0] not in node:
        # couldn't match parts
        return None
    child = node[parts[0]]
    if isinstance(child, tuple):
        # found matching file
        return child[1]
    return recursive_match(child, parts[1:])

tree = build_prefix_tree(program1_files)
file1_to_file2: dict[str, list[str]] = defaultdict(list)
file2_to_file1: dict[str, str] = {}
for file2 in program2_files:
    if file2 in file2_to_file1:
        continue  # skip already processed file2
    file2_parts = get_parts(file2)
    for skip_parts in range(len(file2_parts)):
        # first try skipping no parts, then skip 1 part, then 2, and so on
        parts = file2_parts[skip_parts:]
        if (file1 := recursive_match(tree, parts)) is not None:
            file1_to_file2[file1].append(file2)
            file2_to_file1[file2] = file1
            break
    else:
        # reached the end of the loop without breaking
        print(f"Warning: couldn't find match for {file2}")
I use ruamel.yaml in order to parse YAML files and I'd like to identify if the key is the anchor itself or just a pointer. Given the following:
foo: &some_anchor
  bar: 1
baz: *some_anchor
I'd like to understand that foo is the actual anchor and baz is a pointer. From what I can see, there's an anchor property on the node (and also a yaml_anchor method), but both baz and foo show that their anchor is some_anchor, meaning that I cannot differentiate between them.
How can I get this info?
Since PyYAML and ruamel.yaml load an alias node as a reference to the object loaded from the corresponding anchor node, you can traverse the object tree and check whether each node is a reference to a previously visited object.
The following is a simple example that only checks dictionaries.
from ruamel.yaml import YAML

root = YAML().load('''
foo: &some_anchor
  bar: 1
baz: *some_anchor
''')

dict_ids = set()

def visit(parent):
    if isinstance(parent, dict):
        i = id(parent)
        print(parent, ', is_alias:', i in dict_ids)
        dict_ids.add(i)
        for k, v in parent.items():
            visit(v)
    elif isinstance(parent, list):
        for e in parent:
            visit(e)

visit(root)
This will output the following.
ordereddict([('foo', ordereddict([('bar', 1)])), ('baz', ordereddict([('bar', 1)]))]) , is_alias: False
ordereddict([('bar', 1)]) , is_alias: False
ordereddict([('bar', 1)]) , is_alias: True
In your example &some_anchor is the anchor for the single-element mapping bar: 1, and *some_anchor is the alias. Writing that "foo is the actual anchor and baz is a pointer" is IMO both incorrect terminology and confusing keys with their (anchored/aliased) values. If you had a YAML document:
- 3
- 5
- 9
- &some_anchor
  bar: 1
- 42
- *some_anchor
would you actually say, probably after carefully counting, that 4 is the anchor and 6 is the pointer (or 3 and 5, depending on where you start counting)?
If you want to test if a key of a dict has a value that was an anchored node in YAML, or if that value was an aliased node, you'll have to look at the value, and you'll find that they are the same Python data structure for keys foo resp. baz.
Which key's value gets the anchor on dumping, and which key's (or keys') value(s) get dumped as an alias, is entirely determined by which gets dumped first, as the YAML specification states that an anchor has to come before its use as an alias (an anchor can come after an alias only if it is re-defined).
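A quick way to see this in ruamel.yaml, as a minimal sketch (the reordered map below is built purely for illustration; both keys hold the same loaded object):

import sys
import ruamel.yaml

yaml = ruamel.yaml.YAML()
data = yaml.load("""\
foo: &some_anchor
  bar: 1
baz: *some_anchor
""")
# Dump the two keys in reverse order: the anchor now appears on 'baz',
# simply because its (shared) value is serialized first.
reordered = ruamel.yaml.comments.CommentedMap()
reordered['baz'] = data['baz']
reordered['foo'] = data['foo']
yaml.dump(reordered, sys.stdout)
# baz: &some_anchor
#   bar: 1
# foo: *some_anchor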
As @relent95 describes, you should recursively walk over the data structure you loaded (to see which key gets there first) and, in both ruamel.yaml and PyYAML, look at the id(). But for PyYAML that only works for complex data (dict, list, objects), as it throws away anchoring information and will not find the same id() on e.g. an anchored integer value.
The alternative to using the id() is to look at the actual anchor name that ruamel.yaml stores in the attribute/property anchor. If you know up front that your YAML document is as simple as your example (anchored/aliased nodes are values of the root-level mapping), you can do:
import sys
import ruamel.yaml

yaml_str = """\
foo: &some_anchor
  bar: 1
baz: *some_anchor
oof: 42
"""

def is_keys_value_anchor(key, data, verbose=0):
    anchor_found = set()
    for k, v in data.items():
        res = None
        try:
            anchor = v.anchor.value
            if anchor is not None:
                res = anchor not in anchor_found
                anchor_found.add(anchor)
        except AttributeError:
            pass
        if k == key:
            break
    if verbose > 0:
        print(f'key "{key}" {res}')
    return res

yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)
is_keys_value_anchor('foo', data, verbose=1)
is_keys_value_anchor('baz', data, verbose=1)
is_keys_value_anchor('oof', data, verbose=1)
which gives:
key "foo" True
key "baz" False
key "oof" None
But this is inefficient for root mappings with lots of keys, and won't find anchors/aliases that are nested deeply in the document. A more generic approach is to recursively walk the data structure once and create a dict with the anchor used as key, and as value a list of "paths". A path itself is a list of keys/indices with which you can traverse the data structure starting at the root. The first path in the list is the anchor, the rest are aliases:
import sys
import ruamel.yaml

yaml_str = """\
foo: &some_anchor
- bar: 1
- klm: &anchored_num 42
baz:
  xyz:
  - *some_anchor
oof: [1, 2, c: 13, magic: [*anchored_num]]
"""

def find_anchor_alias_paths(data, path=None, res=None):
    def check_add_anchor(d, path, anchors):
        # returns False when an alias is found, to prevent recursing into a node twice.
        try:
            anchor = d.anchor.value
            if anchor is not None:
                tmp = anchors.setdefault(anchor, [])
                tmp.append(path)
                return len(tmp) == 1
        except AttributeError:
            pass
        return True

    if path is None:
        path = []
    if res is None:
        res = {}
    if isinstance(data, dict):
        for k, v in data.items():
            next_path = path.copy()
            next_path.append(k)
            if check_add_anchor(v, next_path, res):
                find_anchor_alias_paths(v, next_path, res)
    elif isinstance(data, list):
        for idx, elem in enumerate(data):
            next_path = path.copy()
            next_path.append(idx)
            if check_add_anchor(elem, next_path, res):
                find_anchor_alias_paths(elem, next_path, res)
    return res

yaml = ruamel.yaml.YAML()
data = yaml.load(yaml_str)
anchor_alias_paths = find_anchor_alias_paths(data)
for anchor, paths in anchor_alias_paths.items():
    print(f'anchor: "{anchor}", anchor_path: {paths[0]}, alias_path(s): {paths[1:]}')
print('value for last anchor/alias found', data.mlget(paths[-1], list_ok=True))
print('value for last anchor/alias found', data.mlget(paths[-1], list_ok=True))
which gives:
anchor: "some_anchor", anchor_path: ['foo'], alias_path(s): [['baz', 'xyz', 0]]
anchor: "anchored_num", anchor_path: ['foo', 1, 'klm'], alias_path(s): [['oof', 3, 'magic', 0]]
value for last anchor/alias found 42
You can then test the paths you are interested in against the values returned by find_anchor_alias_paths, or the key against the final elements of such paths.
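For example, with the paths printed by the run above, a couple of such tests (a sketch reusing anchor_alias_paths):

# was the value at key 'foo' the anchored node for &some_anchor?
print(anchor_alias_paths['some_anchor'][0] == ['foo'])             # True
# was the node at baz -> xyz -> 0 an alias of it?
print(['baz', 'xyz', 0] in anchor_alias_paths['some_anchor'][1:])  # True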
This question already has answers here:
What are type hints in Python 3.5? (5 answers)
What does -> mean in Python function definitions? (11 answers)
Closed 2 years ago.
I am using Python 3.7 and I have just started my own open-source project. Some time ago a very skilled software developer decided to help, but then he didn't have enough time to continue, so I am taking his work back to develop new features for the project. He designed a script to manage the reading of text from PDF and DOCX files, and he developed it very well, but there is something I don't understand:
@classmethod
def extract_document_data(cls, file_path : str) -> DocumentData:
    """
    Entry point of the module, it extracts the data from the document
    whose path is passed as input.
    The extraction strategy is automatically chosen based on the MIME type
    of the file.

    @type file_path: str
    @param file_path: The path of the document to be parsed.
    @rtype: DocumentData
    @returns: An object containing the data of the parsed document.
    """
    mime = magic.Magic(mime=True)
    mime_type = mime.from_file(file_path)
    document_type = DocumentType.get_instance(mime_type)
    strategy = cls.strategies[document_type]
    return strategy.extract_document_data(file_path)
This: -> DocumentData is very obscure to me; if it were a lambda, it should be included in the method's arguments as a callback, shouldn't it? What meaning does it have in this position?
I can paste the whole class if you need a more verbose insight:
from enum import Enum
import json
import magic
import docx
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextContainer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

class DocumentType(Enum):
    """
    Defines the handled document types.
    Each value is associated to a MIME type.
    """
    def __init__(self, mime_type):
        self.mime_type = mime_type

    @classmethod
    def get_instance(cls, mime_type : str):
        values = [e for e in cls]
        for value in values:
            if value.mime_type == mime_type:
                return value
        raise MimeNotValidError(mime_type)

    PDF = 'application/pdf'
    DOCX = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

class MimeNotValidError(Exception):
    """
    Exception to be raised when a not valid MIME type is processed.
    """
    pass

class DocumentData:
    """
    Wrapper for the extracted document data (TOC and contents).
    """
    def __init__(self, toc : list = [], pages : list = [], document_text : str = None):
        self.toc = toc
        self.pages = pages
        if document_text is not None:
            self.document_text = document_text
        else:
            self.document_text = ' '.join([page.replace('\n', ' ') for page in pages])

    def toc_as_json(self) -> str:
        return json.dumps(self.toc)

class ExtractionStrategy:
    """
    Base class for the extraction strategies.
    """
    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        pass

class DOCXExtractionStrategy(ExtractionStrategy):
    """
    It implements the TOC and contents extraction from a DOCX document.
    """
    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        document = docx.Document(file_path)
        body_elements = document._body._body
        # Selecting only the <w:t> elements from DOCX XML,
        # as they're the only ones to contain some text.
        text_elems = body_elements.xpath('.//w:t')
        return DocumentData(document_text = ' '.join([elem.text for elem in text_elems]))

class PDFExtractionStrategy(ExtractionStrategy):
    """
    It implements the TOC and contents extraction from a PDF document.
    """
    @staticmethod
    def parse_toc(doc : PDFDocument) -> list:
        raw_toc = []
        try:
            outlines = doc.get_outlines()
            for (level, title, dest, a, se) in outlines:
                raw_toc.append((level, title))
        except PDFNoOutlines:
            pass
        return PDFExtractionStrategy.build_toc_tree(raw_toc)

    @staticmethod
    def build_toc_tree(items : list) -> list:
        """
        Builds the TOC tree from a list of TOC items.

        @type items: list
        @param items: The TOC items.
                      Each item must have the following format: (<item depth>, <item description>).
                      E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
        @rtype: list
        @returns: The TOC tree. The tree hasn't a root element, therefore it
                  actually is a list.
        """
        toc = []
        if items is None or len(items) == 0:
            return toc
        current_toc_level = toc
        # Using an explicit stack containing the lists corresponding to
        # the various levels of the TOC, to simulate the recursive building
        # of the TOC tree in a more efficient way
        toc_levels_stack = []
        toc_levels_stack.append(current_toc_level)
        # Each TOC item can be inserted into the current TOC level as
        # string (just the item description) or as dict, where the key is
        # the item description and the value is a list containing the
        # children TOC items.
        # To correctly determine how to insert the current item into
        # the current level, a kind of look-ahead is needed, that is
        # the depth of the next item has to be considered.
        # Initializing the variables related to the previous item.
        prev_item_depth, prev_item_desc = items[0]
        # Adding a fake final item in order to handle all the TOC items
        # inside the cycle.
        items.append((-1, ''))
        for i in range(1, len(items)):
            # In fact each iteration handles the item of the previous
            # one, using the current item to determine how to insert
            # the previous item into the current TOC level,
            # as explained before.
            curr_item = items[i]
            curr_item_depth = curr_item[0]
            if curr_item_depth == prev_item_depth:
                # The depth of the current item is the same
                # as the previous one.
                # Inserting the previous item into the current TOC level
                # as string.
                current_toc_level.append(prev_item_desc)
            elif curr_item_depth == prev_item_depth + 1:
                # The depth of the current item is increased by 1 compared to
                # the previous one.
                # Inserting the previous item into the current TOC level
                # as dict.
                prev_item_dict = { prev_item_desc : [] }
                current_toc_level.append(prev_item_dict)
                # Updating the current TOC level with the newly created one
                # which contains the children of the previous item.
                current_toc_level = prev_item_dict[prev_item_desc]
                toc_levels_stack.append(current_toc_level)
            elif curr_item_depth < prev_item_depth:
                # The depth of the current item is lesser than
                # the previous one.
                # Inserting the previous item into the current TOC level
                # as string.
                current_toc_level.append(prev_item_desc)
                if i < len(items)-1:
                    # Executing these steps for all the items except the last one
                    depth_diff = prev_item_depth - curr_item_depth
                    # Removing from the stack as many TOC levels as the difference
                    # between the depth of the previous item and the depth of the
                    # current one.
                    for i in range(0, depth_diff):
                        toc_levels_stack.pop()
                    # Updating the current TOC level with the one contained in
                    # the head of the stack.
                    current_toc_level = toc_levels_stack[-1]
            # Updating the previous item with the current one
            prev_item_depth, prev_item_desc = curr_item
        return toc
    @staticmethod
    def from_bytestring(s) -> str:
        """
        If the input string is a byte-string, converts it to a string using
        UTF-8 as encoding.

        @param s: A string or a byte-string.
        @rtype: str
        @returns: The potentially converted string.
        """
        if s:
            if isinstance(s, str):
                return s
            else:
                # a byte-string has to be decoded to obtain a str
                return s.decode('utf-8')
    @staticmethod
    def parse_layout_nodes(container : LTContainer) -> str:
        """
        Recursively extracts the text from all the nodes contained in the
        input PDF layout tree/sub-tree.

        @type container: LTContainer
        @param container: The PDF layout tree/sub-tree from which to extract the text.
        @rtype: str
        @returns: A string containing the extracted text.
        """
        text_content = []
        # The iterator returns the children nodes.
        for node in container:
            if isinstance(node, LTTextContainer):
                # Only nodes of type LTTextContainer contain text.
                text_content.append(PDFExtractionStrategy.from_bytestring(node.get_text()))
            elif isinstance(node, LTContainer):
                # Recursively calling the method on the current node, which is a container itself.
                text_content.append(PDFExtractionStrategy.parse_layout_nodes(node))
            else:
                # Ignoring all the other node types.
                pass
        # Joining all the extracted text chunks with a new line character.
        return "\n".join(text_content)

    @staticmethod
    def parse_pages(doc : PDFDocument) -> list:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_content = []
        for i, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            layout = device.get_result()
            # Extracts the text from all the nodes of the PDF layout tree of each page
            text_content.append(PDFExtractionStrategy.parse_layout_nodes(layout))
        return text_content

    @staticmethod
    def parse_pdf(file_path : str) -> (list, list):
        toc = []
        pages = []
        try:
            fp = open(file_path, 'rb')
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            parser.set_document(doc)
            if doc.is_extractable:
                toc = PDFExtractionStrategy.parse_toc(doc)
                pages = PDFExtractionStrategy.parse_pages(doc)
            fp.close()
        except IOError:
            pass
        return (toc, pages)

    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
        return DocumentData(toc, pages = pages)

class DocumentDataExtractor:
    """
    Main class of the module.
    It's responsible for actually executing the text extraction.
    The output is constituted by the following items:
    -table of contents (TOC);
    -pages contents.
    """
    # Dictionary containing the extraction strategies for the different
    # document types, indexed by the corresponding DocumentType enum values.
    strategies = {
        DocumentType.DOCX : DOCXExtractionStrategy(),
        DocumentType.PDF : PDFExtractionStrategy()
    }

    @classmethod
    def extract_document_data(cls, file_path : str) -> DocumentData:
        """
        Entry point of the module, it extracts the data from the document
        whose path is passed as input.
        The extraction strategy is automatically chosen based on the MIME type
        of the file.

        @type file_path: str
        @param file_path: The path of the document to be parsed.
        @rtype: DocumentData
        @returns: An object containing the data of the parsed document.
        """
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(file_path)
        document_type = DocumentType.get_instance(mime_type)
        strategy = cls.strategies[document_type]
        return strategy.extract_document_data(file_path)
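For reference, as the linked duplicates explain: -> DocumentData is not a lambda, it is a return type annotation (PEP 3107 / PEP 484), i.e. metadata saying the method is expected to return a DocumentData instance. A minimal illustration:

def add(a: int, b: int) -> int:
    # "-> int" only annotates the return type; nothing is enforced at runtime
    return a + b

print(add(1, 2))            # 3
print(add.__annotations__)  # {'a': <class 'int'>, 'b': <class 'int'>, 'return': <class 'int'>}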
I'm trying to merge two XML files in Python with the following code, which I found in another thread: Merge xml files with nested elements without external libraries
import sys
from xml.etree import ElementTree as et
class hashabledict(dict):
    def __hash__(self):
        return hash(tuple(sorted(self.items())))

class XMLCombiner(object):
    def __init__(self, filenames):
        assert len(filenames) > 0, 'No filenames!'
        # save all the roots, in order, to be processed later
        self.roots = [et.parse(f).getroot() for f in filenames]

    def combine(self):
        for r in self.roots[1:]:
            # combine each element with the first one, and update that
            self.combine_element(self.roots[0], r)
        # return the string representation
        return et.ElementTree(self.roots[0])

    def combine_element(self, one, other):
        """
        This function recursively updates either the text or the children
        of an element if another element is found in `one`, or adds it
        from `other` if not found.
        """
        # Create a mapping from tag name to element, as that's what we are filtering with
        mapping = {(el.tag, hashabledict(el.attrib)): el for el in one}
        for el in other:
            if len(el) == 0:
                # Not nested
                try:
                    # Update the text
                    mapping[(el.tag, hashabledict(el.attrib))].text = el.text
                except KeyError:
                    # An element with this name is not in the mapping
                    mapping[(el.tag, hashabledict(el.attrib))] = el
                    # Add it
                    one.append(el)
            else:
                try:
                    # Recursively process the element, and update it in the same way
                    self.combine_element(mapping[(el.tag, hashabledict(el.attrib))], el)
                except KeyError:
                    # Not in the mapping
                    mapping[(el.tag, hashabledict(el.attrib))] = el
                    # Just add it
                    one.append(el)

if __name__ == '__main__':
    r = XMLCombiner(sys.argv[1:-1]).combine()
    print('-' * 20)
    print(et.tostring(r.getroot()))
    r.write(sys.argv[-1], encoding="iso-8859-1", xml_declaration=True)
The code works perfectly for merging two XML files; however, I would also like to merge the comments I have in the files. I'm new at this and don't know how to merge not just the XML but also the comments in the files.
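One possible direction, as a minimal sketch assuming Python 3.8+: parse with a TreeBuilder that keeps comments, so the merge logic above sees them as ordinary nodes (their tag is the et.Comment function):

from xml.etree import ElementTree as et

# insert_comments=True keeps <!-- comments --> in the parsed tree (Python 3.8+);
# they become elements whose .tag is et.Comment and whose .text is the comment text.
parser = et.XMLParser(target=et.TreeBuilder(insert_comments=True))
root = et.parse('a.xml', parser=parser).getroot()  # 'a.xml' is a placeholder filename
for node in root.iter():
    if node.tag is et.Comment:
        print('comment:', node.text)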
I am trying to load YAML with user-defined tags in my Python code, with PyYAML. I don't have much experience with PyYAML loaders, constructors, representers, parsers, resolvers and dumpers.
Below is the code I could come up with:
import yaml, os
from collections import OrderedDict

root = os.path.curdir

def construct_position_object(loader, suffix, node):
    return loader.construct_yaml_map(node)

def construct_position_sym(loader, node):
    return loader.construct_yaml_str(node)

yaml.add_multi_constructor(u"!Position", construct_position_object)
yaml.add_constructor(u"!Position", construct_position_sym)

def main():
    file = open('C:\calcWorkspace\\13.3.1.0\PythonTest\YamlInput\Exception_V5.yml', 'r')
    datafile = yaml.load_all(file)
    for data in datafile:
        yaml.add_representer(literal, literal_presenter)
        yaml.add_representer(OrderedDict, ordered_dict_presenter)
        d = OrderedDict(l=literal(data))
        print yaml.dump(data, default_flow_style=False)
        print datafile.get('abcd').get('addresses')

yaml.add_constructor('!include', include)

def include(loader, node):
    """Include another YAML file."""
    global root
    old_root = root
    filename = os.path.join(root, loader.construct_scalar(node))
    root = os.path.split(filename)[0]
    data = yaml.load(open(filename, 'r'))
    root = old_root
    return data

class literal(str): pass

def literal_presenter(dumper, data):
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')

def ordered_dict_presenter(dumper, data):
    return dumper.represent_dict(data.items())

if __name__ == '__main__':
    main()
This is my YAML file:
#sid: Position[SIK,sourceDealID,UTPI]
sid: Position[1232546, 0634.10056718.0.1096840.0,]
ASSET_CLASS: "Derivative"
SOURCE_DEAL_ID: "0634.10056718.0.1096840.0"
INSTR_ID: "UKCM.L"
PRODUCT_TYPE_ID: 0
SOURCE_PRODUCT_TYPE: "CDS"
NOTIONAL_USD: 14.78
NOTIONAL_CCY:
LOB:
PRODUCT_TYPE:
#GIM
UNDERLIER_INSTRUMENT_ID:
MTM_USD:
MTM_CCY:
TRADER_SID:
SALES_PERSON_SID:
CLIENT_SPN:
CLIENT_UCN:
CLIENT_NAME:
LE:
---
sid: Position[1258642, 0634.10056718.0.1096680.0,]
#sid: Position[1]
ASSET_CLASS: "Derivative"
SOURCE_DEAL_ID: "0634.10056718.0.1096840.0"
INSTR_ID: "UKCM.L"
PRODUCT_TYPE_ID: 0
SOURCE_PRODUCT_TYPE: "CDS"
NOTIONAL_AMT: 18.78
NOTIONAL_CCY: "USD"
LOB:
PRODUCT_TYPE:
UNDERLIER_INSTRUMENT_ID:
MTM_AMT:
MTM_CCY:
TRADER_SID:
SALES_PERSON_SID:
CLIENT_SPN:
CLIENT_UCN:
CLIENT_NAME:
LE:
---
# Excption documents to follow from here!!!
Exception:
  src_excp_id: 100001
  # CONFIGURABLE OBJECT, VALUE TO BE POPULATED RUNTIME (impact_obj COMES FROM CONFIG FILE)
  # VALUE STARTS FROM "!POSITION..." A USER DEFINED DATATYPE
  impact_obj: !Position [1232546, 0634.10056718.0.1096840.0,]
  # CONFIGURABLE OBJECT, VALUE TO BE POPULATED RUNTIME (rsn_obj COMES FROM CONFIG FILE)
  # VALUE STARTS FROM "_POSITION..." AN IDENTIFIER FOR CONFIGURABLE OBJECTS
  rsn_obj: !Position [1258642, 0634.10056718.0.1096680.0,]
  exception_txt: "Invalid data, NULL value provided"
  severity: "High"
It looks like my code is unable to identify the !Position user-defined data type.
Any help would be appreciated.
Regards.
Needed to change:

def construct_position_sym(loader, node):
    return loader.construct_yaml_str(node)

to:

def construct_position_sym(loader, node):
    return loader.construct_yaml_seq(node)

because the !Position object was a sequence:

!Position [something, something]

So the constructor had to be a sequence type. Works perfectly!
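For reference, a minimal self-contained sketch of the same idea with a constructor that builds an object (the Position class here is hypothetical; the fix above returns the plain sequence instead):

import yaml

class Position:
    def __init__(self, parts):
        self.parts = parts

    def __repr__(self):
        return 'Position(%r)' % (self.parts,)

def construct_position(loader, node):
    # !Position [a, b, ...] is parsed as a sequence node,
    # so it must be built with construct_sequence, not construct_yaml_str
    return Position(loader.construct_sequence(node))

yaml.add_constructor('!Position', construct_position, Loader=yaml.SafeLoader)

doc = yaml.safe_load('impact_obj: !Position [1232546, 0634.10056718.0.1096840.0,]')
print(doc['impact_obj'])  # Position([1232546, '0634.10056718.0.1096840.0'])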
I'm using the following code as a portion of a larger program that does some error checking on a Digital Cinema Package and tries to check the validity of the XML file that lists the assets in the DCP. Anyway, this is all still very much in its infancy and I'm hoping to learn more Python as a result of it.
import xml.etree.ElementTree as etree
import sys

class Parser(object):
    def __init__(self, file_name):
        self.file_name = file_name

    def display(self, rename_this_list):
        tree = etree.parse(self.file_name)
        for node in tree.getiterator():
            for element in rename_this_list:
                if element in node.tag:
                    uuid = (node.text)
                    #uuid = [s.strip('urn:') for s in uuid]
                    print(uuid)

fname = sys.argv[1]
key_search_words = ['KeyId']
instance = Parser(fname)
instance.display(key_search_words)
When I try to store the output so that each line is a list, it doesn't format the way I would expect. Minus the urn:, I'd like to store each line, with uuid: and the info that follows it, as an element of a list.
urn:uuid:9851b0f6-4790-0d4c-a69d-ea8abdedd03d
urn:uuid:8317e8f3-1597-494d-9ed8-08a751ff8615
urn:uuid:5d9b228d-7120-344c-aefc-840cdd32bbfc
urn:uuid:1e32ccb2-ab0b-9d43-b879-1c12840c178b
urn:uuid:44d04416-676a-2e4f-8995-165de8cab78d
urn:uuid:906da0c1-b0cb-4541-b8a9-86476583cdc4
urn:uuid:0fe2d73a-ebe3-9844-b3de-4517c63c4b90
urn:uuid:862fa79a-18c7-9245-a172-486541bef0c0
urn:uuid:aa2f1a88-7a55-894d-bc19-42afca589766
urn:uuid:59d6eeff-cd56-6245-9f13-951554466626
urn:uuid:14a13b1a-76ba-764c-97d0-9900f58af53e
urn:uuid:ccdbe0ae-1c3f-224c-b450-947f43bbd640
urn:uuid:dcd37f10-b042-8e44-bef0-89bda2174842
urn:uuid:9dd7103e-7e5a-a840-a15f-f7d7fe699203
If you need a list, then you can try this.
def display(self, rename_this_list):
    listOfNodes = []
    tree = etree.parse(self.file_name)
    for node in tree.getiterator():
        for element in rename_this_list:
            if element in node.tag:
                # append text of element to the list
                # without first four characters which are "urn:"
                listOfNodes.append(node.text[4:])
    print(str(listOfNodes))
    return listOfNodes
Remember that the keys of a dictionary have to be unique; you can't have two items with the key "uuid". If you want a dictionary, then you can only have one key "uuid", with a list of all those numbers as its value.
import collections

class Parser(object):
    def __init__(self, file_name):
        self.file_name = file_name
        self.res = collections.defaultdict(list)

    def display(self, rename_this_list):
        tree = etree.parse(self.file_name)
        for node in tree.getiterator():
            for element in rename_this_list:
                if element in node.tag:
                    uuid = node.text
                    key, value = uuid[4:].split(':')
                    self.res[key].append(value)
Does this satisfy your need? I don't know the details of your data, so if anything is wrong please tell me. I think the result should look like this:
{'uuid': ['9851b0f6-4790-0d4c-a69d-ea8abdedd03d', 'ccdbe0ae-1c3f-224c-b450-947f43bbd640', ...]}
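A hypothetical driver, mirroring the question's command-line usage (assuming the Parser class above plus the etree import from the question's snippet):

import sys

fname = sys.argv[1]
parser = Parser(fname)
parser.display(['KeyId'])
print(parser.res)  # e.g. defaultdict(<class 'list'>, {'uuid': ['9851b0f6-...', ...]})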