Difficulty creating lxml Element subclass - python

I’m trying to create a subclass of the Element class. I’m having trouble getting started though.
from lxml import etree
try:
import docx
except ImportError:
from docx import docx
class File(etree.ElementBase):
    """Custom lxml element that appends a ``body`` child when initialised."""

    def _init(self):
        """Run the base-class ``_init`` hook, then add a ``body`` element."""
        etree.ElementBase._init(self)
        # NOTE(review): self.body receives the return value of append(), not
        # the new element itself - presumably None; confirm against lxml.
        self.body = self.append(docx.makeelement('body'))
f = File()
relationships = docx.relationshiplist()
title = 'File'
subject = 'A very special File'
creator = 'Me'
keywords = ['python', 'Office Open XML', 'Word']
coreprops = docx.coreproperties(title=title, subject=subject, creator=creator,
keywords=keywords)
appprops = docx.appproperties()
contenttypes = docx.contenttypes()
websettings = docx.websettings()
wordrelationships = docx.wordrelationships(relationships)
docx.savedocx(f, coreprops, appprops, contenttypes, websettings,
wordrelationships, 'file.docx')
When I try to open the document that is outputted from this code, my version of Word (2003 with compatibility pack) gives me the following error: “This file was created by a previous beta version of Word 2007 and cannot be opened in this version.” When I replace the File object with a different Element created with docx.newdocument(), the document comes out fine. Any ideas/advice?

I don't really get why you want to use a separate class named File.
As Michael0x2a said, you didn't include a document tag, so it won't work (I don't think Word 2007 can read your file either).
But here is the corrected code:
from lxml import etree
try:
import docx
except ImportError:
from docx import docx
class File(object):
    """Holder for a minimal Word document element tree.

    After construction ``self.document`` is a ``document`` element (in the
    ``w`` namespace) containing a single empty ``body`` child. Pass
    ``self.document`` to ``docx.savedocx``.
    """

    @staticmethod
    def makeelement(tagname, tagtext=None, nsprefix='w', attributes=None,
                    attrnsprefix=None):
        """Create an element & return it.

        Declared static so that ``self.makeelement('body')`` passes ``'body'``
        as ``tagname`` instead of binding the instance to it.

        Args:
            tagname: local tag name of the new element.
            tagtext: optional text content for the element.
            nsprefix: namespace prefix for the tag, a list of prefixes (the
                first one is used for the tag, all of them go into the
                element's nsmap), or a falsy value for no namespace.
            attributes: optional mapping of attribute name to value.
            attrnsprefix: namespace prefix for the attributes; when omitted,
                attributes of ``w`` elements reuse the tag namespace and all
                others get no namespace.

        Returns:
            The newly created ``etree`` element.

        Raises:
            KeyError: if a prefix is not present in ``docx.nsprefixes``.
        """
        # The prefix -> namespace URI table is defined by the docx module.
        nsprefixes = docx.nsprefixes
        # Deal with list of nsprefix by making namespacemap
        namespacemap = None
        if isinstance(nsprefix, list):
            namespacemap = dict(
                (prefix, nsprefixes[prefix]) for prefix in nsprefix)
            # FIXME: rest of code below expects a single prefix
            nsprefix = nsprefix[0]
        if nsprefix:
            namespace = '{' + nsprefixes[nsprefix] + '}'
        else:
            # For when namespace = None
            namespace = ''
        newelement = etree.Element(namespace + tagname, nsmap=namespacemap)
        # Add attributes with namespaces
        if attributes:
            if attrnsprefix:
                attributenamespace = '{' + nsprefixes[attrnsprefix] + '}'
            elif nsprefix == 'w':
                # Quick hack: it seems every element that has a 'w' nsprefix
                # for its tag uses the same prefix for its attributes
                attributenamespace = namespace
            else:
                # No attribute namespace given: use an empty string
                # (equivalent of no namespace)
                attributenamespace = ''
            for tagattribute in attributes:
                newelement.set(attributenamespace + tagattribute,
                               attributes[tagattribute])
        if tagtext:
            newelement.text = tagtext
        return newelement

    def __init__(self):
        """Build the ``document`` root and give it an empty ``body`` child."""
        super(File, self).__init__()
        self.document = self.makeelement('document')
        self.document.append(self.makeelement('body'))
f = File()
relationships = docx.relationshiplist()
title = 'File'
subject = 'A very special File'
creator = 'Me'
keywords = ['python', 'Office Open XML', 'Word']
coreprops = docx.coreproperties(title=title, subject=subject, creator=creator,
keywords=keywords)
appprops = docx.appproperties()
contenttypes = docx.contenttypes()
websettings = docx.websettings()
wordrelationships = docx.wordrelationships(relationships)
docx.savedocx(f.document, coreprops, appprops, contenttypes, websettings,
wordrelationships, 'file.docx')

Related

How to get readable unicode string from single bibtex entry field in python script

Suppose you have a .bib file containing bibtex-formatted entries. I want to extract the "title" field from an entry, and then format it to a readable unicode string.
For example, if the entry was:
@article{mypaper,
author = {myself},
title = {A very nice {title} with annoying {symbols} like {\^{a}}}
}
what I want to extract is the string:
A very nice title with annoying symbols like â
I am currently trying to use the pybtex package, but I cannot figure out how to do it. The command-line utility pybtex-format does a good job in converting full .bib files, but I need to do this inside a script and for single title entries.
Figured it out:
def load_bib(filename):
    """Parse the BibTeX file at *filename* and return the parsed database."""
    from pybtex.database.input.bibtex import Parser
    return Parser().parse_file(filename)
def get_title(entry):
    """Return the ``title`` field of *entry* rendered with the plaintext backend.

    The title is formatted with the ``plain`` formatting style before being
    rendered.
    """
    from pybtex.plugin import find_plugin
    plain_style = find_plugin('pybtex.style.formatting', 'plain')()
    text_backend = find_plugin('pybtex.backends', 'plaintext')()
    template = plain_style.format_title(entry, 'title')
    context = {'entry': entry, 'style': plain_style, 'bib_data': None}
    rich_text = template.f(template.children, context)
    return rich_text.render(text_backend)
DB = load_bib("bibliography.bib")
print ( get_title(DB.entries["entry_label"]) )
where entry_label must match the label you use in latex to cite the bibliography entry.
Building upon the answer by Daniele, I wrote this function that lets one render fields without having to use a file.
from io import StringIO
from pybtex.database.input.bibtex import Parser
from pybtex.plugin import find_plugin
def render_fields(author="", title=""):
    r"""Render BibTeX-formatted author and title strings as plain text.

    The arguments are in bibtex format. For example, they may contain
    things like \'{i}. If you run tests by defining a string in Python,
    use r'''string''' to avoid issues with escape characters.

    Args:
        author: value for the BibTeX ``Author`` field.
        title: value for the BibTeX ``Title`` field.

    Returns:
        A dictionary with the keys ``'title'`` and ``'author'`` holding the
        fields rendered in plain text, without a trailing period.
    """
    # A BibTeX entry has to start with "@"; the field values are spliced in
    # verbatim between the braces.
    istr = (
        "\n@article{foo,\n"
        "Author = {" + author + "},\n"
        "Title = {" + title + "},\n"
        "}\n"
    )
    bib_data = Parser().parse_stream(StringIO(istr))
    style = find_plugin('pybtex.style.formatting', 'plain')()
    backend = find_plugin('pybtex.backends', 'plaintext')()
    entry = bib_data.entries["foo"]
    data = {'entry': entry, 'style': style, 'bib_data': None}

    def _render(sentence):
        """Render one template and drop a single trailing period, if any."""
        text = sentence.f(sentence.children, data).render(backend)
        # Only strip an actual period so text ending in another character
        # (e.g. "?") keeps its last character.
        return text[:-1] if text.endswith('.') else text

    rendered_author = _render(style.format_author_or_editor(entry))
    rendered_title = _render(style.format_title(entry, 'title'))
    return {'title': rendered_title, 'author': rendered_author}

Python Class returning an empty dictionary

Newbie needing some help making code object-oriented.
I am trying to write a class with different methods for processing XML files. One of these methods has a goal of returning a dictionary with the embedded attachment's filename and the encoded datastring as key and value respectively.
I have managed to get this to work outside of the class:
import xml.etree.ElementTree as ET
tree = ET.parse('invoice.xml')
root = tree.getroot()
namespace = {
'cac': 'urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2',
'cbc': 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2',
'ext': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
'ccts': 'urn:un:unece:uncefact:documentation:2',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
}
attachments = {}
for document in root.findall('cac:AdditionalDocumentReference', namespace):
filename = document.find('cbc:ID', namespace).text
print(filename)
# Find the embedded file
for child in document.findall('cac:Attachment', namespace):
attachment = child.find('cbc:EmbeddedDocumentBinaryObject', namespace).text
attachments[filename] = attachment
But I have been unable to translate this into a class method, as the class method returns an empty dictionary. The code I am working on:
import xml.etree.ElementTree as ET
class Invoice:
    """
    Common tasks in relation to EHF invoices.
    """
    # Prefix -> namespace URI mapping handed to the find()/findall() calls.
    namespace = {
        'cac': 'urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2',
        'cbc': 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2',
        'ext': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
        'ccts': 'urn:un:unece:uncefact:documentation:2',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }
    # Defined on the class, so this one dict object is shared by all instances.
    attachments = {}

    def __init__(self, invoice):
        """Initialize invoice attributes."""
        self.invoice = invoice
        # Dictionary for namespace used in EHF invoices
        self.namespace = self.namespace

    def encoded_attachments(self):
        """
        Return the embedded attachments from the EHF invoice in encoded form
        as a dictionary.
        Keys = filenames
        Value = base64 encoded files
        """
        # NOTE(review): the path below is spelled 'AdditonalDocumentReference',
        # while the sample XML uses 'AdditionalDocumentReference'.
        for document in self.invoice.findall('cac:AdditonalDocumentReference', self.namespace):
            # Find filename
            filename = document.find('cbc:ID', self.namespace).text
            # Find the embedded file
            # NOTE(review): bare `namespace` is not a name defined in this
            # method; every other lookup here uses self.namespace.
            for child in document.findall('cac:Attachment', namespace):
                attachment = child.find('cbc:EmbeddedDocumentBinaryObject', self.namespace).text
                # Add filename and attachment to dictionary
                self.attachments[filename] = attachment
        return(self.attachments)
tree = ET.parse('invoice.xml')
root = tree.getroot()
ehf = Invoice(root)
attach_dict = ehf.encoded_attachments()
print(attach_dict)
I think there is something important I am missing about classes. Any help is appreciated.
Edit:
Part of the xml file. Encoded data replaced with a dummy text string.
<?xml version="1.0" encoding="UTF-8"?>
<Invoice xmlns="urn:oasis:names:specification:ubl:schema:xsd:Invoice-2"
xmlns:cac="urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2"
xmlns:cbc="urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2"
xmlns:ext="urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2"
xmlns:ccts="urn:un:unece:uncefact:documentation:2"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<cbc:CustomizationID>urn:cen.eu:en16931:2017#compliant#urn:fdc:peppol.eu:2017:poacc:billing:3.0</cbc:CustomizationID>
<cbc:ProfileID>urn:fdc:peppol.eu:2017:poacc:billing:01:1.0</cbc:ProfileID>
<cbc:ID>1060649</cbc:ID>
<cbc:IssueDate>2020-01-23</cbc:IssueDate>
<cbc:DueDate>2020-02-07</cbc:DueDate>
<cbc:InvoiceTypeCode>380</cbc:InvoiceTypeCode>
<cbc:TaxPointDate>2020-01-23</cbc:TaxPointDate>
<cbc:DocumentCurrencyCode>NOK</cbc:DocumentCurrencyCode>
<cbc:BuyerReference>N/A</cbc:BuyerReference>
<cac:AdditionalDocumentReference>
<cbc:ID>invoice_attachment_filename.pdf</cbc:ID>
<cbc:DocumentTypeCode>130</cbc:DocumentTypeCode>
<cbc:DocumentDescription>CommercialInvoice</cbc:DocumentDescription>
<cac:Attachment>
<cbc:EmbeddedDocumentBinaryObject mimeCode="application/pdf" filename="1060649.pdf">BASE64ENCODEDTEXT</cbc:EmbeddedDocumentBinaryObject>
</cac:Attachment>
</cac:AdditionalDocumentReference>
</Invoice>
Usage of self is not consistent
for child in document.findall('cac:Attachment', **namespace**):
attachment = child.find('cbc:EmbeddedDocumentBinaryObject', **self.namespace**).text
And the answer is (drum roll...) Everything is correct, but compare old and new code here:
old: for document in root.findall('cac:AdditionalDocumentReference', namespace)
new: for document in self.invoice.findall('cac:AdditonalDocumentReference', self.namespace)
^
By the way, you can leave out the line self.namespace = self.namespace.
You are making two mistakes here.
One is that you are using class variables (read up here: https://docs.python.org/3/tutorial/classes.html)
The second one is what gokaai said here.
This should work:
import xml.etree.ElementTree as ET
class Invoice:
    """
    Common tasks in relation to EHF invoices.
    """

    def __init__(self, invoice):
        """Initialize invoice attributes.

        Args:
            invoice: root element of the parsed invoice document.
        """
        self.invoice = invoice
        # Namespace prefixes used in EHF invoices; the URIs must match the
        # xmlns declarations of the invoice exactly or nothing is found.
        self.namespace = {
            'cac': 'urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2',
            'cbc': 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2',
            'ext': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
            'ccts': 'urn:un:unece:uncefact:documentation:2',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
        }
        # Per-instance result cache, filled by encoded_attachments().
        self.attachments = {}

    def encoded_attachments(self):
        """
        Return the embedded attachments from the EHF invoice in encoded form
        as a dictionary.
        Keys = filenames
        Value = base64 encoded files

        Document references without a cbc:ID, and attachments without an
        embedded binary object, are skipped.
        """
        for document in self.invoice.findall('cac:AdditionalDocumentReference', self.namespace):
            # Find filename
            id_node = document.find('cbc:ID', self.namespace)
            if id_node is None:
                continue
            filename = id_node.text
            # Find the embedded file
            for child in document.findall('cac:Attachment', self.namespace):
                binary = child.find('cbc:EmbeddedDocumentBinaryObject', self.namespace)
                if binary is None:
                    continue
                # Add filename and attachment to dictionary
                self.attachments[filename] = binary.text
        return self.attachments

Pyuno indexing issue that I would like an explanation for

The following python libreoffice Uno macro works but only with the try..except statement.
The macro allows you to select text in a writer document and send it to a search engine in your default browser.
The issue is that if you select a single piece of text, oSelected.getByIndex(0) is populated, but if you select multiple pieces of text, oSelected.getByIndex(0) is not populated. In this case the data starts at oSelected.getByIndex(1) and oSelected.getByIndex(0) is left blank.
I have no idea why this should be and would love to know if anyone can explain this strange behaviour.
#!/usr/bin/python
import os
import webbrowser
from configobj import ConfigObj
from com.sun.star.awt.MessageBoxButtons import BUTTONS_OK, BUTTONS_OK_CANCEL, BUTTONS_YES_NO, BUTTONS_YES_NO_CANCEL, BUTTONS_RETRY_CANCEL, BUTTONS_ABORT_IGNORE_RETRY
from com.sun.star.awt.MessageBoxButtons import DEFAULT_BUTTON_OK, DEFAULT_BUTTON_CANCEL, DEFAULT_BUTTON_RETRY, DEFAULT_BUTTON_YES, DEFAULT_BUTTON_NO, DEFAULT_BUTTON_IGNORE
from com.sun.star.awt.MessageBoxType import MESSAGEBOX, INFOBOX, WARNINGBOX, ERRORBOX, QUERYBOX
def fs3Browser(*args):
    """Open a web search for the text selected in the current document.

    Up to four selection entries are read by index and joined with spaces.
    Punctuation is stripped and the result is appended to a DuckDuckGo or
    Google search URL, which is opened in the default browser. The engine is
    read from ``[control] ENGINE`` in ``~/fs3/fs3.cfg``; DuckDuckGo is used
    when the file is unreadable or the key is missing.

    If the selection object does not offer ``getByIndex`` an info box is
    shown and no search is started.

    Args:
        *args: ignored; accepted so the macro can be invoked with arguments.

    Returns:
        None.
    """
    #get the doc from the scripting context which is made available to all scripts
    desktop = XSCRIPTCONTEXT.getDesktop()
    model = desktop.getCurrentComponent()
    doc = XSCRIPTCONTEXT.getDocument()
    parentwindow = doc.CurrentController.Frame.ContainerWindow
    oSelected = model.getCurrentSelection()
    oText = ""
    try:
        for i in range(0,4,1):
            print ("Index No ", str(i))
            try:
                oSel = oSelected.getByIndex(i)
                print (str(i), oSel.getString())
                oText += oSel.getString()+" "
            except AttributeError:
                # Must reach the outer handler; a catch-all here would hide it
                # and the message box below could never be shown.
                raise
            except Exception:
                # Any other failure ends the scan - presumably an index past
                # the last selection entry.
                break
    except AttributeError:
        mess = "Do not select text from more than one table cell"
        heading = "Processing error"
        MessageBox(parentwindow, mess, heading, INFOBOX, BUTTONS_OK)
        return
    lookup = str(oText)
    special_c =str.maketrans("","",'!|##"$~%&/()=?+*][}{-;:,.<>')
    lookup = lookup.translate(special_c)
    lookup = lookup.strip()
    # expanduser() avoids a KeyError when HOME is not set in the environment.
    config_filename = os.path.join(os.path.expanduser("~"), "fs3", "fs3.cfg")
    # Start from the default so the engine is always bound, then let the
    # configuration file override it when it can be read.
    searchengine = "https://duckduckgo.com"
    if os.access(config_filename, os.R_OK):
        cfg = ConfigObj(config_filename)
        #define search engine from the configuration file
        try:
            searchengine = cfg["control"]["ENGINE"]
        except (KeyError, TypeError):
            # Section or key missing: keep the default engine.
            pass
    if 'duck' in searchengine:
        webbrowser.open_new('https://www.duckduckgo.com//?q='+lookup+'&kj=%23FFD700 &k7=%23C9C4FF &ia=meanings')
    else:
        webbrowser.open_new('https://www.google.com/search?/&q='+lookup)
    return None
def MessageBox(ParentWindow, MsgText, MsgTitle, MsgType, MsgButtons):
    """Create a message box through the awt Toolkit service and execute it.

    The toolkit is obtained from the script's component context; the box is
    created with the given parent window, type, buttons, title and text.
    """
    context = XSCRIPTCONTEXT.getComponentContext()
    service_manager = context.ServiceManager
    toolkit = service_manager.createInstanceWithContext(
        "com.sun.star.awt.Toolkit", context)
    box = toolkit.createMessageBox(
        ParentWindow, MsgType, MsgButtons, MsgTitle, MsgText)
    box.execute()
Your code is missing something. This works without needing an extra try/except clause:
selected_strings = []
try:
for i in range(oSelected.getCount()):
oSel = oSelected.getByIndex(i)
if oSel.getString():
selected_strings.append(oSel.getString())
except AttributeError:
# handle exception...
return
result = " ".join(selected_strings)
To answer your question about the "strange behaviour," it seems pretty straightforward to me. If the 0th element is empty, then there are multiple selections which may need to be handled differently.

python lxml get the name of a node

This is my xml file:
<FuzzyComparison>
<Modules>
<Module>
<name>AutosoukModelMakeFuzzyComparisonModule</name>
<configurationLoader>DefaultLoader</configurationLoader>
<configurationFile>MakesModels.conf</configurationFile>
<settings></settings>
</Module>
<Module>
<name>DefaultFuzzyComparisonModule</name>
<configurationLoader>DefaultLoader</configurationLoader>
<configurationFile>Buildings.conf</configurationFile>
<settings>
<attribute>building</attribute>
</settings>
</Module>
</Modules>
</FuzzyComparison>
This is the code I've been trying to parse it with:
from lxml import etree
class AttributesXMLParser():
    """Reads values and fuzzy-comparison module settings from 'Items.xml'."""

    def __init__(self):
        """Parse 'Items.xml' and keep the resulting tree in self.doc."""
        self.doc=etree.parse('Items.xml')

    def getValueOfTag(self, tagName): # Returns the text of a specific tag; for example, tagName could be "FirstDate"
        return self.doc.find(tagName).text

    def loadFuzzySettings(self):
        """Return a list with one dictionary per Module element.

        Each dictionary holds the keys 'name', 'configurationLoader',
        'moduleConfigurationFile' and 'settings'.
        """
        # Despite the name, this is a list of per-module dictionaries.
        modulesDict = list()
        modules = self.doc.findall('FuzzyComparison/Modules/Module')
        for module in modules:
            moduleDict = dict()
            moduleName = module.find('name').text
            moduleDict['name'] = moduleName
            moduleConfigurationLoader = module.find('configurationLoader').text
            moduleDict['configurationLoader'] = moduleConfigurationLoader
            moduleConfigurationFile = module.find('configurationFile').text
            moduleDict['moduleConfigurationFile'] = moduleConfigurationFile
            settings = module.findall('settings')
            settingsDict = dict()
            # The loop visits the <settings> elements themselves: each key is
            # the element object and each value is that element's own text.
            for oneSetting in settings:
                settingsDict[oneSetting] = oneSetting.text
            moduleDict['settings'] = settingsDict
            modulesDict.append(moduleDict)
        return modulesDict
and this is the results:
[{'moduleConfigurationFile': 'MakesModels.conf', 'configurationLoader': 'Default
Loader', 'name': 'AutosoukModelMakeFuzzyComparisonModule', 'settings': {<Element
settings at 0x25257c8>: None}}, {'moduleConfigurationFile': 'Buildings.conf', '
configurationLoader': 'DefaultLoader', 'name': 'DefaultFuzzyComparisonModule', '
settings': {<Element settings at 0x2525e48>: '\n\t\t\t\t'}}]
My problem
I don't know how to get the name and value of the settings node. As you can see, everything works except the settings; I need the result to look like this:
"attribute": building
But my code gives me:
{<Element settings at 0x2525e48>: '\n\t\t\t\t'}}]
Could you help please to solve that?
Since findall() returns a list, you want to iterate over the contents of elements of that list, rather than the list itself. You also want to use the element's tag as a key, rather than using the element itself.
settingsDict = {}
for settingsNode in module.findall('settings'):
for setting in settingsNode:
settingsDict[setting.tag] = setting.text
Or, if you only have one settings tag,
settingsDict = {}
for setting in module.find('settings'):
settingsDict[setting.tag] = setting.text
Which can be simplified to:
settingsDict = {setting.tag: setting.text
for setting in module.find('settings')}

Python OOP Project Organization

I'm a bit new to Python dev -- I'm creating a larger project for some web scraping. I want to approach this as "Pythonically" as possible, and would appreciate some help with the project structure. Here's how I'm doing it now:
Basically, I have a base class for an object whose purpose is to go to a website and parse some specific data on it into its own array, jobs[]
minion.py
class minion:
    """Base object for one data source whose parsed results go into ``jobs``.

    Examples:
        minCity1 = minion('Some city', 'http://portal.com/somewhere', 'user', 'password')
        minCity2 = minion('Some city', 'http://portal.com/somewhere')
    """

    def __init__(self, title, URL, user='', password=''):
        """Store the source details and start with an empty job list.

        ``reqAuth`` is 0 when both ``user`` and ``password`` are empty
        strings and 1 otherwise.
        """
        self.title = title
        self.URL = URL
        self.user = user
        self.password = password
        self.jobs = []
        no_credentials = (user == '' and password == '')
        self.reqAuth = 0 if no_credentials else 1

    def getJobs(self):
        """Placeholder; a source-specific implementation is bound later."""
        return None

    def displayjobs(self):
        """Call ``display()`` on every collected job, in order."""
        for entry in self.jobs:
            entry.display()
I'm going to have about 100 different data sources. The way I'm doing it now is to just create a separate module for each "Minion", which defines (and binds) a more tailored getJobs() function for that object
Example: minCity1.py
from minion import minion
from BeautifulSoup import BeautifulSoup
import urllib2
from job import job
# MINION CONFIG
minTitle = 'Some city'
minURL = 'http://www.somewebpage.gov/'
# Here we define a function that will be bound to this object's getJobs function
def getJobs(self):
page = urllib2.urlopen(self.URL)
soup = BeautifulSoup(page)
# For each row
for tr in soup.findAll('tr'):
tJob = job()
span = tr.findAll(['span', 'class="content"'])
# If row has 5 spans, pull data from span 2 and 3 ( [1] and [2] )
if len(span) == 5:
tJob.title = span[1].a.renderContents()
tJob.client = 'Some City'
tJob.source = minURL
tJob.due = span[2].div.renderContents().replace('<br />', '')
self.jobs.append(tJob)
# Don't forget to bind the function to the object!
minion.getJobs = getJobs
# Instantiate the object
mCity1 = minion(minTitle, minURL)
I also have a separate module which simply contains a list of all the instantiated minion objects (which I have to update each time I add one):
minions.py
from minion_City1 import mCity1
from minion_City2 import mCity2
from minion_City3 import mCity3
from minion_City4 import mCity4
minionList = [mCity1,
mCity2,
mCity3,
mCity4]
main.py references minionList for all of its activities for manipulating the aggregated data.
This seems a bit chaotic to me, and was hoping someone might be able to outline a more Pythonic approach.
Thank you, and sorry for the long post!
Instead of creating functions and assigning them to objects (or whatever minion is, I'm not really sure), you should definitely use classes instead. Then you'll have one class for each of your data sources.
If you want, you can even have these classes inherit from a common base class, but that isn't absolutely necessary.

Categories