I have the following script -
count = 1
for line in temp:
if (str(count) + '=') in line:
job = re.findall(re.escape('=')+"(.*)",line)[0]
fullsrcurl = self.srcjson + '?format=xml&jobname=' + job
srcfile = urllib2.urlopen(fullsrcurl)
srcdoc = etree.parse(srcfile)
srcdata = etree.tostring(srcdoc, pretty_print=True)
srcjobmst_id = srcdoc.xpath('//jobmst_id/text()')[0]
srcxml = 'c:\\temp\\deployments\\%s\\%s.xml' % (source_env, srcjobmst_id)
srcxmlsave = open(srcxml, 'w')
srcxmlsave.write(srcdata)
srcxmlsave.close
fulldsturl = self.targetjson + '?format=xml&jobname=' + job
dstfile = urllib2.urlopen(fulldsturl)
dstdoc = etree.parse(dstfile)
dstdata = etree.tostring(dstdoc, pretty_print=True)
dstjobmst_id = dstdoc.xpath('//jobmst_id/text()')[0]
dstxml = 'c:\\temp\\deployments\\%s\\%s.xml' % (target_env, dstjobmst_id)
dstxmlsave = open(dstxml, 'w')
dstxmlsave.write(dstdata)
dstxmlsave.close
print "Job = " + job
count += 1
It's hitting 2 separate APIs in 2 environments but the data is almost identical. The source works fine, as soon as it tries to do anything with the destination data I get the followign error -
Traceback (most recent call last):
File "S:\Operations\Tidal\deployment\deployv2.py", line 213, in <module>
main()
File "S:\Operations\Tidal\deployment\deployv2.py", line 209, in main
auto_deploy.deploy()
File "S:\Operations\Tidal\deployment\deployv2.py", line 173, in deploy
dstdoc = etree.parse(dstfile)
File "lxml.etree.pyx", line 3239, in lxml.etree.parse (src\lxml\lxml.etree.c:6
9970)
File "parser.pxi", line 1770, in lxml.etree._parseDocument (src\lxml\lxml.etre
e.c:102272)
File "parser.pxi", line 1790, in lxml.etree._parseFilelikeDocument (src\lxml\l
xml.etree.c:102531)
File "parser.pxi", line 1685, in lxml.etree._parseDocFromFilelike (src\lxml\lx
ml.etree.c:101457)
File "parser.pxi", line 1134, in lxml.etree._BaseParser._parseDocFromFilelike
(src\lxml\lxml.etree.c:97084)
File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDo
c (src\lxml\lxml.etree.c:91290)
File "parser.pxi", line 683, in lxml.etree._handleParseResult (src\lxml\lxml.e
tree.c:92476)
File "parser.pxi", line 622, in lxml.etree._raiseParseError (src\lxml\lxml.etr
ee.c:91772)
lxml.etree.XMLSyntaxError: Extra content at the end of the document, line 4, col
umn 1
So there has to be something different about the destination/target xml but I'm having a hard time understanding what. When I look at both of the values in a browser they're identical except for a few values (jobmst_id)
You aren't closing the files. Change srcxmlsave.close to srcxmlsave.close() or use a context manager as in
with open(srcxml, 'w') as srcxmlsave:
srcxmlsave.write(srcdata)
If anyone experiences an issue like this in the future I found the problem and it's nothing related to lxml or the xml I'm generating. My source environment has been productionalized using mod_wsgi but the target environment is still using runserver.
I guess something in the encoding is breaking with the target. I just productionalized the target environment and it works fine.
Related
I am parsing a huge file(3.5gb) using lxml, the data of one child looks something like this :
<phdthesis mdate="2022-10-12" key="phd/it/Borga22">
<author>Piero Borga</author>
<title>Active opto-magnetic biosensing system on chip.</title>
<school>Polytechnic University of Milan, Italy</school>
<year>2022</year>
<ee>https://hdl.handle.net/10589/188712</ee>
</phdthesis>
(there aren't only phdthesis but also mastersthesis, articles, ...)
I need to make a function to retrieve all fields relative to a title through a web server using bottle. My current function looks like this :
from bottle import route, run
from lxml import etree as ET
#route('/')
#route('/publications/<id>')
def publications(id=''):
rendu = ''
found = 0
tmp = ''
events = ("start", "end")
with open("dblp.xml", "rb") as fo:
context = ET.iterparse(fo, events=events)
for action, elem in context:
if (elem.tag != None) and (elem.tag=='author') and (elem.text != None):
tmp = elem.tag +' '+elem.text
if elem.text==id:
found = 1
if found == 1:
if (action=='end') and ((elem.tag == 'article') | (elem.tag == 'inproceedings') | (elem.tag == 'proceedings') | (elem.tag == 'book') | (elem.tag == 'incollection') | (elem.tag == 'phdthesis') |(elem.tag == 'mastersthesis')) :
return tmp+rendu
elif action == 'start' :
rendu += elem.tag+ ' '
rendu += elem.text+ ' '
run(host='localhost', port=8080, debug=True)
I get this error when i try to reach localhost:8080/publications/"name of an article"
Traceback (most recent call last):
File "c:\Users\isma7\AppData\Local\Programs\Python\Python310\lib\site-packages\bottle.py", line 876, in _handle
return route.call(**args)
File "c:\Users\isma7\AppData\Local\Programs\Python\Python310\lib\site-packages\bottle.py", line 1756, in wrapper
rv = callback(*a, **ka)
File "C:\Users\isma7\AppData\Local\Temp\ipykernel_17812\750479840.py", line 14, in publications
for action, elem in context:
File "src\lxml\iterparse.pxi", line 210, in lxml.etree.iterparse.__next__
File "src\lxml\iterparse.pxi", line 195, in lxml.etree.iterparse.__next__
File "src\lxml\iterparse.pxi", line 230, in lxml.etree.iterparse._read_more_events
File "src\lxml\parser.pxi", line 1376, in lxml.etree._FeedParser.feed
File "src\lxml\parser.pxi", line 606, in lxml.etree._ParserContext._handleParseResult
File "src\lxml\parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
File "src\lxml\parser.pxi", line 725, in lxml.etree._handleParseResult
File "src\lxml\parser.pxi", line 654, in lxml.etree._raiseParseError
File "file:/c:/sshSU/dblp.xml", line 244
lxml.etree.XMLSyntaxError: Entity 'ograve' not defined, line 244, column 31
If i run this without the web server just by removing the bottle part(route/run) and launching :
publications("Active opto-magnetic biosensing system on chip.")
the return value would be correct and no errors would rise, but as soon as i try to access it through the web server :
localhost:8080/publications/"Active opto-magnetic biosensing system on chip."
I get the error message above.
Any suggestions ?
I was running a script to get data from excel for over a year using the Xlwings range command like so...
list=Range('A1:D10').value
Suddenly, it stopper working. I had changed nothing in the code nor the system, other than maybe installing another network card.
This is the error when trying to use the Range assignment now.
Traceback (most recent call last):
File "G:\python32\fetcher.py", line 61, in <module>
listFull = getComData()
File "G:\python32\fetcher.py", line 38, in getComData
listFull=Range('A4:H184').value
File "G:\python32\lib\site-packages\xlwings\main.py", line 1490, in __init__
impl = apps.active.range(cell1).impl
File "G:\python32\lib\site-packages\xlwings\main.py", line 439, in range
return Range(impl=self.impl.range(cell1, cell2))
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 457, in range
xl1 = self.xl.Range(arg1)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 341, in xl
self._xl = get_xl_app_from_hwnd(self._hwnd)
File "G:\python32\lib\site-packages\xlwings\_xlwindows.py", line 251, in get_xl_app_from_hwnd
disp = COMRetryObjectWrapper(Dispatch(p))
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 96, in Dispatch
return __WrapDispatch(dispatch, userName, resultCLSID, typeinfo, clsctx=clsctx)
File "G:\python32\lib\site-packages\win32com\client\__init__.py", line 37, in __WrapDispatch
klass = gencache.GetClassForCLSID(resultCLSID)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 180, in GetClassForCLSID
mod = GetModuleForCLSID(clsid)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 223, in GetModuleForCLSID
mod = GetModuleForTypelib(typelibCLSID, lcid, major, minor)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 259, in GetModuleForTypelib
mod = _GetModule(modName)
File "G:\python32\lib\site-packages\win32com\client\gencache.py", line 622, in _GetModule
mod = __import__(mod_name)
ValueError: source code string cannot contain null bytes
I'm working on slowly converting my very serialized text analysis engine to use Modin and Ray. Feels like I'm nearly there, however, I seem to have hit a stumbling block. My code looks like this:
vectorizer = TfidfVectorizer(
analyzer=ngrams, encoding="ascii", stop_words="english", strip_accents="ascii"
)
tf_idf_matrix = vectorizer.fit_transform(r_strings["name"])
r_vectorizer = ray.put(vectorizer)
r_tf_idf_matrix = ray.put(tf_idf_matrix)
n = 2
match_results = []
for fn in files["c.file"]:
match_results.append(
match_name.remote(fn, r_vectorizer, r_tf_idf_matrix, r_strings, n)
)
match_returns = ray.get(match_results)
I'm following the guidance from the "anti-patterns" section in the Ray documentation, on what to avoid, and this is very similar to that of the "better" pattern.
Traceback (most recent call last):
File "alt.py", line 213, in <module>
match_returns = ray.get(match_results)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 62, in wrapper
return func(*args, **kwargs)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/worker.py", line 1501, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(PicklingError): ray::match_name() (pid=23393, ip=192.168.1.173)
File "python/ray/_raylet.pyx", line 564, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 565, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1652, in ray._raylet.CoreWorker.store_task_outputs
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 327, in serialize
return self._serialize_to_msgpack(value)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 307, in _serialize_to_msgpack
self._serialize_to_pickle5(metadata, python_objects)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 267, in _serialize_to_pickle5
raise e
File "/home/myuser/.local/lib/python3.7/site-packages/ray/serialization.py", line 264, in _serialize_to_pickle5
value, protocol=5, buffer_callback=writer.buffer_callback)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 73, in dumps
cp.dump(obj)
File "/home/myuser/.local/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 580, in dump
return Pickler.dump(self, obj)
_pickle.PicklingError: args[0] from __newobj__ args has the wrong class
Definitely an unexpected result. I'm not sure where to go next with this and would appreciate help from folks who have more experience with Ray and Modin.
In order to read few files from common crawl I have written this script
import warc
import boto
for line in sys.stdin:
line = line.strip()
#Connect to AWS and read a dataset
conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
pds = conn.get_bucket('commoncrawl')
k = Key(pds)
k.key = line
f = warc.WARCFile(fileobj=GzipStreamFile(k))
skipped_doc = 0
for num, record in enumerate(f):
# analysis code
Where each line is the key of warc files. When I run this script to analyze 5 files, I got this exception
Traceback (most recent call last):
File "./warc_mapper_full.py", line 42, in <module>
for num, record in enumerate(f):
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 393, in __iter__
record = self.read_record()
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 364, in read_record
self.finish_reading_current_record()
File "/usr/lib/python2.7/site-packages/warc/warc.py", line 358, in finish_reading_current_record
self.current_payload.read()
File "/usr/lib/python2.7/site-packages/warc/utils.py", line 59, in read
return self._read(self.length)
File "/usr/lib/python2.7/site-packages/warc/utils.py", line 69, in _read
content = self.buf + self.fileobj.read(size)
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 67, in read
result = super(GzipStreamFile, self).read(*args, **kwargs)
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 48, in readinto
data = self.read(len(b))
File "/home/hpcnl/Documents/kics/current_work/aws/tasks/warc-analysis/src/gzipstream/gzipstream/gzipstreamfile.py", line 38, in read
raw = self.stream.read(io.DEFAULT_BUFFER_SIZE)
File "/usr/lib/python2.7/site-packages/boto/s3/key.py", line 400, in read
data = self.resp.read(size)
File "/usr/lib/python2.7/site-packages/boto/connection.py", line 413, in read
return http_client.HTTPResponse.read(self, amt)
File "/usr/lib64/python2.7/httplib.py", line 602, in read
s = self.fp.read(amt)
File "/usr/lib64/python2.7/socket.py", line 380, in read
data = self._sock.recv(left)
File "/usr/lib64/python2.7/ssl.py", line 736, in recv
return self.read(buflen)
File "/usr/lib64/python2.7/ssl.py", line 630, in read
v = self._sslobj.read(len or 1024)
ssl.SSLError: ('The read operation timed out',)
I run it many times. Above exception happened every time. Where is the problem ?
I'm trying to convert text into xml format. And I'm using LXML Library. But I'm getting error message. Please help me. Thank you
import re
from lxml import etree
import urllib,urllib2
def get_movie_info(movie_id):
URL = "http://www.raaga.com/a/rss.asp?%s"%(movie_id)
f = urllib.urlopen(URL)
movie_info = f.read()
rss = "".join([ line.strip() for line in movie_info ])
mi_tree = etree.fromstring(rss)
#mi_title = self._parse_movie_title(mi_tree.xpath("/rss/channel/title/text()")[0])
#mi_tracks = mi_tree.xpath("/rss/channel/item")
return mi_tree
get_movie_info('A0000102')
Here is my traceback
Traceback (most recent call last):
File "py1.py", line 14, in <module>
get_movie_info('A0000102')
File "py1.py", line 9, in get_movie_info
mi_tree = etree.fromstring(rss)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml\lxml.etre
e.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml\lxm
l.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml\lxml.etree.c:7
8774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml\lxm
l.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDo
c (src/lxml\lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml\lxml.e
tree.c:72614)
File "parser.pxi", line 585, in lxml.etree._raiseParseError (src/lxml\lxml.etr
ee.c:71955)
lxml.etree.XMLSyntaxError: xmlParsePITarget: invalid name prefix 'xml', line 1,
column 13
It works fine for me without this string:
rss = "".join([ line.strip() for line in movie_info ])
Something like this:
mi_tree = etree.fromstring(movie_info)