Python ldif3 parser and exception in for loop - python

From site: https://pypi.python.org/pypi/ldif3/3.2.0
I have this code:
from ldif3 import LDIFParser
from pprint import pprint
parser = LDIFParser(open('data.ldif', 'rb'))
for dn, entry in parser.parse():
print('got entry record: %s' % dn)
pprint(record)
And now, reading my file data.ldif I have exception in parser.parse().
Question is how to catch this exception and allow for loop to go to next record (continue)?
Traceback:
Traceback (most recent call last):
File "ldif.py", line 16, in <module>
for dn, entry in parser.parse():
File "/home/dlubom/anaconda2/lib/python2.7/site-packages/ldif3.py", line 373, in parse
yield self._parse_entry_record(block)
File "/home/dlubom/anaconda2/lib/python2.7/site-packages/ldif3.py", line 346, in _parse_entry_record
attr_type, attr_value = self._parse_attr(line)
File "/home/dlubom/anaconda2/lib/python2.7/site-packages/ldif3.py", line 309, in _parse_attr
return attr_type, attr_value.decode('utf8')
File "/home/dlubom/anaconda2/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xb3 in position 6: invalid start byte

I think it's not possible to handle exceptions in such a case because they happen before / during the variable assignment.
BTW probably want to use the attribute:
strict (boolean) – If set to False, recoverable parse errors will
produce log warnings rather than exceptions.
Example:
parser = LDIFParser(ldif_file, strict=False)
https://ldif3.readthedocs.io/en/latest/
That helped me parsing an invalid ldif file containing commas inside CN attributes.

Related

json.load() function doesn't work- Python

JSON load doesn't work for me whenever I run my code,
In this part, I create the code to read the file
import json
filename = 'eq_1_day_m1.json'
with open(filename) as f:
all_eq_data = json.load(f)
readable_file = 'readable_eq_data.json'
with open(readable_file, 'w') as c:
json.dump(all_eq_data, c, indent=4)
Then It gives me so many errors talking about charmap. I think this is because of the maximum capacity. Can I do something about this?
C:\Users\PC\AppData\Local\Microsoft\WindowsApps\python.exe "C:/Users/PC/PycharmProjects/Learning/Learning Matplotlib/eq_explore_data.py"
Traceback (most recent call last):
File "C:\Users\PC\PycharmProjects\Learning\Learning Matplotlib\eq_explore_data.py", line 5, in <module>
all_eq_data = json.load(f)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.496.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.496.0_x64__qbz5n2kfra8p0\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 10292: character maps to <undefined>
Process finished with exit code 1
I have my json file: 'eq_1_day_m1.json' if you are wondering. It's too big for StackOverflow to handle so I didn't add it to the question.

Encoding Error with Beautiful Soup: Character Maps to Undefined (Python)

I've written a script that is supposed to retrieve html pages off a site and update their contents. The following function looks for a certain file on my system, then attempts to open it and edit it:
def update_sn(files_to_update, sn, table, title):
    """Find the SN .htm file listed in *files_to_update* and rewrite it with
    a new title and the supplied *table* HTML, saving under a new path.

    files_to_update -- dict with 'files' (list of paths), 'predecessor'
                       and 'original' keys (schema inferred from usage —
                       TODO confirm against caller)
    sn              -- new SN number used as the document title
    table           -- HTML fragment containing the replacement table
    title           -- unused in this body — NOTE(review): confirm intent
    """
    paths = files_to_update['files']
    print('updating the sn')
    try:
        # First .htm path that is none of the excluded pages
        # (Default / Notes / Latest_Addings), plus the matching _Notes page.
        sn_htm = [s for s in paths if re.search('^((?!(Default|Notes|Latest_Addings)).)*htm$', s)][0]
        notes_htm = [s for s in paths if re.search('_Notes\.htm$', s)][0]
    except Exception:
        # NOTE(review): `pass` does not leave the function; when no match is
        # found, sn_htm stays unbound and the next line raises NameError.
        print('no sns were found')
        pass
    new_path_name = new_path(sn_htm, files_to_update['predecessor'], files_to_update['original'])
    new_sn_number = sn
    # Decodes the raw bytes as cp1252 — presumably the site's encoding;
    # TODO confirm (a mismatch here is one source of the reported errors).
    # NOTE(review): file handle is never closed (no `with`).
    htm_text = open(sn_htm, 'rb').read().decode('cp1252')
    # Capture everything from the first <table> up to </html>, then strip it
    # out to keep only the page's head/preamble.
    content = re.findall(r'(<table>.*?<\/table>.*)(?:<\/html>)', htm_text, re.I | re.S)
    minus_content = htm_text.replace(content[0], '')
    table_soup = BeautifulSoup(table, 'html.parser')
    new_soup = BeautifulSoup(minus_content, 'html.parser')
    head_title = new_soup.title.string.replace_with(new_sn_number)
    new_soup.link.insert_after(table_soup.div.next)
    # NOTE(review): file is opened in TEXT mode, so write() only accepts str;
    # the str.encode() fallbacks below produce bytes and raise TypeError.
    with open(new_path_name, "w+") as file:
        result = str(new_soup)
        try:
            file.write(result)
        except Exception:
            print('Met exception. Changing encoding to cp1252')
            try:
                # NOTE(review): result is a str, so result('cp1252') raises
                # TypeError — presumably result.encode('cp1252') was intended.
                file.write(result('cp1252'))
            except Exception:
                print('cp1252 did\'nt work. Changing encoding to utf-8')
                file.write(result.encode('utf8'))
                try:
                    print('utf8 did\'nt work. Changing encoding to utf-16')
                    file.write(result.encode('utf16'))
                except Exception:
                    pass
This works in the majority of cases, but sometimes it fails to write, at which point the exception kicks in and I try every feasible encoding without success:
updating the sn
Met exception. Changing encoding to cp1252
cp1252 did'nt work. Changing encoding to utf-8
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 145, in update_sn
file.write(result)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 4006-4007: character maps to <undefined>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 149, in update_sn
file.write(result('cp1252'))
TypeError: 'str' object is not callable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "scraper.py", line 79, in <module>
get_latest(entries[0], int(num), entries[1])
File "scraper.py", line 56, in get_latest
update_files.update_sn(files_to_update, data['number'], data['table'], data['title'])
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 152, in update_sn
file.write(result.encode('utf8'))
TypeError: write() argument must be str, not bytes
Can anyone give me any pointers on how to better handle html data that might have inconsistent encoding?
In your code you open the file in text mode, but then you attempt to write bytes (str.encode returns bytes) and so Python throws an exception:
TypeError: write() argument must be str, not bytes
If you want to write bytes, you should open the file in binary mode.
BeautifulSoup detects the document’s encoding (if it is bytes) and converts it to string automatically. We can access the encoding with .original_encoding, and use it to encode the content when writing to file. For example,
soup = BeautifulSoup(b'<tag>ascii characters</tag>', 'html.parser')
data = soup.tag.text
encoding = soup.original_encoding or 'utf-8'
print(encoding)
#ascii
with open('my.file', 'wb+') as file:
file.write(data.encode(encoding))
In order for this to work you should pass your html as bytes to BeautifulSoup, so don't decode the response content.
If BeautifulSoup fails to detect the correct encoding for some reason, then you could try a list of possible encodings, like you have done in your code.
data = 'Somé téxt'
encodings = ['ascii', 'utf-8', 'cp1252']
with open('my.file', 'wb+') as file:
for encoding in encodings:
try:
file.write(data.encode(encoding))
break
except UnicodeEncodeError:
print(encoding + ' failed.')
Alternatively, you could open the file in text mode and set the encoding in open (instead of encoding the content), but note that this option is not available in Python2.
Just out of curiosity, is this line of code a typo file.write(result('cp1252'))? Seems like it is missing .encode method.
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 149, in update_sn
file.write(result('cp1252'))
TypeError: 'str' object is not callable
Will it work perfectly if you modify the code to: file.write(result.encode('cp1252'))
I once had this write to file with encoding problem and brewed my own solution through the following thread:
Saving utf-8 texts in json.dumps as UTF8, not as \u escape sequence
.
My problem solved by changing the html.parser parsing mode to html5lib. I root-caused my problem due to malformed HTML tag and solved it with html5lib parser. For your reference, this is the documentation for each parser provided by BeautifulSoup.
Hope this helps

Umlauts in JSON files lead to errors in Python code created by ANTLR4

I've created python modules from the JSON grammar on github / antlr4 with
antlr4 -Dlanguage=Python3 JSON.g4
I've written a main program "JSON2.py" following this guide: https://github.com/antlr/antlr4/blob/master/doc/python-target.md
and downloaded the example1.json also from github.
python3 ./JSON2.py example1.json # works perfectly, but
python3 ./JSON2.py bookmarks-2017-05-24.json # the bookmarks contain German Umlauts like "ü"
...
File "/home/xyz/lib/python3.5/site-packages/antlr4/FileStream.py", line 27, in readDataFrom
return codecs.decode(bytes, encoding, errors)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 227: ordinal not in range(128)
The offending line in JSON2.py is
input = FileStream(argv[1])
I've searched stackoverflow and tried this instead of using the above FileStream:
fp = codecs.open(argv[1], 'rb', 'utf-8')
try:
input = fp.read()
finally:
fp.close()
lexer = JSONLexer(input)
stream = CommonTokenStream(lexer)
parser = JSONParser(stream)
tree = parser.json() # This is line 39, mentioned in the error message
Execution of this program ends with an error message, even if the input file doesn't contain Umlauts:
python3 ./JSON2.py example1.json
Traceback (most recent call last):
File "./JSON2.py", line 46, in <module>
main(sys.argv)
File "./JSON2.py", line 39, in main
tree = parser.json()
File "/home/x/Entwicklung/antlr/links/JSONParser.py", line 108, in json
self.enterRule(localctx, 0, self.RULE_json)
File "/home/xyz/lib/python3.5/site-packages/antlr4/Parser.py", line 358, in enterRule
self._ctx.start = self._input.LT(1)
File "/home/xyz/lib/python3.5/site-packages/antlr4/CommonTokenStream.py", line 61, in LT
self.lazyInit()
File "/home/xyz/lib/python3.5/site-packages/antlr4/BufferedTokenStream.py", line 186, in lazyInit
self.setup()
File "/home/xyz/lib/python3.5/site-packages/antlr4/BufferedTokenStream.py", line 189, in setup
self.sync(0)
File "/home/xyz/lib/python3.5/site-packages/antlr4/BufferedTokenStream.py", line 111, in sync
fetched = self.fetch(n)
File "/home/xyz/lib/python3.5/site-packages/antlr4/BufferedTokenStream.py", line 123, in fetch
t = self.tokenSource.nextToken()
File "/home/xyz/lib/python3.5/site-packages/antlr4/Lexer.py", line 111, in nextToken
tokenStartMarker = self._input.mark()
AttributeError: 'str' object has no attribute 'mark'
This parses correctly:
javac *.java
grun JSON json -gui bookmarks-2017-05-24.json
So the grammar itself is not the problem.
So finally the question: How should I process the input file in python, so that lexer and parser can digest it?
Thanks in advance.
Make sure your input file is actually encoded as UTF-8. Many problems with character recognition by the lexer are caused by using other encodings. I just took a testbed application, added ë to the list of available characters for an IDENTIFIER and it works again. UTF-8 is the key -- and make sure your grammar also allows these characters where you want to accept them.
I solved it by passing the encoding info:
input = FileStream(sys.argv[1], encoding = 'utf8')
If without the encoding info, I will have the same issue as yours.
Traceback (most recent call last):
File "test.py", line 20, in <module>
main()
File "test.py", line 9, in main
input = FileStream(sys.argv[1])
File ".../lib/python3.5/site-packages/antlr4/FileStream.py", line 20, in __init__
super().__init__(self.readDataFrom(fileName, encoding, errors))
File ".../lib/python3.5/site-packages/antlr4/FileStream.py", line 27, in readDataFrom
return codecs.decode(bytes, encoding, errors)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe4 in position 1: ordinal not in range(128)
Where my input data is
[今明]天(台南|高雄)的?天氣如何

UnicodeEncodeError in Python 3.6

I'm trying to find a way to disable this error logging in my python code. The program seems to actually run fine, the search function just returns a whole buttload of json objects with dozens of attributes each whenever it finds a character that it cant print it will print the thousands of json objects returned
to the console.
I wrapped the guilty code (below) in a try block but it hasn't changed anything.
try:
results = api.search(query)
print('Station hits: ', len(results['station_hits']), '\nSong hits: ', len(results['song_hits']), '\nArtist hits: ', len(results['artist_hits']), '\nAlbum hits: ', len(results['album_hits'])).encode('ascii', 'ignore')
except UnicodeEncodeError:
pass
Here is the error that is printed to the console. (Without the buttload of text referenced earlier)
--- Logging error ---
Traceback (most recent call last):
File "C:\Users\670854\AppData\Local\Programs\Python\Python36-32\lib\logging\__init__.py", line 989, in emit
stream.write(msg)
File "C:\Users\670854\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u2117' in position 108194: character maps to <undefined>
Call stack:
File "gpm.py", line 247, in <module>
main()
File "gpm.py", line 181, in main
results = api.search(query)
File "C:\Users\670854\AppData\Local\Programs\Python\Python36-32\lib\site-packages\gmusicapi\clients\mobileclient.py", line 1806, in search
res = self._make_call(mobileclient.Search, query, max_results)
File "C:\Users\670854\AppData\Local\Programs\Python\Python36-32\lib\site-packages\gmusicapi\clients\shared.py", line 84, in _make_call
return protocol.perform(self.session, self.validate, *args, **kwargs)
File "C:\Users\670854\AppData\Local\Programs\Python\Python36-32\lib\site-packages\gmusicapi\protocol\shared.py", line 243, in perform
log.debug(cls.filter_response(parsed_response))
Trace-back etc reveals: Can't encode u"\u2117" using cp1252, not surprising, use utf8 instead.

Getting Unicode Error when looping through tags and writing XML

I am trying to write out some XML that does have some special characters. The place where I am running in trouble is when I iterate through a list of tags to create several elements called tag.
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as xml
reload(sys)
sys.setdefaultencoding('utf-8')
Snippet of code:
check = (video['tags'].split(', '))
x=len(check)
y=x-1
for i in xrange(0,y):
tagger = xml.SubElement(doc, 'field', name="tag")
s=check[i]
tagger.text = s.encode('utf-8')
The problem is when I try to write:
output = open(file_name,'w+')
tree = xml.ElementTree(add)
tree.write(output)
output.close()
I get the following error:
Traceback (most recent call last):
File "xml_breakup3.py", line 108, in <module>
tagger.text = s.encode('utf-8')
UnicodeDecodeError: 'utf8' codec can't decode byte 0x81 in position 0: invalid start byte
When I run my code without this snippet it writes the xml without a problem. If I make tagger.text = any kind of string (i.e. '99') it writes fine. If I make the loop go from 0 to 3 it works. It is only when I try to iterate through the whole list that I get a UnicodeDecodeError.
when I try:
check = (video['tags'].split(', '))
for ta in check:
tagger = xml.SubElement(doc, 'field', name="tag")
tagger.text = ta
I get this:
Traceback (most recent call last):
File "xml_breakup3.py", line 172, in <module>
tree.write(output)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 821, in write
serialize(write, self._root, encoding, qnames, namespaces)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 940, in _serialize_xml
_serialize_xml(write, e, encoding, qnames, None)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 940, in _serialize_xml
_serialize_xml(write, e, encoding, qnames, None)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 938, in _serialize_xml
write(_escape_cdata(text, encoding))
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/etree/ElementTree.py", line 1074, in _escape_cdata
return text.encode(encoding, "xmlcharrefreplace")
UnicodeDecodeError: 'utf8' codec can't decode byte 0xba in position 0: invalid start byte
You may want to try removing the str from in front of the piece you are encoding. When you use str, you are converting what I assume is Unicode into a string, which you are then trying encode. If you instead leave it as Unicode and decode directly, it should work:
>>> s = u'\xba'
>>> print s
º
>>> s.encode('utf8')
'\xc2\xba'
>>> str(s).encode('utf8')
Traceback (most recent call last):
File "<pyshell#30>", line 1, in <module>
str(s).encode('utf8')
UnicodeEncodeError: 'ascii' codec can't encode character u'\xba' in position 0: ordinal not in range(128)

Categories