Encoding issues while saving json to file - python

I am reading logs from a mobile device via the system terminal, line by line, and saving them to a string.
Then I cut this string into parts and want to generate a *.json file from it.
I have my JSON in the form of a dictionary and I want to save it to a file using this custom-made method:
def save_json_to_file(json_dict, folder, file_name, extension):
    directory = add_ending_slash(GlobalConfig.OUTPUT_DIR) + add_ending_slash(str(folder))
    file_path = clean_path(directory + str(file_name) + "." + extension)
    output_file = None
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
        output_file = open(file_path, "w")
        absolute_path = os.path.abspath(file_path)
        Printer.system_message(TAG, "Created json file '" + absolute_path + "'.")
        json.dump(json_dict, output_file, indent=4, ensure_ascii=False)
    except Exception as e:
        message = "Unable to create file '{}.{}'. Error message: {}"
        message = message.format(file_path, extension, str(e))
        raise LauncherFlowInterruptedException(TAG, message)
    finally:
        if output_file is not None and hasattr(output_file, "close"):
            output_file.close()
    return absolute_path
At some point, while json.dump is doing its job, I get a crash:
File "/Users/F1sherKK/anaconda3/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 89: invalid start byte
I tried changing every string I receive from the console by doing:
received_terminal_line = received_terminal_line.encode('utf-8')
I know that the incoming strings can contain characters in various encodings, for example emoji.
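Since the crash is a UnicodeDecodeError, the raw bytes coming from the terminal are most likely not valid UTF-8, so decoding them with an error handler (rather than calling .encode() on an already decoded string) is one way to sanitize the input before json.dump. A minimal sketch, assuming the terminal data arrives as bytes; the variable names are placeholders, not the original code:

import json

# raw_line stands in for one line of raw terminal output (bytes, possibly invalid UTF-8).
raw_line = b"some log output \xc0 with an invalid byte"
received_terminal_line = raw_line.decode("utf-8", errors="replace")

# Writing with an explicit UTF-8 encoding keeps emoji intact with ensure_ascii=False.
with open("output.json", "w", encoding="utf-8") as output_file:
    json.dump({"line": received_terminal_line}, output_file, indent=4, ensure_ascii=False)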

Related

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 295: character maps to <undefined>

I am trying to run this program from a book. I have created the file called 'alice_2.txt'.
def count_words(filename):
    """Count the approximate number of words in a file."""
    try:
        with open(filename) as f_obj:
            contents = f_obj.read()
    except FileNotFoundError:
        msg = "Sorry, the file " + filename + " does not exist."
        print(msg)
    else:
        # Count approximate number of words in the file.
        words = contents.split()
        num_words = len(words)
        print("The file " + filename + " has about " + str(num_words) +
              " words.")

filename = 'alice_2.txt'
count_words(filename)
But I keep getting this error message:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 295: character maps to <undefined>
Can anyone explain what this means, and how to solve it?
You are trying to read the file with an encoding that cannot represent a character it contains.
For example, ɛ cannot be read as ASCII, since it has no valid ASCII code.
Try opening the file as UTF-8:
with open(filename, encoding='utf8') as f_obj:
    pass  # do your stuff
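Applied to the count_words function above, a minimal adjusted version might look like this; the errors='replace' argument is an extra assumption, added in case the file also contains bytes that are not valid UTF-8:

def count_words(filename):
    """Count the approximate number of words in a file."""
    try:
        # encoding='utf-8' avoids the platform default codec ('charmap' on Windows);
        # errors='replace' swaps undecodable bytes for U+FFFD instead of crashing.
        with open(filename, encoding='utf-8', errors='replace') as f_obj:
            contents = f_obj.read()
    except FileNotFoundError:
        print("Sorry, the file " + filename + " does not exist.")
    else:
        num_words = len(contents.split())
        print("The file " + filename + " has about " + str(num_words) + " words.")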

Encrypting a file using saved RSA keys in python

I am trying to encrypt an image file using RSA keys that are generated by another script and saved into a .pem file. When I try to encrypt the file, it shows errors like this:
Traceback (most recent call last):
File "rsaencrypt.py", line 85, in <module>
main()
File "rsaencrypt.py", line 45, in main
content = fileObj.read()
File "/usr/lib64/python3.7/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
I am new to Python and file handling, so I think the problem is in the way I am handling the files, both the key files and the input file. Looking forward to some suggestions.
Here's the code of my encryption script:
import time, os, sys
import transpositionEncrypt, transpositionDecrypt  # cipher modules referenced below

def main():
    inputFilename = 'img.jpg'
    # BE CAREFUL! If a file with the outputFilename name already exists,
    # this program will overwrite that file.
    outputFilename = 'encrypted.jpg'
    myKey = open("public_key.pem", 'r')
    myMode = 'encrypt'  # set to 'encrypt' or 'decrypt'

    # If the input file does not exist, then the program terminates early.
    if not os.path.exists(inputFilename):
        print('The file %s does not exist. Quitting...' % (inputFilename))
        sys.exit()

    # If the output file already exists, give the user a chance to quit.
    if os.path.exists(outputFilename):
        print('This will overwrite the file %s. (C)ontinue or (Q)uit?' % (outputFilename))
        response = input('> ')
        if not response.lower().startswith('c'):
            sys.exit()

    # Read in the message from the input file
    fileObj = open(inputFilename)
    content = fileObj.read()
    fileObj.close()

    print('%sing...' % (myMode.title()))

    # Measure how long the encryption/decryption takes.
    startTime = time.time()
    if myMode == 'encrypt':
        translated = transpositionEncrypt.encryptMessage(myKey, content)
    elif myMode == 'decrypt':
        translated = transpositionDecrypt.decryptMessage(myKey, content)
    totalTime = round(time.time() - startTime, 2)
    print('%sion time: %s seconds' % (myMode.title(), totalTime))

    # Write out the translated message to the output file.
    outputFileObj = open(outputFilename, 'w')
    outputFileObj.write(translated)
    outputFileObj.close()

    print('Done %sing %s (%s characters).' % (myMode, inputFilename, len(content)))
    print('%sed file is %s.' % (myMode.title(), outputFilename))

# If transpositionCipherFile.py is run (instead of imported as a module)
# call the main() function.
if __name__ == '__main__':
    main()
You need to open the file in binary mode, not text (which is the default).
Turn
fileObj = open(inputFilename)
into
fileObj = open(inputFilename, "rb")
and .read() will return bytes (i.e. binary data), not str (i.e. text).
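Since the output side has the same issue (an encrypted image is also binary data), both the read and the write need to stay in bytes. A minimal sketch of that pattern, with a placeholder encrypt_bytes function standing in for the actual RSA step:

def encrypt_bytes(data, key):
    # Placeholder only -- stands in for whatever encryption routine is used.
    return data

with open('img.jpg', 'rb') as f:        # 'rb' -> read() returns bytes
    content = f.read()

translated = encrypt_bytes(content, None)

with open('encrypted.jpg', 'wb') as f:  # 'wb' -> write() expects bytes
    f.write(translated)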

UnicodeDecodeError with QFileDialog in PyQt

Hello, I am having an issue with my program when it comes to a file dialog function I have.
First, here is my code:
def getFileInfo(self):
    global logName
    logName = QtGui.QFileDialog.getOpenFileName()
    return logName

def getFileName(self):
    return logName

def compareAction(self):
    def process(infile, outfile, keywords):
        keys = [[k[0], k[1], 0] for k in keywords]
        endk = None
        with open(infile, 'rb') as fdin:
            with open(outfile, 'ab') as fdout:
                fdout.write("<" + words + ">" + "\r\n")
                for line in fdin:
                    if endk is not None:
                        fdout.write(line)
                        if line.find(endk) >= 0:
                            fdout.write("\r\n")
                            endk = None
                    else:
                        for k in keys:
                            index = line.find(k[0])
                            if index >= 0:
                                fdout.write(line[index + len(k[0]):].lstrip())
                                endk = k[1]
                                k[2] += 1
        if endk is not None:
            raise Exception(endk + "Not found before end of file")
        return keys

    clearOutput = open('test.txt', 'wb')
    clearOutput.truncate()
    clearOutput.close()
    outputText = 'test.txt'
    end_token = "[+][+]"
    inputFile = logName
    start_token = self.serialInputText.toPlainText()
    split_start = start_token.split(' ')
    for words in split_start:
        process(inputFile, outputText, ((words + "SHOWALL"),))
    fo = open(outputText, "rb")
    text = fo.read()
    print start_token + '\r\n'
    print split_start
    print inputFile
Okay, so the general idea of this piece of code is grabbing some inputted text from a TextEdit in my PyQt GUI, then splitting that string into a list that can be used to 'scan' through the file, and if there are any matches, printing those matches out into another text document.
Steps:
1. User inputs text into the TextEdit.
2. The text inside the TextEdit gets stored into a QString.
3. That QString has a space as a delimiter, so we split each entry into a list, i.e. "This is a list" -> [u'This', u'Is', u'A', u'List'] (the list has a u due to my code using sip); see the small illustration after this list.
4. Now that we have this list we can pass it through my def process function.
5. We need a file to search through, obviously; this is where the def getFileInfo(self) and def getFileName(self) functions come into play.
6. After the user has inputted some text and selected a file to search through, he/she will press a button, let's call it CompareButton, and it will execute the def compareAction(self) function.
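For reference, a tiny standalone illustration of step 3 (the exact types depend on the sip API version, so treat this as an approximation):

start_token = u"This Is A List"        # roughly what toPlainText() may return
split_start = start_token.split(u' ')  # -> [u'This', u'Is', u'A', u'List']
print(split_start)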
Issue
Currently, my issue is this error that appears after doing all the steps; it fails on step number 6. This is my error:
Traceback (most recent call last):
File "RETRACTED.py", line 278, in compareAction
process(inputFile,outputText,((words + "SHOWALL"),))
File "RETRACTED.py", line 260, in process
index = line.find(k[0])
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
I am unsure why this error is happening. I have been searching for a similar issue, but I believe it has to do with my process function.
That specific error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
looks like a problem with an (unexpected) Byte Order Mark (BOM) in the input file. I suspect the log file is UTF-8 with a BOM.
Try changing your file open line to:
io.open(infile, 'r', encoding='utf-8-sig')
so that the BOM is stripped from the file. (The encoding argument cannot be combined with binary 'rb' mode, and on Python 2 the built-in open() does not accept an encoding at all, which is why io.open is used here; import io first.)
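For illustration, a small standalone check of how the two codecs treat the UTF-8 BOM bytes 0xEF 0xBB 0xBF (Python 3 syntax shown):

data = b'\xef\xbb\xbfsome log line'
print(data.decode('utf-8'))      # '\ufeffsome log line' -- the BOM survives as U+FEFF
print(data.decode('utf-8-sig'))  # 'some log line'       -- the BOM is stripped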

Cannot open file using my text editor in wxpython?

The text editor in wxPython cannot open saved files. The files are saved as text files, but while opening them the following error appears:
Error opening file
'charmap' codec can't decode byte 0x8d in position 5: character maps to <undefined>
The code used for opening the file is given below,
def DoOpenFile(self):
    #wcd = 'All files (*)|*|Editor files (*.ef)|*.ef|'
    wcd = 'Text files(*.txt)|*.txt|Plain Text files (*.txt)|*.txt'
    dir = os.getcwd()
    open_dlg = wx.FileDialog(self, message='Choose a file', defaultDir=dir, defaultFile='',
                             wildcard=wcd, style=wx.OPEN | wx.CHANGE_DIR)
    if open_dlg.ShowModal() == wx.ID_OK:
        path = open_dlg.GetPath()
        try:
            file = open(path, 'r')
            text = file.read()
            file.close()
            if self.text.GetLastPosition():
                self.text.Clear()
            self.text.WriteText(text)
            self.last_name_saved = path
            self.statusbar.SetStatusText('', 1)
            self.modify = False
            self.SetTitle(window_title + path)
        except IOError, error:
            dlg = wx.MessageDialog(self, 'Error opening file' + str(error))
            dlg.ShowModal()
        except UnicodeDecodeError, error:
            dlg = wx.MessageDialog(self, 'Error opening file\n' + str(error))
            dlg.ShowModal()
    open_dlg.Destroy()
Change your code to:
import codecs  # needed at the top of the file

file = codecs.open(path, 'r', encoding='utf-8')
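If the editor has to open files of unknown origin, one possible extension (an assumption on my part, not part of the original answer) is to try strict UTF-8 first and fall back to replacement characters, so opening never raises:

import codecs

def read_text(path):
    # Try strict UTF-8 first; if the file is not valid UTF-8, reopen it with
    # replacement characters so the editor can still display something.
    try:
        with codecs.open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with codecs.open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()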

Python UnicodeEncodeError with pre decoded UTF-8

I'm trying to parse through a bunch of logfiles (up to 4 GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10, and the processing has to be done on Windows XP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps to <undefined>
Here's the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
    for tfile in parser.salestarfiles:
        path = os.path.join(parser.logpath, tfile)
        if os.path.isfile(path):
            if tarfile.is_tarfile(path):
                tar = tarfile.open(path, 'r:gz')
                for tarMember in tar.getmembers():
                    if 'salescomponent-soap.log' in tarMember.name:
                        tarMemberFile = tar.extractfile(tarMember)
                        content = tarMemberFile.read().decode('UTF-8', 'surrogateescape')
                        for m in parser.soaplogregex.finditer(content):
                            entry = {}
                            entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')),
                                                     int(m.group('hour')), int(m.group('minute')), int(m.group('second')),
                                                     int(m.group('millis')) * 1000)
                            entry['perfid'] = m.group('perfid')
                            entry['direction'] = m.group('direction')
                            entry['payload'] = m.group('payload')
                            entry['file'] = tarMember.name
                            entry['source'] = m.group(0)
                            sm = parser.soaplogmethodregex.match(entry['payload'])
                            if sm:
                                entry['method'] = sm.group('method')
                            if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
                                if perfid and entry['perfid'] == perfid:
                                    yield entry
                tar.members = []
And here's the part where I write the processed information out along with the raw data (it's an aggregation of all log entries for one specific process):
if len(entries) > 0:
    time = perfentry['time']
    filename = time.isoformat('-').replace(':', '').replace('-', '') + 'longtail_' + perfentry['perfid'] + '.txt'
    oFile = open(os.path.join(parser.logpath, filename), 'w')
    oFile.write(perfentry['source'] + '\n')
    oFile.write('------\n')
    for key in sorted(entries.keys()):
        oFile.write('------\n')
        oFile.write(entries[key]['source'] + '\n')  # <-- here it is failing
What I don't get is why, if it is correct to read the files as UTF-8, it is not possible to simply write them out as UTF-8 again. What am I doing wrong?
Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data
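One additional point, based on the read side shown above: since the content was decoded with the 'surrogateescape' error handler, it may contain surrogate code points that a strict UTF-8 encoder will also refuse. On Python 3 the built-in open() accepts an encoding as well, so one possible variant (an adjustment of the questioner's line, not part of the original answer) is:

# Passing the same error handler on output round-trips any undecodable bytes
# that 'surrogateescape' produced while reading the logs.
oFile = open(os.path.join(parser.logpath, filename), 'w',
             encoding='utf-8', errors='surrogateescape')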
