UnicodeDecodeError with QFileDialog in PyQt - python

Hello, I am having an issue with my program, specifically with a file dialog function I have.
First, here is my code:
def getFileInfo(self):
    global logName
    logName = QtGui.QFileDialog.getOpenFileName()
    return logName

def getFileName(self):
    return logName

def compareAction(self):
    def process(infile, outfile, keywords):
        keys = [[k[0], k[1], 0] for k in keywords]
        endk = None
        with open(infile, 'rb') as fdin:
            with open(outfile, 'ab') as fdout:
                fdout.write("<" + words + ">" + "\r\n")
                for line in fdin:
                    if endk is not None:
                        fdout.write(line)
                        if line.find(endk) >= 0:
                            fdout.write("\r\n")
                            endk = None
                    else:
                        for k in keys:
                            index = line.find(k[0])
                            if index >= 0:
                                fdout.write(line[index + len(k[0]):].lstrip())
                                endk = k[1]
                                k[2] += 1
        if endk is not None:
            raise Exception(endk + "Not found before end of file")
        return keys

    clearOutput = open('test.txt', 'wb')
    clearOutput.truncate()
    clearOutput.close()

    outputText = 'test.txt'
    end_token = "[+][+]"
    inputFile = logName
    start_token = self.serialInputText.toPlainText()
    split_start = start_token.split(' ')
    for words in split_start:
        process(inputFile, outputText, ((words + "SHOWALL"),))

    fo = open(outputText, "rb")
    text = fo.read()
    print start_token + '\r\n'
    print split_start
    print inputFile
Okay, so the general idea of this piece of code is to grab some inputted text from a TextEdit in my PyQt GUI, split that string into a list that can be used to scan through the file, and, if there are any matches, print those matches out into another text document.
Steps:
1. User inputs text into the TextEdit.
2. The text inside the TextEdit gets stored into a QString.
3. That QString uses a space as a delimiter, so we split each entry into a list, i.e. "This is a list" -> [u'This', u'Is', u'A', u'List'] (the entries have a u prefix because my code uses sip; see the sketch after these steps).
4. Now that we have this list, we can pass it to my process function.
5. We need a file to search through, obviously; this is where the getFileInfo(self) and getFileName(self) functions come into play.
6. After the user has inputted some text and selected a file to search through, he/she will press a button, let's call it CompareButton, and it will execute the compareAction(self) function.
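For reference, steps 2 and 3 boil down to something like the following minimal sketch (my illustration, assuming sip's API v2 is in effect, so toPlainText() already returns a plain Python unicode string rather than a QString):

    # Sketch of steps 2-3: pull the text out of the QTextEdit and split on spaces.
    start_token = self.serialInputText.toPlainText()
    split_start = start_token.split(' ')
    print split_start  # e.g. [u'This', u'Is', u'A', u'List']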
Issue
Currently, my issue is this error that appears after performing all the steps; it fails on step 6. This is my error:
Traceback (most recent call last):
File "RETRACTED.py", line 278, in compareAction
process(inputFile,outputText,((words + "SHOWALL"),))
File "RETRACTED.py", line 260, in process
index = line.find(k[0])
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
I am unsure why this error is happening. I have been searching for a similar issue, but I believe it has to do with my process function.

That specific error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
looks like a problem with an (unexpected) Byte Order Mark (BOM) in the input file. I suspect the log file is UTF-8 with BOM.
Try changing your file open line to:
io.open(infile, 'r', encoding='utf-8-sig')
to have the BOM stripped from the file. (The built-in open in Python 2 does not accept an encoding argument, and an encoding cannot be combined with binary 'rb' mode anyway, so use io.open, or codecs.open, in text mode.)
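To see the difference, here is a minimal standalone sketch (my addition, not the asker's code): the 0xef byte from the error message is the first byte of the UTF-8 BOM, which the utf-8-sig codec strips and the plain utf-8 codec does not (Python 2 repr shown in the comments):

    import io

    # Write a file that starts with the UTF-8 BOM (0xEF 0xBB 0xBF).
    with io.open('demo.txt', 'wb') as f:
        f.write(b'\xef\xbb\xbfhello\n')

    # Plain utf-8 leaks the BOM through as u'\ufeff'...
    with io.open('demo.txt', 'r', encoding='utf-8') as f:
        print(repr(f.read()))  # u'\ufeffhello\n'

    # ...while utf-8-sig strips it.
    with io.open('demo.txt', 'r', encoding='utf-8-sig') as f:
        print(repr(f.read()))  # u'hello\n'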


Python3 f.write UnicodeEncodeError: 'utf-8' codec can't encode characters: surrogates not allowed

I run Python files over the web (via PHP).
An error then occurs while writing a Korean string to a file with Python.
On the other hand, running the Python files directly from the terminal does not cause errors.
Do you know what the problem is?
Please help me.
Traceback (most recent call last):
File "makeApp.py", line 171, in
modify_app_info(app_name)
File "makeApp.py", line 65, in modify_app_info
f.write(line+"\n")
UnicodeEncodeError: 'utf-8' codec can't encode characters in position 13-30: surrogates not allowed
Below is the code that causes the problem.
lines = read_file(read_file_path)
f = open(read_file_path, 'r', encoding='UTF-8')
lines = f.readlines()
f.close()

f = open(write_file_path, 'w', encoding='UTF-8')
for line in lines:
    if '"name": "userInputAppName"' in line:
        line = '    "name": "' + app_name + '",'
        continue
    f.write(line + "\n")
    # f.write(line)
f.close()
Remove the encoding parameter, because you are opening the file in encoded mode, so you can't join any substring onto the string.
So your code will be:
# ...
lines = read_file(read_file_path)
f = open(read_file_path, 'r')
lines = f.readlines()
f.close()

f = open(write_file_path, 'w')
for line in lines:
    if '"name": "userInputAppName"' in line:
        line = '    "name": "' + app_name + '",'
        continue
    f.write(line + "\n")
    # f.write(line)
f.close()
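If the data really does contain lone surrogates (which is what this error usually indicates), another option worth knowing about (my addition, not part of the answer above) is to keep the explicit encoding and tell the codec how to handle characters it cannot encode:

    # Alternative sketch: keep UTF-8 but replace un-encodable lone
    # surrogates instead of raising UnicodeEncodeError.
    f = open(write_file_path, 'w', encoding='UTF-8', errors='replace')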

Encrypting a file using saved RSA keys in python

I am trying to encrypt an image file using RSA keys that are generated by another script and saved into a .pem file. When I try to encrypt the file, it shows errors like this:
Traceback (most recent call last):
File "rsaencrypt.py", line 85, in <module>
main()
File "rsaencrypt.py", line 45, in main
content = fileObj.read()
File "/usr/lib64/python3.7/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
I am new to Python and file handling, so I think the problem is in the way I am handling the files, both the key files and the input file. Looking forward to some suggestions.
Here's the code of my encryption file:
import time, os, sys
import transpositionEncrypt, transpositionDecrypt  # assumed: cipher modules from the rest of the project, used below

def main():
    inputFilename = 'img.jpg'
    # BE CAREFUL! If a file with the outputFilename name already exists,
    # this program will overwrite that file.
    outputFilename = 'encrypted.jpg'
    myKey = open("public_key.pem", 'r')
    myMode = 'encrypt'  # set to 'encrypt' or 'decrypt'

    # If the input file does not exist, then the program terminates early.
    if not os.path.exists(inputFilename):
        print('The file %s does not exist. Quitting...' % (inputFilename))
        sys.exit()

    # If the output file already exists, give the user a chance to quit.
    if os.path.exists(outputFilename):
        print('This will overwrite the file %s. (C)ontinue or (Q)uit?' % (outputFilename))
        response = input('> ')
        if not response.lower().startswith('c'):
            sys.exit()

    # Read in the message from the input file
    fileObj = open(inputFilename)
    content = fileObj.read()
    fileObj.close()

    print('%sing...' % (myMode.title()))

    # Measure how long the encryption/decryption takes.
    startTime = time.time()
    if myMode == 'encrypt':
        translated = transpositionEncrypt.encryptMessage(myKey, content)
    elif myMode == 'decrypt':
        translated = transpositionDecrypt.decryptMessage(myKey, content)
    totalTime = round(time.time() - startTime, 2)
    print('%sion time: %s seconds' % (myMode.title(), totalTime))

    # Write out the translated message to the output file.
    outputFileObj = open(outputFilename, 'w')
    outputFileObj.write(translated)
    outputFileObj.close()

    print('Done %sing %s (%s characters).' % (myMode, inputFilename, len(content)))
    print('%sed file is %s.' % (myMode.title(), outputFilename))

# If transpositionCipherFile.py is run (instead of imported as a module)
# call the main() function.
if __name__ == '__main__':
    main()
You need to open the file in binary mode, not text (which is the default).
Turn
fileObj = open(inputFilename)
into
fileObj = open(inputFilename, "rb")
and .read() will return bytes (i.e. binary data), not str (i.e. text).
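The same applies on the output side. Here is a minimal sketch of the whole round trip (my addition; encrypt_somehow is a hypothetical stand-in for whatever cipher function is actually used — the point is that both files are opened in binary mode):

    # Binary in, binary out: neither open() uses text mode, so no
    # encoding or decoding is attempted on the image data.
    with open('img.jpg', 'rb') as fileObj:
        content = fileObj.read()  # bytes, not str

    translated = encrypt_somehow(content)  # hypothetical; must accept and return bytes

    with open('encrypted.jpg', 'wb') as outputFileObj:
        outputFileObj.write(translated)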

How do I get rid of a random new line in the middle of a line in a text file using Python?

The following code takes the contents of 'out.txt' and appends it to the end of 'fixed_inv.txt' in the form of a new file, 'concat.txt', based on a shared path.
In the 'concat.txt' file, I am getting a few rows (out of thousands) that seem to have a random new line in the middle of said line.
For instance, a line is supposed to look like:
122 abc.def.com Failed to get CIFS shares with error code -2147024891. None Non-supported share access type. 0 Unkonwn NULL bluearc Different Security Type (1), Access is denied. (1354), Pruned. Different security type (21), The inherited access control list (ACL) or access control entry (ACE) could not be built. (3713), Could not convert the name of inner file or directory (27)
But instead, I have a few looking like:
122 abc.def.com Failed to get CIFS shares with error code -2147024891. None
Non-supported share access type. 0 Unkonwn NULL bluearc Different Security Type (1), Access is denied. (1354), Pruned. Different security type (21), The inherited access control list (ACL) or access control entry (ACE) could not be built. (3713), Could not convert the name of inner file or directory (27)
I have tried to fix this in my code below, but for some reason the code runs without fixing the issue; the fix should pull the misplaced half-line back up, i.e. get rid of the stray newline.
class Error:
    def __init__(self, path, message):  # self = new instance of class
        self.path = path
        self.message = message  # error message
        self.matched = False  # has the path from out.txt been matched to the path of fixed_inv.txt?

def open_files(file1, file2, file3):
    try:
        f1 = open(file1, 'r')
    except IOError:
        print("Can't open {}".format(file1))
        return None, None, None  # you can't just open one file, you have to open all
    else:
        try:
            f2 = open(file2, 'r')
        except IOError:
            print("Can't open {}".format(file2))
            f1.close()
            return None, None, None
        else:
            try:
                f3 = open(file3, 'w')
            except IOError:
                print("Can't open {}".format(file3))
                f1.close()
                f2.close()
                return None, None, None
            else:
                return f1, f2, f3

def concat(file1, file2, file3):
    errors = {}  # key: path, value: instance of class Error
    f1, f2, f3 = open_files(file1, file2, file3)
    prevLine = ""  # NEW
    if f1 is not None:  # if file one is able to open...
        with f1:
            for line_num, line in enumerate(f1):  # get the line number and line
                line = line.replace("\\", "/")  # account for the differences in backslashes
                tokens = line.strip().split(': ')  # strip white spaces, split based on ': '
                if len(tokens) != 3:  # if there aren't exactly three tokens...
                    print('Error on line {} in file {}: Expected three tokens, but found {}'.format(line_num + 1, file1, len(tokens)))  # error
                else:  # NEW
                    if line.startswith('Non-supported'):  # NEW
                        Prevline = line
                        Prevline = line.strip('\n')  # NEW
                    else:
                        errors[tokens[1]] = Error(tokens[1], tokens[2])
        with f2:
            with f3:
                for line_num, line in enumerate(f2):
                    line = line.replace("\\", "/").strip()  # account for the differences in backslashes
                    tokens_2 = line.strip().split('\t')  # strip white spaces, split based on tab
                    if len(tokens_2) < 4:  # if we are unable to obtain the path by now, since the path should be on the 3rd or 4th index
                        print('Error on line {} in file {}: Expected >= 4 tokens, but found {}'.format(line_num + 1, file2, len(tokens_2)))
                        f3.write('{}\n'.format(line))
                    else:  # if we have enough tokens to find the path...
                        if tokens_2[3] in errors:  # if path is found in our errors dictionary from out.txt...
                            line.strip('\n')
                            path = tokens_2[3]  # set path to path found
                            msg = errors[path].message  # set the class instance of the value to msg
                            errors[path].matched = True  # paths have been matched
                            f3.write('{}\t{}\n'.format(line, msg))  # write the line and the error message to concat
                        else:  # if path is NOT found in our errors dictionary from out.txt...
                            f3.write('{}\t{}\n'.format(line, 'None'))
                            print('Error on line {} in file {}: Path {} not matched'.format(line_num + 1, file2, tokens_2[3]))  # found in fixed_inv.txt, but not out.txt
    """for e in errors:  # go through errors
        if errors[e].matched is False:  # if no paths have been matched
            print('Path {} from {} not matched in {}'.format(errors[e].path, file1, file2))  # found in out.txt, but not in fixed_inv
            f3.write('{}\t{}\n'.format(line, 'No error present'))
    """

def main():
    file1 = 'out.txt'
    file2 = 'fixed_inv.txt'
    file3 = 'test_concat.txt'
    concat(file1, file2, file3)

if __name__ == '__main__':
    main()
Any ideas/advice would be greatly appreciated! Thank you.
Try replacing the newline chars before writing.
Ex:
f3.write('{}\n'.format(line.strip().replace("\n", "")))
f3.write('{}\t{}\n'.format(line.strip().replace("\n", ""), msg.replace("\n", "")))
f3.write('{}\t{}\n'.format(line.strip().replace("\n", ""), 'None'))
If you can fix this on the output side, it will obviously be a lot easier and more robust. But if you can't, what you're doing is a start in the right direction. You just want to:

Use prevline + line in place of line the first time.
Set prevline = "" in successful cases.
Do the check for an incomplete line before reading an error instead of after.
Distinguish too few tokens (may be an incomplete line) from too many (definitely an error) instead of trying to treat them the same.
Possibly (depending on the actual input) replace newlines with some other whitespace instead of nothing.
Also, you may want to wrap this logic up in a generator function that you can reuse. Something like this:
def tokenizing(lines):
    prevline = ""
    for line in lines:
        line = prevline + line
        line = line.strip_logic_goes_here()
        tokens = tokenize_logic_goes_here(line)
        if len(tokens) > REQUIRED_TOKENS:
            raise AppropriateException()
        elif len(tokens) == REQUIRED_TOKENS:
            yield line, tokens
            prevline = ""
        else:
            prevline = line
    if not prevline:
        return
    tokens = tokenize_logic_goes_here(prevline)
    if len(tokens) != REQUIRED_TOKENS:
        raise AppropriateException()
    yield prevline, tokens
Then you can just write:

for line, tokens in tokenizing(f1):
    ...
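To make the skeleton concrete, here is one filled-in version (my illustration, with assumed details: records are tab-separated, a complete record has exactly REQUIRED_TOKENS fields, 4 is only a placeholder for the real field count of fixed_inv.txt, and the stray newline is replaced with a space as suggested above):

    REQUIRED_TOKENS = 4  # placeholder: use the real field count of fixed_inv.txt

    def tokenizing(lines):
        prevline = ""
        for line in lines:
            line = prevline + line.rstrip("\n")  # glue a wrapped tail onto its head
            tokens = line.split("\t")
            if len(tokens) > REQUIRED_TOKENS:
                raise ValueError("too many fields: %r" % line)
            elif len(tokens) == REQUIRED_TOKENS:
                yield line, tokens  # complete record
                prevline = ""
            else:
                prevline = line + " "  # incomplete: buffer it and wait for the rest
        if prevline.strip():
            raise ValueError("truncated record at end of file: %r" % prevline)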

Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte

I have a problem converting nested JSON to CSV. For this I use https://github.com/vinay20045/json-to-csv (forked a bit to support Python 3.4); the full json_to_csv.py script is below.
Converting works if I set

    # Base Condition
    else:
        reduced_item[str(key)] = (str(value)).encode('utf8', 'ignore')

and

    fp = open(json_file_path, 'r', encoding='utf-8')

but when I import the CSV into MS Excel I see bad Cyrillic characters, for example \xe0\xf1; English text is OK.
I experimented with setting encode('cp1251','ignore'), but then I got an error:
UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>
import sys
import json
import csv

##
# This function converts an item like
# {
#   "item_1":"value_11",
#   "item_2":"value_12",
#   "item_3":"value_13",
#   "item_4":["sub_value_14", "sub_value_15"],
#   "item_5":{
#       "sub_item_1":"sub_item_value_11",
#       "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
#   }
# }
# To
# {
#   "node_item_1":"value_11",
#   "node_item_2":"value_12",
#   "node_item_3":"value_13",
#   "node_item_4_0":"sub_value_14",
#   "node_item_4_1":"sub_value_15",
#   "node_item_5_sub_item_1":"sub_item_value_11",
#   "node_item_5_sub_item_2_0":"sub_item_value_12",
#   "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
    global reduced_item

    # Reduction Condition 1
    if type(value) is list:
        i = 0
        for sub_item in value:
            reduce_item(key + '_' + str(i), sub_item)
            i = i + 1
    # Reduction Condition 2
    elif type(value) is dict:
        sub_keys = value.keys()
        for sub_key in sub_keys:
            reduce_item(key + '_' + str(sub_key), value[sub_key])
    # Base Condition
    else:
        reduced_item[str(key)] = (str(value)).encode('cp1251', 'ignore')

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
    else:
        # Reading arguments
        node = sys.argv[1]
        json_file_path = sys.argv[2]
        csv_file_path = sys.argv[3]

        fp = open(json_file_path, 'r', encoding='cp1251')
        json_value = fp.read()
        raw_data = json.loads(json_value)

        processed_data = []
        header = []
        for item in raw_data[node]:
            reduced_item = {}
            reduce_item(node, item)
            header += reduced_item.keys()
            processed_data.append(reduced_item)

        header = list(set(header))
        header.sort()

        with open(csv_file_path, 'wt+') as f:  # wb+ for python 2.7
            writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
            writer.writeheader()
            for row in processed_data:
                writer.writerow(row)

        print("Just completed writing csv file with %d columns" % len(header))
How do I convert Cyrillic correctly and also skip bad characters?
You need to know which Cyrillic encoding the file you are going to open uses.
For example, this is enough in Python 3:

    with open(args.input_file, 'r', encoding="cp866") as input_file:
        data = input_file.read()
    structure = json.loads(data)

In Python 3 the data variable is then automatically Unicode text. In Python 2 there might be a problem with feeding the input to json.
Also try printing out a line in the Python interpreter and see if the symbols are right. Without the input file it is hard to tell if everything is right. Also, are you sure it is a Python problem and not an Excel-related problem? Did you try opening the file in Notepad++ or a similar encoding-respecting editor?
The most important thing when working with encodings is checking that the input and the output are right. I would suggest looking here.
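One Excel-specific detail worth adding (my aside, not from either answer): Excel tends to assume a legacy code page when opening a plain UTF-8 CSV, but it autodetects UTF-8 when the file starts with a BOM, which Python's utf-8-sig codec writes for you. So the writing side of the script could look like this, with reduce_item then storing plain str values rather than encoded bytes:

    # Sketch: write the CSV as UTF-8 with a BOM so Excel autodetects the
    # encoding; newline='' is the csv-module convention in Python 3.
    with open(csv_file_path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
        writer.writeheader()
        for row in processed_data:
            writer.writerow(row)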
Maybe you could use chardet to detect the file's encoding:

    import chardet
    import json  # needed for json.load below

    File = 'arq.GeoJson'
    enc = chardet.detect(open(File, 'rb').read())['encoding']
    with open(File, 'r', encoding=enc) as f:
        data = json.load(f)

This avoids hard-coding the encoding.

Python UnicodeEncodeError with pre decoded UTF-8

I'm trying to parse through a bunch of logfiles (up to 4 GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10; processing has to be done on Windows XP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps to <undefined>
Here's the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
    for tfile in parser.salestarfiles:
        path = os.path.join(parser.logpath, tfile)
        if os.path.isfile(path):
            if tarfile.is_tarfile(path):
                tar = tarfile.open(path, 'r:gz')
                for tarMember in tar.getmembers():
                    if 'salescomponent-soap.log' in tarMember.name:
                        tarMemberFile = tar.extractfile(tarMember)
                        content = tarMemberFile.read().decode('UTF-8', 'surrogateescape')
                        for m in parser.soaplogregex.finditer(content):
                            entry = {}
                            entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')), int(m.group('hour')), int(m.group('minute')), int(m.group('second')), int(m.group('millis')) * 1000)
                            entry['perfid'] = m.group('perfid')
                            entry['direction'] = m.group('direction')
                            entry['payload'] = m.group('payload')
                            entry['file'] = tarMember.name
                            entry['source'] = m.group(0)
                            sm = parser.soaplogmethodregex.match(entry['payload'])
                            if sm:
                                entry['method'] = sm.group('method')
                            if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
                                if perfid and entry['perfid'] == perfid:
                                    yield entry
                tar.members = []
And here's the part where I write out the processed information along with the raw data (it's an aggregation of all log entries for one specific process):
if len(entries) > 0:
    time = perfentry['time']
    filename = time.isoformat('-').replace(':', '').replace('-', '') + 'longtail_' + perfentry['perfid'] + '.txt'
    oFile = open(os.path.join(parser.logpath, filename), 'w')
    oFile.write(perfentry['source'] + '\n')
    oFile.write('------\n')
    for key in sorted(entries.keys()):
        oFile.write('------\n')
        oFile.write(entries[key]['source'] + '\n')  # <-- here it is failing
What I don't get is why, when it seems to be correct to read the files as UTF-8, it is not possible to just write them out as UTF-8 again. What am I doing wrong?
Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data
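Two side notes on this answer (my additions, not part of the original): on Python 3 the built-in open accepts the encoding argument directly, so codecs.open is not required there; and because the content was decoded with the surrogateescape error handler, lone surrogates can still make a strict UTF-8 writer raise, so passing the same handler on output writes the original undecodable bytes back unchanged:

    # Sketch (assumes Python 3 and content decoded with 'surrogateescape'):
    oFile = open(os.path.join(parser.logpath, filename), 'w',
                 encoding='utf-8', errors='surrogateescape')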
