Python UnicodeEncodeError with pre decoded UTF-8 - python

I'm trying to parse through a bunch of logfiles (up to 4GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10, processing has to be done on WindowsXP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw-data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps
to <undefined>
Heres the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
for tfile in parser.salestarfiles:
path = os.path.join(parser.logpath,tfile)
if os.path.isfile(path):
if tarfile.is_tarfile(path):
tar = tarfile.open(path,'r:gz')
for tarMember in tar.getmembers():
if 'salescomponent-soap.log' in tarMember.name:
tarMemberFile = tar.extractfile(tarMember)
content = tarMemberFile.read().decode('UTF-8','surrogateescape')
for m in parser.soaplogregex.finditer(content):
entry = {}
entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')),int(m.group('hour')), int(m.group('minute')), int(m.group('second')), int(m.group('millis'))*1000)
entry['perfid'] = m.group('perfid')
entry['direction'] = m.group('direction')
entry['payload'] = m.group('payload')
entry['file'] = tarMember.name
entry['source'] = m.group(0)
sm = parser.soaplogmethodregex.match(entry['payload'])
if sm:
entry['method'] = sm.group('method')
if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
if perfid and entry['perfid'] == perfid:
yield entry
tar.members = []
And heres the part where I write the processed information along with the raw data out(its an aggregation of all log-entries for one specific process:
if len(entries) > 0:
time = perfentry['time']
filename = time.isoformat('-').replace(':','').replace('-','') + 'longtail_' + perfentry['perfid'] + '.txt'
oFile = open(os.path.join(parser.logpath,filename), 'w')
oFile.write(perfentry['source'] +'\n')
oFile.write('------\n')
for key in sorted(entries.keys()):
oFile.write('------\n')
oFile.write(entries[key]['source'] + '\n') #<-- here it is failing
What I don't get is why it seems to be correct to read the files in UTF-8, it is not possible to just write them out as UTF-8. What am I doing wrong?

Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data

Related

Python3 f.write UnicodeEncodeError: 'utf-8' codec can't encode characters surrogates not allowed

Run Python files over the web (php).
Afterwards, an error occurs while printing the Korean string to a file with Python.
On the other hand, running Python files directly using the terminal does not cause errors.
Do you know what the problem is?
Please help me.
error Traceback (most recent call last): File "makeApp.py", line 171,
in modify_app_info(app_name) File "makeApp.py", line 65, in modify_app_info f.write(line+"\n") UnicodeEncodeError: 'utf-8' codec can't encode characters in position 13-30: surrogates not allowed
Below is the code that causes the problem.
lines = read_file(read_file_path)
f = open(read_file_path, 'r', encoding='UTF-8')
lines = f.readlines()
f.close()
f = open(write_file_path, 'w', encoding='UTF-8')
for line in lines:
if '"name": "userInputAppName"' in line:
line = ' "name": "' + app_name + '",')
continue
f.write(line+"\n")
# f.write(line)
f.close()
Remove the encoding param, cauz you open a file in encoded mode so, you can't join any substring on the string.
So your code will be--
# ...
lines = read_file(read_file_path)
f = open(read_file_path, 'r')
lines = f.readlines()
f.close()
f = open(write_file_path, 'w')
for line in lines:
if '"name": "userInputAppName"' in line:
line = ' "name": "' + app_name + '",')
continue
f.write(line+"\n")
# f.write(line)
f.close()

Unicode error in `str.format()`

I am trying to run the following script, which scans for *.csproj files and checks for project dependencies in Visual Studio solutions, but I am getting the following error. I have already tried all sorts of codec and encode/decode and u'' combination, to no avail...
(the diacritics are intended and I plan to keep them).
Traceback (most recent call last):
File "E:\00 GIT\SolutionDependencies.py", line 44, in <module>
references = GetProjectReferences("MiotecGit")
File "E:\00 GIT\SolutionDependencies.py", line 40, in GetProjectReferences
outputline = u'"{}" -> "{}"'.format(projectName, referenceName)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 19: ordinal not in range(128)
import glob
import os
import fnmatch
import re
import subprocess
import codecs
gvtemplate = """
digraph g {
rankdir = "LR"
#####
}
""".strip()
def GetProjectFiles(rootFolder):
result = []
for root, dirnames, filenames in os.walk(rootFolder):
for filename in fnmatch.filter(filenames, "*.csproj"):
result.append(os.path.join(root, filename))
return result
def GetProjectName(path):
result = os.path.splitext(os.path.basename(path))[0]
return result
def GetProjectReferences(rootFolder):
result = []
projectFiles = GetProjectFiles(rootFolder)
for projectFile in projectFiles:
projectName = GetProjectName(projectFile)
with codecs.open(projectFile, 'r', "utf-8") as pfile:
content = pfile.read()
references = re.findall("<ProjectReference.*?</ProjectReference>", content, re.DOTALL)
for reference in references:
referenceProject = re.search('"([^"]*?)"', reference).group(1)
referenceName = GetProjectName(referenceProject)
outputline = u'"{}" -> "{}"'.format(projectName, referenceName)
result.append(outputline)
return result
references = GetProjectReferences("MiotecGit")
output = u"\n".join(*references)
with codecs.open("output.gv", "w", 'utf-8') as outputfile:
outputfile.write(gvtemplate.replace("#####", output))
graphvizpath = glob.glob(r"C:\Program Files*\Graphviz*\bin\dot.*")[0]
command = '{} -Gcharset=latin1 -T pdf -o "output.pdf" "output.gv"'.format(graphvizpath)
subprocess.call(command)
When Python 2.x tries to use a byte string in a Unicode context, it automatically tries to decode the byte string to a Unicode string using the ascii codec. While the ascii codec is a safe choice, it often doesn't work.
For Windows environments the mbcs codec will select the code page that Windows uses for 8-bit characters. You can decode the string yourself explicitly.
outputline = u'"{}" -> "{}"'.format(projectName.decode('mbcs'), referenceName.decode('mbcs'))

UnicodeDecodeError with QFileDialog in PyQt

Hello I am having an issue with my program when it comes to a file dialog function I have.
First here is my code:
def getFileInfo(self):
global logName
logName = QtGui.QFileDialog.getOpenFileName()
return logName
def getFileName(self):
return logName
def compareAction(self):
def process(infile, outfile, keywords):
keys = [[k[0], k[1], 0] for k in keywords]
endk = None
with open(infile, 'rb') as fdin:
with open(outfile, 'ab') as fdout:
fdout.write("<" + words + ">" + "\r\n")
for line in fdin:
if endk is not None:
fdout.write(line)
if line.find(endk) >= 0:
fdout.write("\r\n")
endk = None
else:
for k in keys:
index = line.find(k[0])
if index >= 0:
fdout.write(line[index + len(k[0]):].lstrip())
endk = k[1]
k[2] += 1
if endk is not None:
raise Exception(endk + "Not found before end of file")
return keys
clearOutput = open('test.txt', 'wb')
clearOutput.truncate()
clearOutput.close()
outputText = 'test.txt'
end_token = "[+][+]"
inputFile = logName
start_token = self.serialInputText.toPlainText()
split_start = start_token.split(' ')
for words in split_start:
process(inputFile,outputText,((words + "SHOWALL"),))
fo = open(outputText, "rb")
text = fo.read()
print start_token + '\r\n'
print split_start
print inputFile
Okay, So the general idea of this piece of code is grabbing a some inputted text from a TextEdit in my PyQt GUI. Then, splitting that string into a List that can be used to 'scan' through the file and if there are any matches then print out those matches into another text document.
Steps:
User inputs texts into TextEdit
Texts inside TextEdit gets stored into a QString
That QString has a space as a delimiter so we split each entry into a list. i.e This is a list -> [u'This', u'Is', u'A', u'List'] (The list has a u due to my code using sip)
Now that we have this QStringList we can pass it through my def process function.
We need a file to search through obviously, this is where the def getFileInfo(self) and def GetFileName(Self) function come into play.
So after the user has inputted some text, selected a file to search through, he/she will press a Button, lets call it CompareButton, and it will execute the def compareAction(self) function.
Issue
Currently, my issue is this error that appears after doing all the steps it fails on step number 6. This is my error:
Traceback (most recent call last):
File "RETRACTED.py", line 278, in compareAction
process(inputFile,outputText,((words + "SHOWALL"),))
File "RETRACTED.py", line 260, in process
index = line.find(k[0])
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
I am unsure as to why this error is happening. I have been searching for a similar issue but i believe it has to do with my process function. I am unsure
That specific error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)
looks like a problem with an (unexpected) Byte Order Mark (BOM) in the input file. I suspect the log file is UTF-8 with BOM.
Try changing your file open line to:
open(infile, 'rb', encoding='utf-8-sig')
to have the the BOM marker stripped from the file.

convert pdf to text file in python

My code works perfectly for some pdf, but some show error:
Traceback (most recent call last):
File "con.py", line 24, in <module>
print getPDFContent("abc.pdf")
File "con.py", line 17, in getPDFContent
f.write(a)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u02dd' in position 64: ordinal not in range(128)
My code is
import pyPdf
def getPDFContent(path):
content = ""
pdf = pyPdf.PdfFileReader(file(path, "rb"))
for i in range(0, pdf.getNumPages()):
f=open("xxx.txt",'a')
content= pdf.getPage(i).extractText() + "\n"
import string
c=content.split()
for a in c:
f.write(" ")
f.write(a)
f.write('\n')
f.close()
return content
print getPDFContent("abc.pdf")
Your problem is that when you call f.write() with a string, it is trying to encode it using the ascii codec. Your pdf contains characters that can not be represented by the ascii codec. Try explicitly encoding your str, e.g.
a = a.encode('utf-8')
f.write(a)
Try
import sys
print getPDFContent("abc.pdf").encode(sys.getfilesystemencoding())

Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte

I have a problem converting nested JSON to CSV. For this i use https://github.com/vinay20045/json-to-csv (forked a bit to support python 3.4), here is full json-to-csv.py file.
Converting is working, if i set
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('utf8','ignore')
and
fp = open(json_file_path, 'r', encoding='utf-8')
but when i import csv to MS Excel i see bad cyrillic characters, for example \xe0\xf1 , english text is ok.
Experimented with setting encode('cp1251','ignore') but then i got an error
UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to (as here UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>)
import sys
import json
import csv
##
# This function converts an item like
# {
# "item_1":"value_11",
# "item_2":"value_12",
# "item_3":"value_13",
# "item_4":["sub_value_14", "sub_value_15"],
# "item_5":{
# "sub_item_1":"sub_item_value_11",
# "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
# }
# }
# To
# {
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
global reduced_item
#Reduction Condition 1
if type(value) is list:
i=0
for sub_item in value:
reduce_item(key+'_'+str(i), sub_item)
i=i+1
#Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+str(sub_key), value[sub_key])
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('cp1251','ignore')
if __name__ == "__main__":
if len(sys.argv) != 4:
print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
else:
#Reading arguments
node = sys.argv[1]
json_file_path = sys.argv[2]
csv_file_path = sys.argv[3]
fp = open(json_file_path, 'r', encoding='cp1251')
json_value = fp.read()
raw_data = json.loads(json_value)
processed_data = []
header = []
for item in raw_data[node]:
reduced_item = {}
reduce_item(node, item)
header += reduced_item.keys()
processed_data.append(reduced_item)
header = list(set(header))
header.sort()
with open(csv_file_path, 'wt+') as f:#wb+ for python 2.7
writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
writer.writeheader()
for row in processed_data:
writer.writerow(row)
print("Just completed writing csv file with %d columns" % len(header))
How to convert cyrillic correctly and also i want to skip bad characters?
You need to know cyrylic encoding of which file are you going to open.
For example that is enough in python3:
with open(args.input_file, 'r', encoding="cp866") as input_file:
data = input_file.read()
structure = json.loads(data)
In python3 data variable is automatically utf-8. In python2 there might be problem with feeding input to json.
Also try to print out in python interpreter line and see if symbols are right. Without input file is hard to tell if everything is right. Also are you sure that it is python, not excel related problem? Did you tried to open in notepad++ or similar encodings respecting editors?
Most important thing working with encodings is cheking that input and output is right. I would suggest to look here.
maybe you could use the chardet to detect the file's encoding.
import chardet
File='arq.GeoJson'
enc=chardet.detect(open(File,'rb').read())['encoding']
with open(File,'r', encoding = enc) as f:
data=json.load(f)
f.close()
This avoids 'to kick' the encoding.

Categories