Unicode error in `str.format()` - python

I am trying to run the following script, which scans for *.csproj files and checks for project dependencies in Visual Studio solutions, but I am getting the following error. I have already tried all sorts of codec and encode/decode and u'' combination, to no avail...
(the diacritics are intended and I plan to keep them).
Traceback (most recent call last):
File "E:\00 GIT\SolutionDependencies.py", line 44, in <module>
references = GetProjectReferences("MiotecGit")
File "E:\00 GIT\SolutionDependencies.py", line 40, in GetProjectReferences
outputline = u'"{}" -> "{}"'.format(projectName, referenceName)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 19: ordinal not in range(128)
import glob
import os
import fnmatch
import re
import subprocess
import codecs
gvtemplate = """
digraph g {
rankdir = "LR"
#####
}
""".strip()
def GetProjectFiles(rootFolder):
result = []
for root, dirnames, filenames in os.walk(rootFolder):
for filename in fnmatch.filter(filenames, "*.csproj"):
result.append(os.path.join(root, filename))
return result
def GetProjectName(path):
result = os.path.splitext(os.path.basename(path))[0]
return result
def GetProjectReferences(rootFolder):
result = []
projectFiles = GetProjectFiles(rootFolder)
for projectFile in projectFiles:
projectName = GetProjectName(projectFile)
with codecs.open(projectFile, 'r', "utf-8") as pfile:
content = pfile.read()
references = re.findall("<ProjectReference.*?</ProjectReference>", content, re.DOTALL)
for reference in references:
referenceProject = re.search('"([^"]*?)"', reference).group(1)
referenceName = GetProjectName(referenceProject)
outputline = u'"{}" -> "{}"'.format(projectName, referenceName)
result.append(outputline)
return result
references = GetProjectReferences("MiotecGit")
output = u"\n".join(*references)
with codecs.open("output.gv", "w", 'utf-8') as outputfile:
outputfile.write(gvtemplate.replace("#####", output))
graphvizpath = glob.glob(r"C:\Program Files*\Graphviz*\bin\dot.*")[0]
command = '{} -Gcharset=latin1 -T pdf -o "output.pdf" "output.gv"'.format(graphvizpath)
subprocess.call(command)

When Python 2.x tries to use a byte string in a Unicode context, it automatically tries to decode the byte string to a Unicode string using the ascii codec. While the ascii codec is a safe choice, it often doesn't work.
For Windows environments the mbcs codec will select the code page that Windows uses for 8-bit characters. You can decode the string yourself explicitly.
outputline = u'"{}" -> "{}"'.format(projectName.decode('mbcs'), referenceName.decode('mbcs'))

Related

Python convention name files encoding from iso-8859-5 to utf-8

I have about 3500 files whose name is encoded in 'iso-8859-5' and the contents too.
here's how it looks on the Linux console and the 7 zip program:
I'm trying to write a script that converts to 'UTF-8'
# -*- coding: utf-8 -*-
import os
#Exemple
# how it should look like
#iso-8859-5 ==> utf-8
#НјБ_ФШРУ_Г99 ==> ЭМС_диаг_У99
path = r"C://Users//Kamel//Desktop//работа//macros"
obj = os.scandir(path)
for entry in obj:
if entry.is_dir() or entry.is_file():
command = entry.name
print(command, end="\t\t")
file_name = command.encode('iso-8859-5').decode('UTF-8')
print(command)
I get this error
C:\Python\Python310\python.exe D:/PycharmProjects/pythonProject3/ansi_to_utf.py
Traceback (most recent call last):
File "D:\PycharmProjects\pythonProject3\ansi_to_utf.py", line 15, in <module>
file_name = command.encode('iso-8859-5').decode('UTF-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 11: invalid start byte
BE_BEF BE_BEF
BE_BEF_IMP_0 BE_BEF_IMP_0
BE_BEF_IMP_1 BE_BEF_IMP_1
BE_BEF_IMP_6 BE_BEF_IMP_6
BE_BEF_IMP_7 BE_BEF_IMP_7
BE_BEF_IMP_8 BE_BEF_IMP_8
BE_BEF_IMP_K BE_BEF_IMP_K
BE_BEF_IMP_T BE_BEF_IMP_T
BE_BEF_IMP_В
Process finished with exit code 1
A mojibake case. Your example НјБ_ФШРУ_Г99 ==> ЭМС_диаг_У99 could be accomplished as:
'НјБ_ФШРУ_Г99'.encode('cp1251').decode('iso-8859-5')
# 'ЭМС_диаг_У99'
or (alternatively) as
'НјБ_ФШРУ_Г99'.encode('ptcp154').decode('iso-8859-5')
# 'ЭМС_диаг_У99'
Your failing example (… can't decode byte 0xb2 in position 11):
'BE_BEF_IMP_В'.encode('iso-8859-5')
# b'BE_BEF_IMP_\xb2'
is solved using the same mechanism:
'BE_BEF_IMP_В'.encode('cp1251').decode('iso-8859-5')
# 'BE_BEF_IMP_Т'

'charmap' codec can't decode byte 0x9d

i'm making a program in python that convert a the input json file in file xml, it work good for the first 560 files but than this happened:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1871: character maps to <undefined>
this is my code:
# -*- coding: utf-8 -*-
##IMPORT
import codecs
import string
import sys
from src.json2xml import Json2xml
import unicodedata
import os
##FUNCTION
def fn_conversione(f):
data = Json2xml.fromjsonfile('json//' + f).data
data_object = Json2xml(data)
output = data_object.json2xml() #xml output
return (output)
def fn_letturaFile():
filenamelist = []
path = './json'
for filename in os.listdir(path):
filenamelist.append(filename)
return (filenamelist)
def fn_createXml(filenamejson, content):
path = './xml'
filenamexml = filenamejson.replace('.json', '.xml')
f = open("./xml/" + filenamexml, "w+", encoding="utf8")
f.write(content)
f.close()
return("scritto")
nomeFile = fn_letturaFile()
for i in range (len(nomeFile)):
contenuto = fn_conversione(nomeFile[i])
if contenuto != None:
fn_createXml(nomeFile[i], contenuto)
print(i, "/", len(nomeFile))
print(i, "/", len(nomeFile))

convert pdf to text file in python

My code works perfectly for some pdf, but some show error:
Traceback (most recent call last):
File "con.py", line 24, in <module>
print getPDFContent("abc.pdf")
File "con.py", line 17, in getPDFContent
f.write(a)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u02dd' in position 64: ordinal not in range(128)
My code is
import pyPdf
def getPDFContent(path):
content = ""
pdf = pyPdf.PdfFileReader(file(path, "rb"))
for i in range(0, pdf.getNumPages()):
f=open("xxx.txt",'a')
content= pdf.getPage(i).extractText() + "\n"
import string
c=content.split()
for a in c:
f.write(" ")
f.write(a)
f.write('\n')
f.close()
return content
print getPDFContent("abc.pdf")
Your problem is that when you call f.write() with a string, it is trying to encode it using the ascii codec. Your pdf contains characters that can not be represented by the ascii codec. Try explicitly encoding your str, e.g.
a = a.encode('utf-8')
f.write(a)
Try
import sys
print getPDFContent("abc.pdf").encode(sys.getfilesystemencoding())

Python JSON to CSV - bad encoding, UnicodeDecodeError: 'charmap' codec can't decode byte

I have a problem converting nested JSON to CSV. For this i use https://github.com/vinay20045/json-to-csv (forked a bit to support python 3.4), here is full json-to-csv.py file.
Converting is working, if i set
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('utf8','ignore')
and
fp = open(json_file_path, 'r', encoding='utf-8')
but when i import csv to MS Excel i see bad cyrillic characters, for example \xe0\xf1 , english text is ok.
Experimented with setting encode('cp1251','ignore') but then i got an error
UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to (as here UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y: character maps to <undefined>)
import sys
import json
import csv
##
# This function converts an item like
# {
# "item_1":"value_11",
# "item_2":"value_12",
# "item_3":"value_13",
# "item_4":["sub_value_14", "sub_value_15"],
# "item_5":{
# "sub_item_1":"sub_item_value_11",
# "sub_item_2":["sub_item_value_12", "sub_item_value_13"]
# }
# }
# To
# {
# "node_item_1":"value_11",
# "node_item_2":"value_12",
# "node_item_3":"value_13",
# "node_item_4_0":"sub_value_14",
# "node_item_4_1":"sub_value_15",
# "node_item_5_sub_item_1":"sub_item_value_11",
# "node_item_5_sub_item_2_0":"sub_item_value_12",
# "node_item_5_sub_item_2_0":"sub_item_value_13"
# }
##
def reduce_item(key, value):
global reduced_item
#Reduction Condition 1
if type(value) is list:
i=0
for sub_item in value:
reduce_item(key+'_'+str(i), sub_item)
i=i+1
#Reduction Condition 2
elif type(value) is dict:
sub_keys = value.keys()
for sub_key in sub_keys:
reduce_item(key+'_'+str(sub_key), value[sub_key])
#Base Condition
else:
reduced_item[str(key)] = (str(value)).encode('cp1251','ignore')
if __name__ == "__main__":
if len(sys.argv) != 4:
print("\nUsage: python json_to_csv.py <node_name> <json_in_file_path> <csv_out_file_path>\n")
else:
#Reading arguments
node = sys.argv[1]
json_file_path = sys.argv[2]
csv_file_path = sys.argv[3]
fp = open(json_file_path, 'r', encoding='cp1251')
json_value = fp.read()
raw_data = json.loads(json_value)
processed_data = []
header = []
for item in raw_data[node]:
reduced_item = {}
reduce_item(node, item)
header += reduced_item.keys()
processed_data.append(reduced_item)
header = list(set(header))
header.sort()
with open(csv_file_path, 'wt+') as f:#wb+ for python 2.7
writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL, delimiter=',')
writer.writeheader()
for row in processed_data:
writer.writerow(row)
print("Just completed writing csv file with %d columns" % len(header))
How to convert cyrillic correctly and also i want to skip bad characters?
You need to know cyrylic encoding of which file are you going to open.
For example that is enough in python3:
with open(args.input_file, 'r', encoding="cp866") as input_file:
data = input_file.read()
structure = json.loads(data)
In python3 data variable is automatically utf-8. In python2 there might be problem with feeding input to json.
Also try to print out in python interpreter line and see if symbols are right. Without input file is hard to tell if everything is right. Also are you sure that it is python, not excel related problem? Did you tried to open in notepad++ or similar encodings respecting editors?
Most important thing working with encodings is cheking that input and output is right. I would suggest to look here.
maybe you could use the chardet to detect the file's encoding.
import chardet
File='arq.GeoJson'
enc=chardet.detect(open(File,'rb').read())['encoding']
with open(File,'r', encoding = enc) as f:
data=json.load(f)
f.close()
This avoids 'to kick' the encoding.

Python UnicodeEncodeError with pre decoded UTF-8

I'm trying to parse through a bunch of logfiles (up to 4GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10, processing has to be done on WindowsXP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw-data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps
to <undefined>
Heres the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
for tfile in parser.salestarfiles:
path = os.path.join(parser.logpath,tfile)
if os.path.isfile(path):
if tarfile.is_tarfile(path):
tar = tarfile.open(path,'r:gz')
for tarMember in tar.getmembers():
if 'salescomponent-soap.log' in tarMember.name:
tarMemberFile = tar.extractfile(tarMember)
content = tarMemberFile.read().decode('UTF-8','surrogateescape')
for m in parser.soaplogregex.finditer(content):
entry = {}
entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')),int(m.group('hour')), int(m.group('minute')), int(m.group('second')), int(m.group('millis'))*1000)
entry['perfid'] = m.group('perfid')
entry['direction'] = m.group('direction')
entry['payload'] = m.group('payload')
entry['file'] = tarMember.name
entry['source'] = m.group(0)
sm = parser.soaplogmethodregex.match(entry['payload'])
if sm:
entry['method'] = sm.group('method')
if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
if perfid and entry['perfid'] == perfid:
yield entry
tar.members = []
And heres the part where I write the processed information along with the raw data out(its an aggregation of all log-entries for one specific process:
if len(entries) > 0:
time = perfentry['time']
filename = time.isoformat('-').replace(':','').replace('-','') + 'longtail_' + perfentry['perfid'] + '.txt'
oFile = open(os.path.join(parser.logpath,filename), 'w')
oFile.write(perfentry['source'] +'\n')
oFile.write('------\n')
for key in sorted(entries.keys()):
oFile.write('------\n')
oFile.write(entries[key]['source'] + '\n') #<-- here it is failing
What I don't get is why it seems to be correct to read the files in UTF-8, it is not possible to just write them out as UTF-8. What am I doing wrong?
Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data

Categories