python: unicode problem - python

I am trying to decode a string I took from file:
file = open ("./Downloads/lamp-post.csv", 'r')
data = file.readlines()
data[0]
'\xff\xfeK\x00e\x00y\x00w\x00o\x00r\x00d\x00\t\x00C\x00o\x00m\x00p\x00e\x00t\x00i\x00t\x00i\x00o\x00n\x00\t\x00G\x00l\x00o\x00b\x00a\x00l\x00
\x00M\x00o\x00n\x00t\x00h\x00l\x00y\x00
\x00S\x00e\x00a\x00r\x00c\x00h\x00e\x00s\x00\t\x00D\x00e\x00c\x00
\x002\x000\x001\x000\x00\t\x00N\x00o\x00v\x00
\x002\x000\x001\x000\x00\t\x00O\x00c\x00t\x00
\x002\x000\x001\x000\x00\t\x00S\x00e\x00p\x00
\x002\x000\x001\x000\x00\t\x00A\x00u\x00g\x00
\x002\x000\x001\x000\x00\t\x00J\x00u\x00l\x00
\x002\x000\x001\x000\x00\t\x00J\x00u\x00n\x00
\x002\x000\x001\x000\x00\t\x00M\x00a\x00y\x00
\x002\x000\x001\x000\x00\t\x00A\x00p\x00r\x00
\x002\x000\x001\x000\x00\t\x00M\x00a\x00r\x00
\x002\x000\x001\x000\x00\t\x00F\x00e\x00b\x00
\x002\x000\x001\x000\x00\t\x00J\x00a\x00n\x00
\x002\x000\x001\x000\x00\t\x00A\x00d\x00
\x00s\x00h\x00a\x00r\x00e\x00\t\x00S\x00e\x00a\x00r\x00c\x00h\x00
\x00s\x00h\x00a\x00r\x00e\x00\t\x00E\x00s\x00t\x00i\x00m\x00a\x00t\x00e\x00d\x00
\x00A\x00v\x00g\x00.\x00
\x00C\x00P\x00C\x00\t\x00E\x00x\x00t\x00r\x00a\x00c\x00t\x00e\x00d\x00
\x00F\x00r\x00o\x00m\x00
\x00W\x00e\x00b\x00
\x00P\x00a\x00g\x00e\x00\t\x00L\x00o\x00c\x00a\x00l\x00
\x00M\x00o\x00n\x00t\x00h\x00l\x00y\x00
\x00S\x00e\x00a\x00r\x00c\x00h\x00e\x00s\x00\n'
Adding ignore do not really help...:
In [69]: data[2]
Out[69]: u'\u6700\u6100\u7200\u6400\u6500\u6e00\u2000\u6c00\u6100\u6d00\u7000\u2000\u7000\u6f00\u7300\u7400\u0900\u3000\u2e00\u3900\u3400\u0900\u3800\u3800\u3000\u0900\u2d00\u0900\u3300\u3200\u3000\u0900\u3300\u3900\u3000\u0900\u3300\u3900\u3000\u0900\u3400\u3800\u3000\u0900\u3500\u3900\u3000\u0900\u3500\u3900\u3000\u0900\u3700\u3200\u3000\u0900\u3700\u3200\u3000\u0900\u3300\u3900\u3000\u0900\u3300\u3200\u3000\u0900\u3200\u3600\u3000\u0900\u2d00\u0900\u2d00\u0900\ua300\u3200\u2e00\u3100\u3800\u0900\u2d00\u0900\u3400\u3800\u3000\u0a00'
In [70]: data[2].decode("utf-8",
"replace")
---------------------------------------------------------------------------
Traceback (most recent call last)
/Users/oleg/ in
()
/opt/local/lib/python2.5/encodings/utf_8.py
in decode(input, errors)
14
15 def decode(input, errors='strict'):
---> 16 return codecs.utf_8_decode(input, errors,
True)
17
18 class IncrementalEncoder(codecs.IncrementalEncoder):
:
'ascii' codec can't encode characters
in position 0-87: ordinal not in
range(128)
In [71]:

This looks like UTF-16 data. So try
data[0].rstrip("\n").decode("utf-16")
Edit (for your update): Try to decode the whole file at once, that is
data = open(...).read()
data.decode("utf-16")
The problem is that the line breaks in UTF-16 are "\n\x00", but using readlines() will split at the "\n", leaving the "\x00" character for the next line.

This file is a UTF-16-LE encoded file, with an initial BOM.
import codecs
fp= codecs.open("a", "r", "utf-16")
lines= fp.readlines()

EDIT
Since you posted 2.7 this is the 2.7 solution:
file = open("./Downloads/lamp-post.csv", "r")
data = [line.decode("utf-16", "replace") for line in file]
Ignoring undecodeable characters:
file = open("./Downloads/lamp-post.csv", "r")
data = [line.decode("utf-16", "ignore") for line in file]

Related

Python emoji to bytes write to txt file

I get this error
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f525' in position 0: character maps to
I would like to write for example "🔥" to a txt file and it should be \U0001f525 written in the txt file
Here's my code
test1 = f"{config['emoji']}"
with open('emoji.txt', 'w') as f:
f.write(test1)
test1 = "🔥"
with open('emoji.txt', 'w') as f:
transformed = (test1
.encode('utf-16', 'surrogatepass')\
.decode('utf-16')\
.encode("raw_unicode_escape")\
.decode("latin_1"))
f.write(transformed)
Adapted from this answer

Combining multiple txt files (Python 3, UnicodeDecodeError)

Below codes were used in Python 2 to combine all txt files in a folder. It worked fine.
import os
base_folder = "C:\\FDD\\"
all_files = []
for each in os.listdir(base_folder):
if each.endswith('.txt'):
kk = os.path.join(base_folder, each)
all_files.append(kk)
with open(base_folder + "Combined.txt", 'w') as outfile:
for fname in all_files:
with open(fname) as infile:
for line in infile:
outfile.write(line)
When in Python 3, it gives an error:
Traceback (most recent call last):
File "C:\Scripts\thescript.py", line 26, in <module>
for line in infile:
File "C:\Users\User\AppData\Local\Programs\Python\Python37-32\lib\codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'CP_UTF8' codec can't decode byte 0xe4 in position 53: No mapping for the Unicode character exists in the target code page.
I made this change:
with open(fname) as infile:
to
with open(fname, 'r', encoding = 'latin-1') as infile:
It gives me “MemoryError”.
How can I correct this error in Python 3? Thank you.
As #transilvlad suggested here, use the open method from the codecs module to read in the file:
import codecs
with codecs.open(fname, 'r', encoding = 'utf-8',
errors='ignore') as infile:
This will strip out (ignore) the characters in the error returning the string without them.

UnicodeEncodeError: 'cp950' codec can't encode character '\u810f' in position 67: illegal multibyte sequence

I scraped a web page, they contain some articles in Traditional Chinese, Simplified Chinese and English. There's no problem to save them in data and print, but when I tried to write them into my folder, it went error. I tried different ways to encoding them as UTF-8 in open but still not working. By the way, I compiled them on Anaconda's Jupyter.
Code:
for urls in all:
re=requests.get(urls)
soup=BeautifulSoup(re.text.encode('utf-8'), "html.parser")
title_tag = soup.select_one('.mms_article_title')
#print(title_tag.text)
list=[]
for tag in soup.select('.mms_article_content'):
list.append(tag.text)
list=([c.replace('\n', '') for c in list])
list=([c.replace('\r', '') for c in list])
list=([c.replace('\t', '') for c in list])
list=([c.replace(u'\xa0', u' ') for c in list])
list= (', '.join(list))
data={
"Title" : title_tag.text,
"Article": list
}
save_path= 'C:/json_n/'
file_name=save_path+'%s.json' % title_tag.text
with open(file_name, 'w') as f:
print(file_name)
file = json.dumps(data,ensure_ascii=False)
f.write(file)
I have 1700 files and it only prints 2 file_name. It also saves these 2 files in the folder "json_n", but only the first json file successfully saved data, the second one was empty since its data was in simplified Chinese, it could not write.
C:/json_n/肝動脈栓塞術.json
C:/json_n/心臟電氣生理學檢查注意事項(簡體中文).json
Error:
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-39-e73321a3e622> in <module>()
21 print(file_name)
22 file = json.dumps(data,ensure_ascii=False)
---> 23 f.write(file)
UnicodeEncodeError: 'cp950' codec can't encode character '\u810f' in position 67: illegal multibyte sequence
When I set encoding in open:
with open(file_name, 'w', encoding="utf-8") as f:
It still prints out 2 file_name, and the second one is still empty.
C:/json_n/肝動脈栓塞術.json
C:/json_n/心臟電氣生理學檢查注意事項(簡體中文).json
Error:
OSError Traceback (most recent call last)
<ipython-input-44-256bcf14fcbe> in <module>()
18 save_path= 'C:/json_n/'
19 file_name=save_path+'%s.json' % title_tag.text
---> 20 with open(file_name, 'w', encoding="utf-8") as f:
21 print(file_name)
22 file = json.dumps(data,ensure_ascii=False)
OSError: [Errno 22] Invalid argument: 'C:/json_n/如何使用胰島素空針抽取短效型胰島素?.json'

convert pdf to text file in python

My code works perfectly for some pdf, but some show error:
Traceback (most recent call last):
File "con.py", line 24, in <module>
print getPDFContent("abc.pdf")
File "con.py", line 17, in getPDFContent
f.write(a)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u02dd' in position 64: ordinal not in range(128)
My code is
import pyPdf
def getPDFContent(path):
content = ""
pdf = pyPdf.PdfFileReader(file(path, "rb"))
for i in range(0, pdf.getNumPages()):
f=open("xxx.txt",'a')
content= pdf.getPage(i).extractText() + "\n"
import string
c=content.split()
for a in c:
f.write(" ")
f.write(a)
f.write('\n')
f.close()
return content
print getPDFContent("abc.pdf")
Your problem is that when you call f.write() with a string, it is trying to encode it using the ascii codec. Your pdf contains characters that can not be represented by the ascii codec. Try explicitly encoding your str, e.g.
a = a.encode('utf-8')
f.write(a)
Try
import sys
print getPDFContent("abc.pdf").encode(sys.getfilesystemencoding())

Python UnicodeEncodeError with pre decoded UTF-8

I'm trying to parse through a bunch of logfiles (up to 4GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10, processing has to be done on WindowsXP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw-data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps
to <undefined>
Heres the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
for tfile in parser.salestarfiles:
path = os.path.join(parser.logpath,tfile)
if os.path.isfile(path):
if tarfile.is_tarfile(path):
tar = tarfile.open(path,'r:gz')
for tarMember in tar.getmembers():
if 'salescomponent-soap.log' in tarMember.name:
tarMemberFile = tar.extractfile(tarMember)
content = tarMemberFile.read().decode('UTF-8','surrogateescape')
for m in parser.soaplogregex.finditer(content):
entry = {}
entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')),int(m.group('hour')), int(m.group('minute')), int(m.group('second')), int(m.group('millis'))*1000)
entry['perfid'] = m.group('perfid')
entry['direction'] = m.group('direction')
entry['payload'] = m.group('payload')
entry['file'] = tarMember.name
entry['source'] = m.group(0)
sm = parser.soaplogmethodregex.match(entry['payload'])
if sm:
entry['method'] = sm.group('method')
if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
if perfid and entry['perfid'] == perfid:
yield entry
tar.members = []
And heres the part where I write the processed information along with the raw data out(its an aggregation of all log-entries for one specific process:
if len(entries) > 0:
time = perfentry['time']
filename = time.isoformat('-').replace(':','').replace('-','') + 'longtail_' + perfentry['perfid'] + '.txt'
oFile = open(os.path.join(parser.logpath,filename), 'w')
oFile.write(perfentry['source'] +'\n')
oFile.write('------\n')
for key in sorted(entries.keys()):
oFile.write('------\n')
oFile.write(entries[key]['source'] + '\n') #<-- here it is failing
What I don't get is why it seems to be correct to read the files in UTF-8, it is not possible to just write them out as UTF-8. What am I doing wrong?
Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data

Categories