convert pdf to text file in python - python

My code works perfectly for some PDFs, but for others it raises an error:
Traceback (most recent call last):
File "con.py", line 24, in <module>
print getPDFContent("abc.pdf")
File "con.py", line 17, in getPDFContent
f.write(a)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u02dd' in position 64: ordinal not in range(128)
My code is
import pyPdf
def getPDFContent(path):
    """Extract the text of a PDF and append its words to "xxx.txt".

    For each page, the extracted text is split on whitespace and every
    token is written to "xxx.txt" preceded by a space, followed by one
    newline per page.

    Returns the value of `content` after the loop: the text of the last
    page processed, or "" when the PDF has no pages, because `content`
    is reassigned (not accumulated) on every iteration.

    NOTE(review): the indentation of this listing was lost and has been
    reconstructed; confirm that close() and return sit where intended.
    """
    content = ""
    pdf = pyPdf.PdfFileReader(file(path, "rb"))
    for i in range(0, pdf.getNumPages()):
        # The output file is reopened in append mode for every page.
        f=open("xxx.txt",'a')
        content= pdf.getPage(i).extractText() + "\n"
        import string  # imported here but not referenced in this function
        c=content.split()
        for a in c:
            f.write(" ")
            # This is the call named in the traceback: `a` contains a
            # non-ASCII character (u'\u02dd') and write() tries to encode
            # it with the 'ascii' codec.
            f.write(a)
        f.write('\n')
        f.close()
    return content
print getPDFContent("abc.pdf")

Your problem is that when you call f.write() with a unicode string, Python tries to encode it using the ascii codec. Your PDF contains characters that cannot be represented by the ascii codec. Try explicitly encoding the string before writing it, e.g.
# Convert the token to UTF-8 bytes explicitly so that write() receives bytes
# and does not attempt an implicit 'ascii' encode.
a = a.encode('utf-8')
f.write(a)

Try
import sys
# Encode the returned text explicitly before printing, using the encoding
# reported by sys.getfilesystemencoding().
# NOTE(review): that is the encoding used for file names; whether it matches
# the console's encoding is not shown here — confirm before relying on it.
print getPDFContent("abc.pdf").encode(sys.getfilesystemencoding())

Related

Python emoji to bytes write to txt file

I get this error
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f525' in position 0: character maps to <undefined>
I would like to write, for example, "🔥" to a txt file, and what ends up in the file should be the escape sequence \U0001f525.
Here's my code
# `config` is defined outside this snippet; the f-string formats
# config['emoji'] as text.
test1 = f"{config['emoji']}"
# No encoding is passed to open(), so the platform default codec is used;
# the error quoted above comes from a 'charmap' codec.
with open('emoji.txt', 'w') as f:
    f.write(test1)
test1 = "🔥"

# Round-trip through UTF-16 with 'surrogatepass' so that a surrogate pair
# (two \udXXX code units) is merged into the single code point it stands for.
merged = test1.encode('utf-16', 'surrogatepass').decode('utf-16')

# 'raw_unicode_escape' turns every code point above U+00FF into a literal
# \uXXXX / \UXXXXXXXX escape; decoding the resulting bytes as latin_1 gives
# back a str that contains those escapes as ordinary characters.
transformed = merged.encode("raw_unicode_escape").decode("latin_1")

# Name the file encoding explicitly: relying on the platform default is what
# produced the 'charmap' error in the first place.
with open('emoji.txt', 'w', encoding='utf-8') as f:
    f.write(transformed)
Adapted from this answer

UnicodeEncodeError: 'latin-1' codec can't encode character u'\u2019' in position 4: ordinal not in range(256)

I am using eyeD3 to edit metadata of mp3 files. I am unable to set lyrics tag.
def fetch_lyrics(title, artist):
    """Fetch the lyrics of a song from makeitpersonal.co.

    Builds the query URL from `artist` and `title`, replaces spaces with
    "%20", prints the URL and issues an HTTP GET through `requests`.

    Returns 0 when the response body equals the service's "no lyrics"
    message, otherwise returns `response.content` unchanged.
    """
    URL='http://makeitpersonal.co/lyrics?artist=%s&title=%s'
    # Only spaces are escaped; every other character is inserted as-is.
    webaddr=(URL %(artist, title)).replace(" ", "%20")
    print webaddr
    response = requests.get(webaddr)
    if response.content=="Sorry, We don't have lyrics for this song yet.":
        return 0
    else:
        return response.content
def get_lyrics(pattern, path=os.getcwd()):
    """Print the lyrics stored in an mp3's tag, or fetch and store them.

    Looks up files with `find(pattern, path)` (defined outside this
    snippet). When exactly one file matches, its tag is read: existing
    lyrics are printed; otherwise lyrics are fetched with fetch_lyrics(),
    added to the tag and the tag is saved. Returns None.

    NOTE(review): the default for `path` is evaluated once, when the
    function is defined, not on each call.
    NOTE(review): indentation was lost in this listing and has been
    reconstructed; as reconstructed, nothing happens when the number of
    matches is not 1 — confirm.
    """
    files=find(pattern, path)
    matches = len(files)
    if matches==1:
        tag = eyeD3.Tag()
        tag.link(files[0])
        lyrics = tag.getLyrics()
        if lyrics:
            for l in lyrics:
                print l.lyrics
        else:
            print "Lyrics not found. Searching online..."
            tag = eyeD3.Tag()
            tag.link(files[0])
            artist = tag.getArtist()
            title = tag.getTitle()
            l = fetch_lyrics(title, artist)
            # fetch_lyrics() returns 0 when the service has no lyrics.
            if l==0:
                print "No matches found."
            else:
                #print l
                # The response bytes are decoded as UTF-8 before being
                # stored in the tag.
                tag.addLyrics(l.decode('utf-8'))
                # The traceback quoted below is raised from this call.
                tag.update()
The traceback that I got is:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "lyrics.py", line 99, in get_lyrics
tag.update()
File "/usr/lib/python2.7/dist-packages/eyeD3/tag.py", line 526, in update
self.__saveV2Tag(version);
File "/usr/lib/python2.7/dist-packages/eyeD3/tag.py", line 1251, in __saveV2Tag
raw_frame = f.render();
File "/usr/lib/python2.7/dist-packages/eyeD3/frames.py", line 1200, in render
self.lyrics.encode(id3EncodingToString(self.encoding))
UnicodeEncodeError: 'latin-1' codec can't encode character u'\u2019' in position 4: ordinal not in range(256)
I don't understand the error. Do I need to pass any other parameter to the update() or addLyrics() functions? Any help would be appreciated.
I imagine you're trying to write an ID3v1 (or single-byte ID3v2) tag, which only permits latin-1.
I think I had to patch my eyeD3 once to fix that problem. Try turning ID3v1 off and setting ID3v2 to v2.4 with UTF-8.
Ideally: catch the error, turn off ID3v1, and retry. The specific problem is that the ’ quote (u'\u2019') is a multi-byte character that latin-1 cannot represent.

Python using json to read a string with emoticons

I have a giant .json file.
I'm reading it with:
# Parse the whole JSON document from 'file.json'.
json_data=open('file.json')
data = json.load(json_data)
# Print three fields of every action listed under data['payload']['actions'].
for item in data['payload']['actions']:
    print item['author']
    print item['action_id']
    # The traceback quoted below is raised by this print.
    print item['body']
json_data.close()
Eventually one of the item['body'] values contains this string (the escapes are actually Facebook emoticons):
words words stuff stuff\ud83c\udf89\ud83c\udf8a\ud83c\udf87\ud83c\udf86\ud83c\udf08\ud83d\udca5\u2728\ud83d\udcab\ud83d\udc45\ud83d\udeb9\ud83d\udeba\ud83d\udc83\ud83d\ude4c\ud83c\udfc3\ud83d\udc6c
which makes it give this error:
Traceback (most recent call last):
File "curse.py", line 15, in <module>
print item['body']
File "C:\python27\lib\encodings\cp437.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_map)
UnicodeEncodeError: 'charmap' codec can't encode characters in position 35-63: character maps to <undefined>
Is there a way to make it ignore these?
You can use string.printable
import string
try:
    print item['body']
except UnicodeEncodeError:
    # Fallback: keep only the characters found in string.printable and
    # print the result; every other character is dropped.
    print(''.join(c for c in item['body'] if c in string.printable))

Python UnicodeEncodeError with pre decoded UTF-8

I'm trying to parse through a bunch of logfiles (up to 4GiB) in a tar.gz file. The source files come from RedHat 5.8 Server systems and SunOS 5.10, processing has to be done on WindowsXP.
I iterate through the tar.gz files, read the files, decode the file contents to UTF-8 and parse them with regular expressions before further processing.
When I'm writing out the processed data along with the raw-data that was read from the tar.gz, I get the following error:
Traceback (most recent call last):
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 375, in <module>
p.analyze_longtails()
File "C:\WoMMaxX\lt_automation\Tools\LogParser.py", line 196, in analyze_longtails
oFile.write(entries[key]['source'] + '\n')
File "C:\Python\3.2\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 24835-24836: character maps to <undefined>
Here's the part where I read and parse the logfiles:
def getSalesSoaplogEntries(perfid=None):
    """Yield parsed log entries from the sales tar.gz archives.

    Iterates over parser.salestarfiles, opens each existing tar file as
    gzip, and scans every member whose name contains
    'salescomponent-soap.log' with parser.soaplogregex. Each match becomes
    a dict with the keys 'time', 'perfid', 'direction', 'payload', 'file',
    'source' and, when parser.soaplogmethodregex matches the payload,
    'method'.

    An entry is yielded only when its time lies within
    [parser.starttime, parser.endtime] and `perfid` is truthy and equal
    to the entry's perfid; with the default perfid=None nothing is
    yielded.

    NOTE(review): indentation was lost in this listing and has been
    reconstructed; the nesting below is inferred — confirm.
    """
    for tfile in parser.salestarfiles:
        path = os.path.join(parser.logpath,tfile)
        if os.path.isfile(path):
            if tarfile.is_tarfile(path):
                tar = tarfile.open(path,'r:gz')
                for tarMember in tar.getmembers():
                    if 'salescomponent-soap.log' in tarMember.name:
                        tarMemberFile = tar.extractfile(tarMember)
                        # 'surrogateescape' maps undecodable bytes to lone
                        # surrogate code points instead of raising; such
                        # characters can only be encoded again with the same
                        # error handler.
                        content = tarMemberFile.read().decode('UTF-8','surrogateescape')
                        for m in parser.soaplogregex.finditer(content):
                            entry = {}
                            # The year is taken from the current date; the
                            # 'millis' group is scaled to microseconds.
                            entry['time'] = datetime(datetime.now().year, int(m.group('month')), int(m.group('day')),int(m.group('hour')), int(m.group('minute')), int(m.group('second')), int(m.group('millis'))*1000)
                            entry['perfid'] = m.group('perfid')
                            entry['direction'] = m.group('direction')
                            entry['payload'] = m.group('payload')
                            entry['file'] = tarMember.name
                            # Full text of the regex match; this is the value
                            # later written out as the raw data.
                            entry['source'] = m.group(0)
                            sm = parser.soaplogmethodregex.match(entry['payload'])
                            if sm:
                                entry['method'] = sm.group('method')
                            if entry['time'] >= parser.starttime and entry['time'] <= parser.endtime:
                                if perfid and entry['perfid'] == perfid:
                                    yield entry
                    # NOTE(review): placement inside the member loop is a
                    # guess; presumably this drops tarfile's cached member
                    # list to limit memory use — confirm.
                    tar.members = []
And here's the part where I write the processed information out along with the raw data (it's an aggregation of all log entries for one specific process):
# Write one text file per process: the process's own source text first, then
# the source text of every aggregated entry in sorted key order.
if len(entries) > 0:
    time = perfentry['time']
    # File name: ISO timestamp with ':' and '-' removed, followed by
    # 'longtail_', the perfid and '.txt'.
    filename = time.isoformat('-').replace(':','').replace('-','') + 'longtail_' + perfentry['perfid'] + '.txt'
    # Opened in text mode with no explicit encoding; the traceback above shows
    # the write going through the cp1252 codec.
    # NOTE(review): no close() call is visible in this snippet.
    oFile = open(os.path.join(parser.logpath,filename), 'w')
    oFile.write(perfentry['source'] +'\n')
    oFile.write('------\n')
    for key in sorted(entries.keys()):
        oFile.write('------\n')
        oFile.write(entries[key]['source'] + '\n') #<-- here it is failing
What I don't get is this: if it is correct to read the files as UTF-8, why is it not possible to simply write them out as UTF-8 again? What am I doing wrong?
Your output file is using the default encoding for your OS, which is not UTF-8. Use codecs.open instead of open and specify encoding='utf-8'.
# codecs.open returns a file object that encodes written text as UTF-8
# instead of the platform default codec.
# NOTE(review): text decoded with 'surrogateescape' can still contain lone
# surrogates, which strict UTF-8 encoding rejects; passing
# errors='surrogateescape' here may also be needed — confirm against the data.
oFile = codecs.open(os.path.join(parser.logpath,filename), 'w', encoding='utf-8')
See http://docs.python.org/howto/unicode.html#reading-and-writing-unicode-data

python: unicode problem

I am trying to decode a string I took from file:
# The file is opened without any decoding, so readlines() returns raw byte
# strings split at every "\n" byte.
file = open ("./Downloads/lamp-post.csv", 'r')
data = file.readlines()
# Show the first raw line (output follows).
data[0]
'\xff\xfeK\x00e\x00y\x00w\x00o\x00r\x00d\x00\t\x00C\x00o\x00m\x00p\x00e\x00t\x00i\x00t\x00i\x00o\x00n\x00\t\x00G\x00l\x00o\x00b\x00a\x00l\x00
\x00M\x00o\x00n\x00t\x00h\x00l\x00y\x00
\x00S\x00e\x00a\x00r\x00c\x00h\x00e\x00s\x00\t\x00D\x00e\x00c\x00
\x002\x000\x001\x000\x00\t\x00N\x00o\x00v\x00
\x002\x000\x001\x000\x00\t\x00O\x00c\x00t\x00
\x002\x000\x001\x000\x00\t\x00S\x00e\x00p\x00
\x002\x000\x001\x000\x00\t\x00A\x00u\x00g\x00
\x002\x000\x001\x000\x00\t\x00J\x00u\x00l\x00
\x002\x000\x001\x000\x00\t\x00J\x00u\x00n\x00
\x002\x000\x001\x000\x00\t\x00M\x00a\x00y\x00
\x002\x000\x001\x000\x00\t\x00A\x00p\x00r\x00
\x002\x000\x001\x000\x00\t\x00M\x00a\x00r\x00
\x002\x000\x001\x000\x00\t\x00F\x00e\x00b\x00
\x002\x000\x001\x000\x00\t\x00J\x00a\x00n\x00
\x002\x000\x001\x000\x00\t\x00A\x00d\x00
\x00s\x00h\x00a\x00r\x00e\x00\t\x00S\x00e\x00a\x00r\x00c\x00h\x00
\x00s\x00h\x00a\x00r\x00e\x00\t\x00E\x00s\x00t\x00i\x00m\x00a\x00t\x00e\x00d\x00
\x00A\x00v\x00g\x00.\x00
\x00C\x00P\x00C\x00\t\x00E\x00x\x00t\x00r\x00a\x00c\x00t\x00e\x00d\x00
\x00F\x00r\x00o\x00m\x00
\x00W\x00e\x00b\x00
\x00P\x00a\x00g\x00e\x00\t\x00L\x00o\x00c\x00a\x00l\x00
\x00M\x00o\x00n\x00t\x00h\x00l\x00y\x00
\x00S\x00e\x00a\x00r\x00c\x00h\x00e\x00s\x00\n'
Adding "ignore" does not really help:
In [69]: data[2]
Out[69]: u'\u6700\u6100\u7200\u6400\u6500\u6e00\u2000\u6c00\u6100\u6d00\u7000\u2000\u7000\u6f00\u7300\u7400\u0900\u3000\u2e00\u3900\u3400\u0900\u3800\u3800\u3000\u0900\u2d00\u0900\u3300\u3200\u3000\u0900\u3300\u3900\u3000\u0900\u3300\u3900\u3000\u0900\u3400\u3800\u3000\u0900\u3500\u3900\u3000\u0900\u3500\u3900\u3000\u0900\u3700\u3200\u3000\u0900\u3700\u3200\u3000\u0900\u3300\u3900\u3000\u0900\u3300\u3200\u3000\u0900\u3200\u3600\u3000\u0900\u2d00\u0900\u2d00\u0900\ua300\u3200\u2e00\u3100\u3800\u0900\u2d00\u0900\u3400\u3800\u3000\u0a00'
In [70]: data[2].decode("utf-8", "replace")
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
/Users/oleg/<ipython console> in <module>()
/opt/local/lib/python2.5/encodings/utf_8.py in decode(input, errors)
     14
     15 def decode(input, errors='strict'):
---> 16     return codecs.utf_8_decode(input, errors, True)
     17
     18 class IncrementalEncoder(codecs.IncrementalEncoder):
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-87: ordinal not in range(128)
In [71]:
This looks like UTF-16 data. So try
# Strip the trailing "\n" byte (the first half of the UTF-16 line break) so
# that only whole two-byte units remain, then decode the line as UTF-16.
data[0].rstrip("\n").decode("utf-16")
Edit (for your update): Try to decode the whole file at once, that is
# Read the complete file contents in one piece, without splitting into lines
# first, and decode everything as UTF-16 in a single call.
data = open(...).read()
data.decode("utf-16")
The problem is that the line breaks in UTF-16 are "\n\x00", but using readlines() will split at the "\n", leaving the "\x00" character for the next line.
This file is a UTF-16-LE encoded file, with an initial BOM.
import codecs
# Let codecs.open decode the file as UTF-16 while reading, so readlines()
# returns already-decoded lines.
fp= codecs.open("a", "r", "utf-16")
lines= fp.readlines()
EDIT
Since you posted 2.7 this is the 2.7 solution:
import io

# Decode while reading instead of decoding each raw line: splitting the bytes
# at "\n" cuts the two-byte UTF-16 line break in half, leaving a stray "\x00"
# at the start of the next line and corrupting every line after the first.
# io.open is available on Python 2.6+ and handles the BOM for "utf-16".
# Undecodable input is replaced with U+FFFD.
with io.open("./Downloads/lamp-post.csv", "r", encoding="utf-16", errors="replace") as file:
    data = file.readlines()
Ignoring undecodeable characters:
import io

# Decode while reading instead of decoding each raw line: splitting the bytes
# at "\n" cuts the two-byte UTF-16 line break in half, leaving a stray "\x00"
# at the start of the next line and corrupting every line after the first.
# io.open is available on Python 2.6+ and handles the BOM for "utf-16".
# Undecodable input is silently dropped.
with io.open("./Downloads/lamp-post.csv", "r", encoding="utf-16", errors="ignore") as file:
    data = file.readlines()

Categories