get LookupError: unknown encoding: ascii - python

I can almost get wolframalpha to run, however I am stuck on this error:
i am working on this since 2 days and now everything is working but this error coming
texts = texts.encode('ascii’, ‘ignore')
LookupError: unknown encoding: ascii’, ‘ignore
my code is
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import wolframalpha
import sys
app_id='PR5756-H3EP749GGH'
client = wolframalpha.Client(app_id)
query = ' '.join(sys.argv[1:])
res = client.query(query)
if len(res.pods) > 0:
texts = ""
pod = res.pods[1]
if pod.text:
texts = pod.text
else:
texts = "I have no answer for that"
# to skip ascii character in case of error
texts = texts.encode('ascii’, ‘ignore')
print ('texts')
please help

Your quote characters are not matching, you are using two different type of quotes ’ and '.
texts = texts.encode('ascii', 'ignore')

Related

Saving text with Polish characters (utf-8) to a textfile from JSON in Python

I am trying to save a conversation from Messenger to a textfile, including things like timestamps and senders.
In the JSON file downloaded from Messenger, the emojis and Polish characters are displayed as UTF-8 in literal (e.g. "ą" as \xc4\x85).
After executing this program:
import json
from datetime import datetime
messages = []
jsonfiles = ["message_1.json","message_2.json","message_3.json","message_4.json","message_5.json", "message_6.json","message_7.json","message_8.json","message_9.json","message_10.json","message_11.json"]
def filldict(textfile,jsonfile):
with open(textfile,"a", encoding="utf-8") as w:
with open(jsonfile, "r", encoding="utf-8") as j:
data = json.load(j)
i = 0
while i<len(data["messages"]):
message = {}
if "content" in data["messages"][len(data["messages"])-1-i]:
stamp = int(data["messages"][len(data["messages"])-1-i]["timestamp_ms"])
date = datetime.fromtimestamp(stamp/1000)
message['timestamp']=stamp
message['date']=date
w.write(str(date))
w.write(" ")
w.write(data["messages"][len(data["messages"])-1-i]["sender_name"])
message['sender']=data["messages"][len(data["messages"])-1-i]["sender_name"]
w.write(": ")
if "content" in str(data["messages"][len(data["messages"])-1-i]):
w.write(data["messages"][len(data["messages"])-1-i]["content"])
message['content']=data["messages"][len(data["messages"])-1-i]["content"]
w.write("\n")
i +=1
messages.append(message)
message = {}
j = len(jsonfiles)
while j>0:
filldict("messages11.txt", jsonfiles[j-1])
j-=1
print("process finished")
the output textfile contains those utf-8 literals instead of the characters which they represent. What can I do in order to fix it and display the Polish characters (and, if that's even possible, emojis) in the textfile? I thought that including " encoding = 'utf-8' " would be enough. Thank you for any clues.

pycharm console unicode to readable string

studying python with this tutorial
The problem is when i trying to get cyrillic characters i get unicode in pycharm console.
import requests
from bs4 import BeautifulSoup
import operator
import codecs
def start(url):
word_list = []
source_code = requests.get(url).text
soup = BeautifulSoup(source_code)
for post_text in soup.findAll('a', {'class': 'b-tasks__item__title js-set-visited'}):
content = post_text.string
words = content.lower().split()
for each_word in words:
word_list.append(each_word)
clean_up_list(word_list)
def clean_up_list(word_list):
clean_word_list = []
for word in word_list:
symbols = "!##$%^&*()_+{}|:<>?,./;'[]\=-\""
for i in range(0, len(symbols)):
word = word.replace(symbols[i], "")
if len(word) > 0:
clean_word_list.append(word)
create_dictionary(clean_word_list)
def create_dictionary(clean_word_list):
word_count = {}
for word in clean_word_list:
if word in word_count:
word_count[word] += 1
else:
word_count[word] = 1
for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
print(key, value)
When i am changing print(key, value) to print(key.decode('utf8'), value) i am getting "UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-7: ordinal not in range(128)"
start('https://youdo.com/tasks-all-opened-all-moscow-1')
There is some suggestion on the internet about changing encoding in some files - don't really get it. Can't i read it in console?
OSX
UPD
key.encode("utf-8")
UTF-8 is sometimes painful. I created a file with a line in Latin caracters and another one with Russian ones. The following code:
# encoding: utf-8
with open("testing.txt", "r", encoding='utf-8') as f:
line = f.read()
print(line)
outputs in PyCharm
Note the two encoding entries
Since you are getting data from a web page, you must make sure that you use the right encoding as well. The following code
# encoding: utf-8
r = requests.get('http://www.pravda.ru/')
r.encoding = 'utf-8'
print(r.text)
outputs in PyCharm as
Please note that you must specifically set the encoding to match the one of the page.

python: batch string en- / decoding

i have following problem:
i wrote a python script and it needs inputparameters to run... but if the parameters include one of our german "umlaute" like äüö or ß the script stops with following error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xfc in position
8: ordinal not in range(128)
and if i start the script with a batchfile, the "umlaute" are replaced with random chars like ?, some other variation of the ö....
pls help me.. thx :)
part of the code:
...
if batch_exe:
try:
aIndex = sys.argv.index("-a")
buchungsart_regEx = sys.argv[aIndex+1]
except:
buchungsart_regEx = ""
else:
...
select_stmt = select_stmt + " AND REGEXP_LIKE (BUCHUNGSART, " + "'" + buchungsart_regEx + "')"
...
db_list = sde_conn.execute(select_stmt)
...
and the cmdinput is something like:
python C:\...\Script.py -i ..... -a äöüß
Check this answer: https://stackoverflow.com/a/846931/1686094
You can use his sys.argv = win32_unicode_argv()
And maybe you can then encode your sys.argv with utf-8 for future use.
You could try adding the type of encoding at the top of your script:
# -*- coding: utf-8 -*-

How to fix this UnicodeDecodeError that appears when I try to remove accents in Python strings?

I'm trying to use this function:
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
in the code below (which unzips and reads files with non-ASCII strings). But I'm getting this error, (from this library file C:\Python27\Lib\encodings\utf_8.py):
Message File Name Line Position
Traceback
<module> C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 64
getNameList C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 26
remove_accents C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 17
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 3: ordinal not in range(128)
Why am I getting this error? How to avoid it and make remove_accents work?
Thanks for any help!
Here's the entire code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import re
from zipfile import ZipFile
import csv
##def strip_accents(s):
## return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
import unicodedata
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getNameList():
namesDict=extractNamesDict()
maleNames=list()
femaleNames=list()
for name in namesDict:
print name
# name = strip_accents(name)
name = remove_accents(name)
counts=namesDict[name]
tuple=(name,counts[0],counts[1])
if counts[0]>counts[1]:
maleNames.append(tuple)
elif counts[1]>counts[0]:
femaleNames.append(tuple)
names=(maleNames,femaleNames)
# print maleNames
return names
def extractNamesDict():
zf=ZipFile('names.zip', 'r')
filenames=zf.namelist()
names=dict()
genderMap={'M':0,'F':1}
for filename in filenames:
file=zf.open(filename,'r')
rows=csv.reader(file, delimiter=',')
for row in rows:
#name=row[0].upper().decode('latin1')
name=row[0].upper()
gender=genderMap[row[1]]
count=int(row[2])
if not names.has_key(name):
names[name]=[0,0]
names[name][gender]=names[name][gender]+count
file.close()
# print '\tImported %s'%filename
# print names
return names
if __name__ == "__main__":
getNameList()
Best practice is to decode to Unicode when the data comes into your program:
for row in rows:
name=row[0].upper().decode('utf8') # or whatever...you DO need to know the encoding.
Then remove_accents can just be:
def remove_accents(input_str):
nkfd_form = unicodedata.normalize('NFKD', input_str)
return u''.join(c for c in nkfd_form if not unicodedata.combining(c))
Encode data when leaving your program such as writing to a file, database, terminal, etc.
Why remove accents in the first place?
If you want to robustly convert unicode characters to ascii in a string, you should use the awesome unidecode module:
>>> import unidecode
>>> unidecode.unidecode(u'Björk')
'Bjork'
>>> unidecode.unidecode(u'András Sütő')
'Andras Suto'
>>> unidecode.unidecode(u'Ελλάς')
'Ellas'
You get it because you are decoding from a bytestring without specifying a codec:
unicode(input_str)
Add a codec there (here I assume your data is encoded in utf-8, 0xe1 would be the first of a 3-byte character):
unicode(input_str, 'utf8')

reading # char in python

can someone help with me reading "#" char in python? i can't seem to get the file. because this is an output from the stanford postagger, is there any scripts available to convert the stanford postagger http://nlp.stanford.edu/software/tagger.shtml file to cwb. http://cogsci.uni-osnabrueck.de/~korpora/ws/CWBdoc/CWB_Encoding_Tutorial/node3.html
so this is the utf-8 txt file that i'm trying to read:
如果#CS 您#PN 在#P 新加坡#NR 只#AD 能#VV 前往#VV 一#CD 间#M 俱乐部#NN ,#PU 祖卡#NN 酒吧#NN 必然#AD 是#VC 您#PN 的#DEG 不二#JJ 选择#NN 。#PU
作为#P 或许#AD 是#VC 新加坡#NR 唯一#JJ 一#CD 家#M 国际#NN 知名#VA 的#DEC 夜店#NN ,#PU 祖卡#NN 既#CC 是#VC 一#CD 个#M 公共#JJ 机构#NN ,#PU
So with this code i'm not readin the # char in the utf-8 txt files:
#!/usr/bin/python # -*- coding: utf-8 -*-
'''
stanford POS tagger to CWB format
'''
import codecs
import nltk
import os, sys, re, glob
reload(sys)
sys.setdefaultencoding('utf-8')
cwd = './path/to/file.txt' #os.getcwd()
for infile in glob.glob(os.path.join(cwd, 'zouk.txt')):
print infile
(PATH, FILENAME) = os.path.split(infile)
reader = codecs.open(infile, 'r', 'utf-8')
for line in reader:
for word in line:
if word == '\#':
print 'hex is here'
if word == '\#':
This probably doesn't do what you think it does. (Hint: print "\#")
If Python does not recognize an escape sequence then it will include the backslash in the string.
>>> '\#' == '\\#'
True

Categories