Cyrillic symbols decode in numpy array - python

I need to draw a pie plot whose labels are the Cyrillic strings stored in df.index:
plt.pie(df['reg_created'], labels = df.index)
However, it raises this error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
df.index:
Index([u'Бизнес', u'Вечеринки', u'Выставки', u'Гражданские проекты',
u'Для детей', u'Другие развлечения', u'Другие события', u'Еда',
u'ИТ и интернет', u'Иностранные языки', u'Интеллектуальные игры',
u'Искусство и культура', u'Кино', u'Концерты', u'Красота и здоровье',
u'Наука', u'Образование за рубежом', u'Психология и самопознание',
u'Спорт', u'Театры', u'Хобби и творчество', u'Экскурсии и путешествия'],
dtype='object', name=u'name')
The labels parameter of matplotlib.pyplot.pie should be a list, so if I try:
df.index.tolist()
returns:
['\xd0\x91\xd0\xb8\xd0\xb7\xd0\xbd\xd0\xb5\xd1\x81', '\xd0\x92\xd0\xb5\xd1\x87\xd0\xb5\xd1\x80\xd0\xb8\xd0\xbd\xd0\xba\xd0\xb8', '\xd0\x92\xd1\x8b\xd1\x81\xd1\x82\xd0\xb0\xd0\xb2\xd0\xba\xd0\xb8', '\xd0\x93\xd1\x80\xd0\xb0\xd0\xb6\xd0\xb4\xd0\xb0\xd0\xbd\xd1\x81\xd0\xba\xd0\xb8\xd0\xb5 \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82\xd1\x8b', '\xd0\x94\xd0\xbb\xd1\x8f \xd0\xb4\xd0\xb5\xd1\x82\xd0\xb5\xd0\xb9', '\xd0\x94\xd1\x80\xd1\x83\xd0\xb3\xd0\xb8\xd0\xb5 \xd1\x80\xd0\xb0\xd0\xb7\xd0\xb2\xd0\xbb\xd0\xb5\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f', '\xd0\x94\xd1\x80\xd1\x83\xd0\xb3\xd0\xb8\xd0\xb5 \xd1\x81\xd0\xbe\xd0\xb1\xd1\x8b\xd1\x82\xd0\xb8\xd1\x8f', '\xd0\x95\xd0\xb4\xd0\xb0', '\xd0\x98\xd0\xa2 \xd0\xb8 \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd1\x80\xd0\xbd\xd0\xb5\xd1\x82', '\xd0\x98\xd0\xbd\xd0\xbe\xd1\x81\xd1\x82\xd1\x80\xd0\xb0\xd0\xbd\xd0\xbd\xd1\x8b\xd0\xb5 \xd1\x8f\xd0\xb7\xd1\x8b\xd0\xba\xd0\xb8', '\xd0\x98\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0\xb5\xd0\xba\xd1\x82\xd1\x83\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd1\x8b\xd0\xb5 \xd0\xb8\xd0\xb3\xd1\x80\xd1\x8b', '\xd0\x98\xd1\x81\xd0\xba\xd1\x83\xd1\x81\xd1\x81\xd1\x82\xd0\xb2\xd0\xbe \xd0\xb8 \xd0\xba\xd1\x83\xd0\xbb\xd1\x8c\xd1\x82\xd1\x83\xd1\x80\xd0\xb0', '\xd0\x9a\xd0\xb8\xd0\xbd\xd0\xbe', '\xd0\x9a\xd0\xbe\xd0\xbd\xd1\x86\xd0\xb5\xd1\x80\xd1\x82\xd1\x8b', '\xd0\x9a\xd1\x80\xd0\xb0\xd1\x81\xd0\xbe\xd1\x82\xd0\xb0 \xd0\xb8 \xd0\xb7\xd0\xb4\xd0\xbe\xd1\x80\xd0\xbe\xd0\xb2\xd1\x8c\xd0\xb5', '\xd0\x9d\xd0\xb0\xd1\x83\xd0\xba\xd0\xb0', '\xd0\x9e\xd0\xb1\xd1\x80\xd0\xb0\xd0\xb7\xd0\xbe\xd0\xb2\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb5 \xd0\xb7\xd0\xb0 \xd1\x80\xd1\x83\xd0\xb1\xd0\xb5\xd0\xb6\xd0\xbe\xd0\xbc', '\xd0\x9f\xd1\x81\xd0\xb8\xd1\x85\xd0\xbe\xd0\xbb\xd0\xbe\xd0\xb3\xd0\xb8\xd1\x8f \xd0\xb8 \xd1\x81\xd0\xb0\xd0\xbc\xd0\xbe\xd0\xbf\xd0\xbe\xd0\xb7\xd0\xbd\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb5', '\xd0\xa1\xd0\xbf\xd0\xbe\xd1\x80\xd1\x82', '\xd0\xa2\xd0\xb5\xd0\xb0\xd1\x82\xd1\x80\xd1\x8b', 
'\xd0\xa5\xd0\xbe\xd0\xb1\xd0\xb1\xd0\xb8 \xd0\xb8 \xd1\x82\xd0\xb2\xd0\xbe\xd1\x80\xd1\x87\xd0\xb5\xd1\x81\xd1\x82\xd0\xb2\xd0\xbe', '\xd0\xad\xd0\xba\xd1\x81\xd0\xba\xd1\x83\xd1\x80\xd1\x81\xd0\xb8\xd0\xb8 \xd0\xb8 \xd0\xbf\xd1\x83\xd1\x82\xd0\xb5\xd1\x88\xd0\xb5\xd1\x81\xd1\x82\xd0\xb2\xd0\xb8\xd1\x8f']
if I print list by element:
for i in df.index.tolist():
print i
returns Cyrillic text
Бизнес
Вечеринки
Выставки
Гражданские проекты
...
Why does printing the whole list look different from printing the same list element by element?
And what should I pass as the pyplot.pie labels parameter to get Cyrillic labels?

The answer is in the error message: the characters are being decoded as ASCII rather than as UTF-8.
https://stackoverflow.com/a/10406161
https://stackoverflow.com/a/36454865

Related

Optimize byte array escaping performance python

I need to perform custom escaping over a byte array in python. However, during escaping python converts bytes to integers, making performance optimization very difficult. How can I speed up my escaping function?
# Maps each byte value that must be escaped to the two byte values that
# replace it: a backslash (0x5C) followed by a marker character.
ESCAPE_DICT = {
    0x00: [0x5C, 0x7A],  # null        -> \z  0x5c 0x7a
    0x22: [0x5C, 0x71],  # "           -> \q  0x5c 0x71
    0x3B: [0x5C, 0x73],  # ;           -> \s  0x5c 0x73
    0x5C: [0x5C, 0x5C],  # \           -> \\  0x5c 0x5c
    0x0A: [0x5C, 0x6E],  # line-feed   -> \n  0x5c 0x6e
    0x0C: [0x5C, 0x66],  # form-feed   -> \f  0x5c 0x66
    0x0D: [0x5C, 0x63],  # carr-return -> \c  0x5c 0x63
}


def escape(string: bytes):
    """Return a copy of ``string`` with every byte listed in ESCAPE_DICT escaped.

    Bytes that have no entry in ESCAPE_DICT are copied through unchanged;
    bytes that do are replaced by their two-byte escape sequence.

    Args:
        string: The raw bytes to escape.

    Returns:
        A new ``bytes`` object holding the escaped data.
    """
    escaped_list = []
    # Iterating over a bytes object yields ints, which is what the
    # int-keyed ESCAPE_DICT expects.
    for curr_byte in string:
        replacement = ESCAPE_DICT.get(curr_byte)
        if replacement is None:
            # Don't escape current byte
            escaped_list.append(curr_byte)
        else:
            # Escape current byte
            escaped_list.extend(replacement)
    # Build the result from the list that was actually filled; the previous
    # code referenced an undefined name here and raised NameError.
    return bytes(escaped_list)
import re
# Every byte that needs escaping, mapped to its backslash-prefixed replacement.
ESCAPE_DICT = {
    b'\x00': rb'\z',  # null
    b'"': rb'\q',
    b';': rb'\s',
    b'\\': rb'\\',
    b'\n': rb'\n',  # linefeed
    b'\f': rb'\f',  # formfeed
    b'\r': rb'\c',  # carriage return
}

# Regex character class that lists each escapable byte as a \xNN escape.
_HEX_ESCAPES = [r'\x' + key.hex() for key in ESCAPE_DICT]
ESCAPE_CLASS = '[' + ''.join(_HEX_ESCAPES) + ']'
ESCAPE_REGEX = re.compile(ESCAPE_CLASS.encode())


def _replacement_for(match):
    """Return the escape sequence for the single byte that ``match`` found."""
    return ESCAPE_DICT[match.group(0)]


def escape(string: bytes) -> bytes:
    """Return ``string`` with every byte in ESCAPE_DICT replaced by its escape."""
    return ESCAPE_REGEX.sub(_replacement_for, string)
x = b'"abc\ndef\rpqr\x00stu\\xyz"'
y = escape(x)
from pprint import pprint
pprint(ESCAPE_CLASS)
pprint(ESCAPE_REGEX)
pprint(x)
pprint(y)
# =>
# '[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]'
# re.compile(b'[\\x00\\x22\\x3b\\x5c\\x0a\\x0c\\x0d]')
# b'"abc\ndef\rpqr\x00stu\\xyz"'
# b'\\qabc\\ndef\\cpqr\\zstu\\\\xyz\\q'
You can read the rb prefix as “raw bytes”.
Your escapes are a bit strange, though. E.g., the carriage return is normally \r, not \c, and \s normally stands for generic whitespace.

function write in python for a json file

I'm a beginner in python so I have this program where it classifies tweets into different categories (sport,sante, culture...) using keywords and I would like to copy-paste every line of the JSON file that belongs to a certain category into a file named text1
and I did the following :
but I guess I did it the wrong way, since I keep receiving the same error.
Any suggestions on how to solve this problem would be appreciated!
import json
import mysql.connector
'''
python -m pip install unicode
'''
c = 0
n = 0
sportcount=0
religcount=0
santecount=0
educcount=0
cultcount=0
socicount=0
policount=0
covid = ['كوفيد','MEDECIN','كورونا','CORONA', 'COVID','VACCIN', 'PANDEMIE', 'CONFINEMENT', 'PANDEMIE', 'CHU', 'GEL','ﻣﺎﺳﻚ' ,'CHINA','ANTIVIRALES','LAVAGE DE MAINS','VIRUS','اﻟﺤﺮاﻙ','اﻟﺤﺠﺮ','CHLOROQUINE','FATIGUE','كماما','STAYSAFE','EPIDEMIE','STAYHOME','منظمة الصحة',' pas de prière']
sport=['مولودية','WORKOUT','بايرن','ارسنال','ليفربول','منتخب','تتويج','ميسي','PSG','FIFA', 'FOOT','FEKIR', 'BOUGER', 'DANSER', 'STADE','بونجاح','JOUEUR', 'COMPETION', 'SPORT', 'SALLE', 'SPONSOR', 'PISCINE', 'PUMA', 'GYM', 'TEN', 'MATCH', 'CHAMPION', 'BASKET', 'NOVAK', 'DJOKOVIC', 'MESSI', 'OLYMPIQUE']
religion=['ALLAH','المساجد','مصل','HAMDULILAH','المسلم','فتوى','لله','EID','الله','MOSQ','دين']
sante=['controle','إصاب','OXYGENE','بوناطيرو','حالات','مؤكد','IMMUNIT','CAS','صح','DOCTEUR','مخبر','حصيلة','صحة','أطباء','تسجل','FATIG','مستشف','HOPITAUX','سعال','لقاح','SOUCHES','MALADE','حصيلة','FUMEURS', 'DIABETE', 'EPIDEMIE', 'DEPISTAGE', 'SOIGNANT', 'INJECTION','GEL','SANTE', 'FIEVRE', 'KAWASAKI', 'RESPIRATOIRE', 'PATIENT', 'TEST', 'TRAITEMENT','فحص','كماما', 'CHU','منظمة الصحة', 'MEDECINE', 'POSITIF', 'PHARMACE', 'INFECTES', 'IMMUN', 'VACCIN', 'PFIZER', 'PCR', 'PANDÉM', 'PANADEMI', 'ÉPIDÉMI', 'EPIDEMIC', 'MASQUE', 'BAVETTE', 'MASK', 'MÉDICAL', 'MEDICAL', 'HÔPITAL', 'HOSPITAL', 'INFECT', 'TRANSMISSION', 'SURVIVANT', 'SURVIVORS', 'DIAGNOSTIC', 'DIAGNOSIS', 'SANTÉ', 'HEALTH', 'MÉDECIN', 'DOCTOR', 'MÉDICAMENT', 'MEDICIN', 'AMBULANCE', 'DÉPISTAGE', 'DEPISTAGE', 'STATISTI', 'MALAD', 'SICK', 'CONFIN', 'PROPAGATION', 'PRÉVENTION', 'PREVENTION', 'CONTAGION', 'SYMPT', 'MESUR', 'MEASUR', 'MICROB', 'WASH', 'ISOL']
education=['سنة','collaboration','bac','EDUC','ÉCOLE','PROF','بتدائي','تعليم','أساتذة','دراس','طلبة']
culture=['LIVRE', 'BOOK', 'SHOP', 'FILM', 'MOVIE', 'MUSIC', 'TV', 'VOYAGE', 'CINEMA', 'ART', 'BLOG', 'SONG']
social=['الشباب','TWITTER','تغريد','متابعة','SOCI','NETFLIX','YOUTUBE','JOURNAL','solde','liquidationy']
politique=['manifestation','AFFAIRE', 'PUBLIQUE', 'AMBASSADE', 'CIRCONSPECTION', 'CIVI', 'COMBINAISON', 'DÉMAGOGIE', 'DÉMOCRATIE', 'DIPLOMATE', 'ÉCONOMIE', 'ÉTAT', 'FÉDÉRALISME', 'GOUVERNEMENT', 'LEGATION', 'MACHIAVÉLIQUE', 'MACHIAVÉLISME', 'MANŒUVRIER', 'NÉGOCI', 'POLICE', 'POUVOIR', 'PRUDE', 'PUBLIC', 'STRATÉGIE', 'TRACTATION', 'RÉPUBLIQUE', 'SONDAGE', 'OPINION', 'PARLEMENT', 'CITO', 'DÉPUTÉ', 'DIRIGEANT', 'MAIRE', 'MINIST', 'SECRÉTAIRE', 'SÉNATEUR', 'CONSEILLER', 'MAIRE', 'COMMUNES', 'MEMBRE', 'DU', 'PARLEMENT', 'CONGRÈS', 'SÉNAT', 'PROTESTATION', 'PROCURATION', 'POUVOIR', 'FRAUDE','النفط','وزير','HOLLANDE','خدعة','تبون','الغلق','BORIS JOHNSON','وزار','رئيس','DROITS','مؤامرة','والي','TRUMP','GOUVERNEMEN', 'POLITI', 'OUYAHIA', 'ERAK', 'IRAK', 'REINE', 'MACRON', 'MINIST', 'AMBASSAD', 'MANIFEST', 'PRESIDENT', 'SELLAL', 'NATIONAL', 'مظاهر','MILITAIRE', 'DICTATEUR','اﻟﺤﺮاﻙ']
# Read the tweets line by line and copy every covid-related line that matches
# a category into the result file.
#
# Both files are opened in a single ``with`` statement so they are closed even
# if a line fails to parse. The output encoding is given explicitly: without
# it Python falls back to the platform default (cp1252 in the traceback this
# script produced), which cannot represent the Arabic text and raises
# UnicodeEncodeError on write.
#
# NOTE(review): ``text`` is upper-cased before matching, so lower-case entries
# in the keyword lists can never match -- confirm whether that is intended.
with open('tweet.json', encoding="utf-16") as myJsonFile, \
        open('texte.txt', 'w', encoding='utf-8') as resultat:
    for line in myJsonFile:
        data = json.loads(line)
        c += 1
        text = data['raw_text'].upper()

        # NOTE(review): assumes only covid-related tweets are categorised
        # (the original indentation was lost) -- this matches the
        # "non classifies" total computed from ``n`` below.
        if not any(ele in text for ele in covid):
            continue
        n += 1

        # Categories are tested in priority order; the first match wins.
        if any(ele in text for ele in sport):
            data["cat"] = 'sport'
            sportcount += 1
        elif any(ele in text for ele in sante):
            data["cat"] = 'sante'
            santecount += 1
        elif any(ele in text for ele in politique):
            data["cat"] = 'politique'
            policount += 1
        elif any(ele in text for ele in culture):
            data["cat"] = 'culture'
            cultcount += 1
        elif any(ele in text for ele in religion):
            data["cat"] = 'religion'
            religcount += 1
        elif any(ele in text for ele in education):
            data["cat"] = 'education'
            educcount += 1
        elif any(ele in text for ele in social):
            data["cat"] = 'social'
            socicount += 1
        else:
            # Covid-related but not matched by any category.
            print(" the tweet---------------------------------------------------------------------------------------")
            continue

        # The original JSON line is what gets written; the "cat" key added to
        # ``data`` is only visible in the printed dict.
        resultat.write(line)
        print(data)

print("a partir de",c," le nombre de tweets concernant le covid sont ",n)
print("sport ",sportcount," sante ",santecount," politique ",policount," culture ",cultcount," religion ",religcount," education ",educcount,"social",socicount)
print("les tweets non classifies",n-(educcount+religcount+cultcount+policount+santecount+sportcount+socicount))
error:
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/NIHAD/PycharmProjects/pythonProject3/classification.py", line 51, in <module>
resultat.write(line)
File "C:\Users\NIHAD\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 451-454: character maps to <undefined>
This might be a very simple case of fixing the encoding.
Your error says:
UnicodeEncodeError: 'charmap' codec can't encode characters in position 451-454: character maps to <undefined>
If it does not have to be encoded in utf-16, try opening the json file like:
myJsonFile = open('tweet.json', encoding="utf-8")
and specifying the encoding in the result file:
resultat = open(file, 'w', encoding='utf-8')
You might get lucky and that will fix it!
If it doesn't fix it, the answers to the following question go deeper into diagnosing the problem and potential solutions:
UnicodeEncodeError: 'charmap' codec can't encode characters

OEM non printable characters in Python strings

I´m trying to port some Delphi code that sends data to a Universe database. In order to make the text legible by the DB we need to encode it in OEM.
In Delphi is done this way:
procedure TForm1.GenerarTablasNLS;
var
i: integer;
begin
for i := 0 to 255 do
begin
TablaUV_NLS[i] := AnsiChar(i);
TablaNLS_UV[i] := AnsiChar(i);
end;
// Nulo final
TablaUV_NLS[256] := #0;
TablaNLS_UV[256] := #0;
OemToCharA(#TablaUV_NLS[1], #TablaUV_NLS[1]);
CharToOemA(#TablaNLS_UV[1], #TablaNLS_UV[1]);
And then we translate our text simply like this
// Translates Value one character at a time through the TablaNLS_UV lookup
// table and returns the result as an AnsiString of the same length.
function StringToUniverse(const Value: string): AnsiString;
var
p: PChar;
q: PAnsiChar;
begin
// Size the result up front: one output character per input character.
SetLength(Result, Length(Value));
// Nothing to translate for an empty string.
if Value = '' then Exit;
// p walks the input, q walks the output.
p := Pointer(Value);
q := Pointer(Result);
// Stop when a #0 character is reached in the input.
while p^ <> #0 do
begin
// Narrow the character to AnsiChar and use its ordinal as the table index.
q^ := TablaNLS_UV[Ord(AnsiChar(p^))];
Inc(p);
Inc(q);
end;
end;
I follow the same logic in Python using a dictionary that stores each character translation
class StringUniverseDict(dict):
    """Translation table that maps any character without an entry to itself."""

    def __missing__(self, key):
        # Unmapped characters pass through unchanged; nothing is stored.
        return key


TablaString2UV = StringUniverseDict()


def rellenar_tablas_codificacion():
    """Fill TablaString2UV with the character substitutions for Universe.

    Each trailing comment is kept from the original table and reads
    ``chr(source) = chr(target)``.

    NOTE(review): the non-ASCII target literals are presumably chosen so that
    a later cp1252 encode yields the target byte -- confirm against the caller.
    """
    # Written as an escape: the original was a bare whitespace literal, which
    # cannot be told apart from an ordinary space.
    TablaString2UV['á'] = '\xa0'  # chr(225) = chr(160)
    TablaString2UV['é'] = '‚'  # chr(233) = chr(130)
    TablaString2UV['í'] = '¡'  # chr(237) = chr(161)
    TablaString2UV['ó'] = '¢'  # chr(243) = chr(162)
    TablaString2UV['ú'] = '£'  # chr(250) = chr(163)
    TablaString2UV['ñ'] = '¤'  # chr(241) = chr(164)
    TablaString2UV['ç'] = '‡'  # chr(231) = chr(135)
    TablaString2UV['Á'] = 'µ'  # chr(193) = chr(181)
    TablaString2UV['É'] = chr(144)  # chr(201) = chr(144)
    TablaString2UV['Í'] = 'Ö'  # chr(205) = chr(214)
    TablaString2UV['Ó'] = 'à'  # chr(211) = chr(224)
    TablaString2UV['Ñ'] = '¥'  # chr(209) = chr(165)
    TablaString2UV['Ç'] = '€'  # chr(199) = chr(128)
    TablaString2UV['ü'] = chr(129)  # chr(252) = chr(129)
    TablaString2UV[chr(129)] = '_'  # chr(129) = chr(95)
    TablaString2UV[chr(141)] = '_'  # chr(141) = chr(95)
    # Integer literals with leading zeros (007) are a SyntaxError in Python 3.
    TablaString2UV['•'] = chr(7)  # chr(149) = chr(7)
    TablaString2UV['Å'] = chr(143)  # chr(197) = chr(143)
    TablaString2UV['Ø'] = chr(157)  # chr(216) = chr(157)
    TablaString2UV['ì'] = chr(141)  # chr(236) = chr(141)
This works "fine" as long as I translate using printable characters. For example, the string
"á é í ó ú ñ ç Á Í Ó Ú Ñ Ç"
is translated, in Delphi, to the following bytes:
0xa0 0x20 0x82 0x20 0xa1 0x20 0xa2 0x20 0xa3 0x20 0xa4 0x20 0x87 0x20 0xb5 0x20 0xd6 0x20 0xe0 0x20 0xe9 0x20 0xa5 0x20 0x80 0xfe 0x73 0x64 0x73
(á translates to ' ', which is chr(160) or 0xA0 in hexa. é is '‚' or chr(130), 0x82 in hexa, í is '¡', char(161) or 0xA1 in hexa and so on)
In Python, when I try to encode this to OEM I do the following:
def convertir_string_a_universe(cadena_python):
    """Translate every character of ``cadena_python`` through TablaString2UV.

    Returns the translated characters joined into a single string.
    """
    traducidas = (TablaString2UV[caracter] for caracter in cadena_python)
    return ''.join(traducidas)
And then, to get the bytes
txt_registro = convertir_string_a_universe(txt_orig)
datos = bytes(txt_registro, 'cp1252')
With this I get the following bytes:
b'\xa0 \x82 \xa1 \xa2 \xa3 \xa4 \x87 \xb5 \xd6 \xe0 \xe9 \xa5 \x80 \x9a'
My problem is that this OEM encoding uses non-printable characters, like in 'É' = chr(144) (0x90 in hexa). If I try to call bytes(txt_registro, 'cp1252') with an array where I hava translated 'É' into chr(0x90) I get this error:
caracteres_mal = 'Éü'
txt_registro = convertir_string_a_universe(txt_orig)
datos = bytes(txt_registro, 'cp1252')
File "C:\Users\Hector\PyCharmProjects\pyuniverse\pyuniverse\UniverseRegister.py", line 138, in reconstruir_registro_universe
datos = bytes(txt_registro, 'cp1252')
File "C:\Users\Hector\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character '\x90' in position 0: character maps to <undefined>
How can I do this OEM encoding without raising this UnicodeEncodeError?
This is because cp1252 does not know about chr(0x90). If you try with utf-8 instead, it will work.
>>> chr(0x90).encode("utf8")
b'\xc2\x90'
I don't understand why you are trying to convert to cp1252 though: you have applied a custom conversion map and then, with bytes(txt_registro, 'cp1252'), you are converting your result again to cp1252.
I think what you are looking for is something like:
datos = bytes(txt_orig, 'uv')
where uv is your custom codec.
So you would have to write an encoder and a decoder for it (which is basically what you have done already). Take a look at https://docs.python.org/3/library/codecs.html#codecs.register
to register a new codec. The function you register with it should return a CodecInfo object, which is described earlier on that documentation page.
import codecs
def buscar_a_uv(codec):
    """Codec search function: return the "uv" CodecInfo, or None for other names."""
    if codec != "uv":
        # Not our codec.
        return None
    return codecs.CodecInfo(
        convertir_string_a_universe,
        convertir_universe_a_string,
        name="uv",
    )
codecs.register(buscar_a_uv)
datos = bytes(txt_orig, 'uv')
EDIT
The encoder/decoder functions should return bytes, so you would need to update convertir_string_a_universe a bit.

How to change the "gbk" into "uft-8"?

I tried to input and output Russian language in my file but failed it keeps displaying something like \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82' when I run it in Terminal inside of python with print \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82'
print "Привет!"
a = raw_input()
print "Как у тебя дела сегодня?"
a_1 = raw_input()
print "Понятно.Тогда у тебя есть планы вечром?"
a_2 = raw_input()
print "Пока."
a_3 = raw_input()
print "Давой завтра!"
print "Бывают люди бледные бывают тусклые бывают блестящие...Она только сказала \"%r\" \"%r\" \"%r\" \"%r\"... " %(
a,a_1,a_2,a_3)
In Terminal:
MacBook-Pro:mystuff admin$ python ex11.py
Привет!
Привет
Как у тебя дела сегодня?
Нормально
Понятно Тогда у тебя есть планы вечром?
Да Я буду позвонить с другой
Пока
Пока
Давой завтра
Бывают люди бледные бывают тусклые бывают блестящие...Она только сказала "'\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82'" "'\xd0\x9d\xd0\xbe\xd1\x80\xd0\xbc\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd0\xbe'" "'\xd0\x94\xd0\xb0 \xd0\xaf \xd0\xb1\xd1\x83\xd0\xb4\xd1\x83 \xd0\xbf\xd0\xbe\xd0\xb7\xd0\xb2\xd0\xbe\xd0\xbd\xd0\xb8\xd1\x82\xd1\x8c \xd1\x81 \xd0\x98\xd0\xbb\xd1\x8c\xd0\xbe\xd0\xb9'" "'\xd0\x9f\xd0\xbe\xd0\xba\xd0\xb0'"...
You should use %s instead of %r.
What %r does is call the object's __repr__() method and insert the result. You entered the string Привет, and in Python 2 the __repr__() of a byte string shows every non-ASCII byte as a \x escape, which is why you see the raw UTF-8 bytes of the text instead of the letters.
If you use %s, the object's __str__() method is used instead, and the string is inserted as-is, so it is displayed correctly.
Or you could use
print "Бывают люди бледные бывают тусклые бывают блестящие...Она только сказала \"{}\" \"{}\" \"{}\" \"{}\"... ".format(a,a_1,a_2,a_3)
try %s instead of %r:
\"%s\" \"%s\" \"%s\" \"%s\"... " %(a,a_1,a_2,a_3)

UnicodeWarning: special characters in Tkinter

I have written a program in Tkinter (Python 2.7), a scrabblehelper in Norwegian which contains some special characters (æøå), which means my wordlist (ordliste) contains words with special characters.
When I run my function finnord(c*), it returns 'cd'. I am using an entry.get() to get the word to put in my function.
My problem is with the encoding of entry.get(). I have local coding UTF-8, but I get an UniCodeError when I am writing any special characters in my entrybox and matching them to my wordliste.
Here is my output.
Warning (from warnings module):
File "C:\pythonprog\scrabble\feud.py", line 46
if s not in liste and s in ordliste:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode -
interpreting them as being unequal
When i write in my shell:
> ordinn.get()
u'k\xf8**e'
> ordinn.get().encode('utf-8')
'k\xc3\xb8**e'
> print ordinn.get()
kø**e
> print ordinn.get().encode('utf-8')
kø**e
Anyone knows why I can't match ordinn.get() (entry) to my wordlist ?
I can reproduce the error this way:
% python
Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> 'k\xf8**e' in [u'k\xf8**e']
__main__:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
False
So perhaps s is a str object, and liste or ordliste contains unicode, or (as eryksun points out in the comments) vice versa. The solution is to decode the str objects (most likely with the utf-8 codec) to make them unicode.
If that does not help, please print out and post the output of
print(repr(s))
print(repr(liste))
print(repr(ordliste))
I believe the problem can be avoided by converting all strings to unicode.
When you generate ordliste from norsk.txt, use
codecs.open('norsk.txt','r','utf-8'):
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r','utf-8') as fil:
ordliste = [line.rstrip(u'\n') for line in fil]
Convert all user input to unicode as soon as possible:
def get_unicode(widget):
streng = widget.get()
try:
streng = streng.decode('utf-8')
except UnicodeEncodeError:
pass
return streng
So perhaps try this:
import Tkinter as tk
import tkMessageBox
import codecs
import itertools
import sys
alfabetet = (u"abcdefghijklmnopqrstuvwxyz"
u"\N{LATIN SMALL LETTER AE}"
u"\N{LATIN SMALL LETTER O WITH STROKE}"
u"\N{LATIN SMALL LETTER A WITH RING ABOVE}")
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r',encoding) as fil:
ordliste = set(line.rstrip(u'\n') for line in fil)
def get_unicode(widget):
    """Return ``widget.get()``, decoding it from Latin-1 when it is a ``str``.

    Any value that is not a ``str`` instance is returned untouched.
    """
    verdi = widget.get()
    return verdi.decode('latin-1') if isinstance(verdi, str) else verdi
def siord():
    """Button callback: look up the matching words and show them in a dialog.

    Any exception raised during the lookup is printed and reported to the
    user through an error dialog.
    """
    alfa = lagtabell()
    try:
        ordene = finnord(get_unicode(ordinn), alfa)
        if len(ordene) != 0:
            # Done: The words that fit the pattern
            melding = 'Ordene som passer er:\n' + ordene.encode('utf-8')
            tkMessageBox.showinfo('Ferdig', melding)
        else:
            # There are no words that match
            tkMessageBox.showinfo('Dessverre..', 'Det er ingen ord som passer...')
    except Exception as err:
        # There has been a mistake .. Check your word
        print(repr(err))
        tkMessageBox.showerror('ERROR', 'Det har skjedd en feil.. Sjekk ordet ditt.')
def finnord(streng, alfa):
    """Find the words in ``ordliste`` that ``streng`` can become.

    Each ``*`` in ``streng`` is replaced, left to right, by the letters of
    every permutation of ``alfa`` whose length equals the number of stars.

    Returns the pattern itself followed by every candidate found in
    ``ordliste``, comma-separated and terminated by a full stop.
    """
    antall_ukjente = streng.count(u'*')
    treff = set()
    for bokstaver in itertools.permutations(alfa, antall_ukjente):
        kandidat = streng
        for bokstav in bokstaver:
            # Fill in only the left-most remaining star.
            kandidat = kandidat.replace(u'*', bokstav, 1)
        if kandidat in ordliste:
            treff.add(kandidat)
    deler = [streng]
    deler.extend(treff)
    return u','.join(deler) + u'.'
def lagtabell():
    """Return the letters to try when filling in unknown positions.

    Uses the lower-cased text read from ``bokstinn`` when it is purely
    alphabetic, and the full ``alfabetet`` otherwise.
    """
    tinbox = get_unicode(bokstinn)
    return tinbox.lower() if tinbox.isalpha() else alfabetet
root = tk.Tk()
root.title('FeudHjelper av Martin Skow Røed')
root.geometry('400x250+450+200')
# root.iconbitmap('data/ikon.ico')
skrift1 = tk.Label(root,
text = '''\
Velkommen til FeudHjelper. Skriv inn de bokstavene du har, og erstatt ukjente med *.
F. eks: sl**ge
Det er kun lov til å bruke tre stjerner, altså tre ukjente bokstaver.''',
font = ('Verdana',8), wraplength=350)
skrift1.pack(pady = 5)
ordinn = tk.StringVar(None)
tekstboks = tk.Entry(root, textvariable = ordinn)
tekstboks.pack(pady = 5)
# What letters do you have? Eg "ahneki". Leave blank here if you want all the words.
skrift2 = tk.Label(root, text = '''Hvilke bokstaver har du? F. eks "ahneki". La det være blankt her hvis du vil ha alle ordene.''',
font = ('Verdana',8), wraplength=350)
skrift2.pack(pady = 10)
bokstinn = tk.StringVar(None)
tekstboks2 = tk.Entry(root, textvariable = bokstinn)
tekstboks2.pack()
knapp = tk.Button(text = 'Finn ord!', command = siord)
knapp.pack(pady = 10)
root.mainloop()

Categories