accented characters in a regex with Python - python

This is my code
# -*- coding: utf-8 -*-
import json
import re
with open("/Users/paul/Desktop/file.json") as json_file:
file = json.load(json_file)
print file["desc"]
key="capacità"
result = re.findall("((?:[\S,]+\s+){0,3})"+key+"\s+((?:[\S,]+\s*){0,3})", file["desc"], re.IGNORECASE)
print result
This is the content of the file
{
"desc": "Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
}
My result is []
but what I want is result = "capacità"

You need to treat your string as an Unicode string, like this:
str = u"Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
And as you can see if you print str.encode('utf-8') you'll get:
Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+
The same way you can make your regex string an unicode or raw string with u or r respectively.

You can use this function to display different encodings.
The default encoding on your editor should be UTF-8. Check you settings with sys.getdefaultencoding().
def find_context(word_, n_before, n_after, string_):
# finds the word and n words before and after it
import re
b= '\w+\W+' * n_before
a= '\W+\w+' * n_after
pattern = '(' + b + word_ + a + ')'
return re.search(pattern, string_).groups(1)[0]
s = "Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+"
# find 0 words before and 3 after the word capacità
print(find_context('capacità',0,3,s) )
capacità di 215 litri
print(find_context(' capacit\u00e0',0,3,s) )
capacità di 215 litri

Related

dataframe put must be a unicode string, not 0, how give the string not the dataframe

i try to manipulate some dataframe and i did a function to calculate the distance between 2 cities.
def find_distance(A,B):
key = '0377f0e6b42a47fe9d30a4e9a2b3bb63' # get api key from: https://opencagedata.com
geocoder = OpenCageGeocode(key)
result_A = geocoder.geocode(A)
lat_A = result_A[0]['geometry']['lat']
lng_A = result_A[0]['geometry']['lng']
result_B = geocoder.geocode(B)
lat_B = result_B[0]['geometry']['lat']
lng_B = result_B[0]['geometry']['lng']
return int(geodesic((lat_A,lng_A), (lat_B,lng_B)).kilometers)
this is my dataframe
2 32 Mulhouse 1874.0 2 797 16.8 16,3 € 10.012786
13 13 Saint-Étienne 1994.0 3 005 14.3 13,5 € 8.009882
39 39 Roubaix 2845.0 2 591 17.4 15,0 € 6.830968
27 27 Perpignan 2507.0 3 119 15.1 13,3 € 6.727255
40 40 Tourcoing 3089.0 2 901 17.5 15,3 € 6.327547
25 25 Limoges 2630.0 2 807 14.2 12,5 € 6.030424
20 20 Le Mans 2778.0 3 202 14.4 12,3 € 5.789559
there is my code:
def clean_text(row):
# return the list of decoded cell in the Series instead
return [r.decode('unicode_escape').encode('ascii', 'ignore') for r in row]
def main():
inFile = "prix_m2_france.xlsx" #On ouvre l'excel
inSheetName = "Sheet1" #le nom de l excel
cols = ['Ville', 'Prix_moyen', 'Loyer_moyen'] #Les colomnes
df =(pd.read_excel(inFile, sheet_name = inSheetName))
df[cols] = df[cols].replace({'€': '', ",": ".", " ": "", "\u202f":""}, regex=True)
# df['Prix_moyen'] = df.apply(clean_text)
# df['Loyer_moyen'] = df.apply(clean_text)
df['Prix_moyen'] = df['Prix_moyen'].astype(float)
df['Loyer_moyen'] = df['Loyer_moyen'].astype(float)
# df["Prix_moyen"] += 1
df["revenu"] = (df['Loyer_moyen'] * 12) / (df["Prix_moyen"] * 1.0744) * 100
# df['Ville'].replace({'Le-Havre': 'Le Havre', 'Le-Mans': 'Le Mans'})
df["Ville"] = df['Ville'].replace(['Le-Havre', 'Le-Mans'], ['Le Havre', 'Le Mans'])
df["distance"] = find_distance("Paris", df["Ville"])
df2 = df.sort_values(by = 'revenu', ascending = False)
print(df2.head(90))
main()
df["distance"] = find_distance("Paris", df["Ville"]) fails and give me this error:
opencage.geocoder.InvalidInputError: Input must be a unicode string, not 0 Paris
1 Marseille
2 Lyon
3 T
I imagine it as a loop where i will put the distance between paris and the city but i guess it take all the dataframe on my first value.
Thanks for your help
(Edit, i just pasted a part of my dataframe)
You can try something like :
df["distance"] = [find_distance("Paris", city) for city in df["Ville"]]

Custom encoding and decoding UTF special characters

Just for fun I've been embedding text in images. The following code is a distillation and demonstration of the encoding and decoding mechanism I am using.
class encChar:
def __init__(self,char):
self.p = self.enc(char)
def enc(self,char):
d = bin(ord(char)).split('b')[1]
while len(d)<8:
d = "0"+d
rdif = int(d[0])*4 + int(d[1])*2 + int(d[2])*1
gdif = int(d[3])*2 + int(d[4])*1
bdif = int(d[5])*4 + int(d[6])*2 + int(d[7])*1
return (rdif,gdif,bdif)
def dec(self):
dmap = {0:"000",1:"001",2:"010",3:"011",4:"100",5:"101",6:"110",7:"111"}
r = dmap[self.p[0]]
g = dmap[self.p[1]][1:]
b = dmap[self.p[2]]
n = int(r+g+b,2)
return chr(int(r+g+b,2))
testStr = """Languages
Deutsch
Español
Français
한국어
Italiano
Русский
Tagalog
Tiếng Việt
中文"""
result= ""
for line in testStr.split("\n"):
result+=line+"\n"
print(line)
print("".join([encChar(k).dec() for k in line]))
result+="".join([encChar(k).dec() for k in line])+"\n"
print()
result+="\n"
with open("errorop.txt","w",encoding="utf8") as op:
op.write(result)
Which produces the following document:
Languages
Languages
Deutsch
Deutsch
Español
Español
Français
Français
한국어
Õ­Å
Italiano
Italiano
Русский
Tagalog
Tagalog
Tiếng Việt
Tiõng Viöt
中文
Ë
As you can see several runes are altered by the process and I'm wondering how I can preserve them through this process.

Python Text Widget appending tab characters ( "\t")

I have this function in my code that just opens a Dialog and open a text file, so in this file it will extract some specific information, that its just basic numbers. Im extracting them by column, since they displays like this:
1[tab]2.942 2,885 3,013 2170 745 2,91 0,00000 0,000
2[tab]3,065 3,013 3,129 2500 834 3,00 0,00000 V 0,042
3[tab]3,188 3,129 3,261 9813 2449 4,01 0,00000 V 0,084
4[tab]3,307 3,261 3,409 4990 891 5,60 0,00000 V 0,124 [...]
[...]
10[tab]4,731 4,661 4,793 6855 2037 3,37 0,00000 T 0,608
11[tab]4,842 4,793 4,941 2834 829 3,42 0,00000 TV 0,646
I just need the first numbers, the enumerator, to print it in the Text Widget. The problem is that I`m extracting by column [0:2],and the first 9 numbers are single digit only, so the widget is append the tab character after the
1,2,3,4,5,6,7,8,9.. and printing like:
1
whitespace
2
whitespace
3
whitespace
10
11
12
I want to remove this spaces.
Here`s the code:
def grab_file(self):
self.f = tkFileDialog.askopenfile(mode="r",filetypes=(("Description file","*.desc"),("All files","*.*")))
self.data_list = self.f.readlines()
self.f.close()
del self.data_list[141:]
del self.data_list[0:62]
for self.line in self.data_list:
self.lines = self.line.strip().split("\t")
self.grab_file_irk.append((self.lines[1]))
self.grab_file_irk = [w.replace(',', '.')for w in self.grab_file_irk]
self.grab_file_irk.reverse()
for i in self.grab_file_irk:
self.grab_file_floated.append(float(i))
for self.column in self.data_list:
self.entry0.insert("end",self.column[0:2] + "\n")
Thanks for your time.
You could either find the index of the first tab character directly:
self.column[:self.column.find('\t')]
or use regex:
import re
m = re.match('(\d+)', self.column)
s_num = m.group(0)

Regex with unicode and str

I have a list of regex and a replace function.
regex function
replacement_patterns = [(ur'\\u20ac', ur' euros'),(ur'\xe2\x82\xac', r' euros'),(ur'\b[eE]?[uU]?[rR]\b', r' euros'), (ur'\b([0-9]+)[eE][uU]?[rR]?[oO]?[sS]?\b',ur' \1 euros')]
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex, re.UNICODE | re.IGNORECASE), repl) for (regex, repl) in patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
If I write the string as bellow:
string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
replacer = RegexpReplacer()
texto= replacer.replace(string)
I get perfect results.
But if I call the function when iterating over a JSON file I have just loaded, it does not work (no error but no replacement)
What seems to happen is that when I call the function over the typed variable the function receives a STR, and when I call it from the JSON iteration it receives a unicode.
My question is why my regex is not working on the unicode, wouldnt it be supposed to?
Maybe you need something like this
import re
regex = re.compile("^http://.+", re.UNICODE)
And if you need more than one, you can do like this
regex = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
Get the example
>>> r = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
>>> r.match('HTTP://ыыы')
<_sre.SRE_Match object at 0x7f572455d648>
Does it correct result?
>>> class RegexpReplacer(object):
... def __init__(self, patterns=replacement_patterns):
... self.patterns = [(re.compile(regex, re.UNICODE | re.IGNORECASE), repl) for (regex, repl) in patterns]
... def replace(self, text):
... s = text
... for (pattern, repl) in self.patterns:
... (s, count) = re.subn(pattern, repl, s)
... return s
...
>>> string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
>>> replacer = RegexpReplacer()
>>> texto= replacer.replace(string)
>>> texto
u'730 euros.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 euros\r\n\nPack 850 euros, reparaci\\u00f3n. \r\n\n'
If you want Unicode replacement patterns, you need also be operating on Unicode strings. JSON should be returning Unicode as well.
Change the following by removing \\ and removing UTF-8 (won't see in a Unicode string). Also you compile with IGNORE_CASE so no need for [eE], etc.:
replacement_patterns = [(ur'\u20ac', ur' euros'),(ur'\be?u?r\b', r' euros'), (ur'\b([0-9]+)eu?r?o?s?\b',ur' \1 euros')]
Make the following a Unicode string (add u):
string = u'730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
Then it should operator on Unicode JSON as well.

UnicodeWarning: special characters in Tkinter

I have written a program in Tkinter (Python 2.7), a scrabblehelper in Norwegian which contains some special characters (æøå), which means my wordlist (ordliste) contains words with special characters.
When I run my function finnord(c*), it returns 'cd'. I am using an entry.get() to get the word to put in my function.
My problem is with the encoding of entry.get(). I have local coding UTF-8, but I get an UniCodeError when I am writing any special characters in my entrybox and matching them to my wordliste.
Here is my output.
Warning (from warnings module):
File "C:\pythonprog\scrabble\feud.py", line 46
if s not in liste and s in ordliste:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode -
interpreting them as being unequal
When i write in my shell:
> ordinn.get()
u'k\xf8**e'
> ordinn.get().encode('utf-8')
'k\xc3\xb8**e'
> print ordinn.get()
kø**e
> print ordinn.get().encode('utf-8')
kø**e
Anyone knows why I can't match ordinn.get() (entry) to my wordlist ?
I can reproduce the error this way:
% python
Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> 'k\xf8**e' in [u'k\xf8**e']
__main__:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
False
So perhaps s is a str object, and liste or ordliste contains unicode, or (as eryksun points out in the comments) vice versa. The solution is to decode the str objects (most likely with the utf-8 codec) to make them unicode.
If that does not help, please print out and post the output of
print(repr(s))
print(repr(liste))
print(repr(ordliste))
I believe the problem can be avoided by converting all strings to unicode.
When you generate ordliste from norsk.txt, use
codecs.open('norsk.txt','r','utf-8'):
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r','utf-8') as fil:
ordliste = [line.rstrip(u'\n') for line in fil]
Convert all user input to unicode as soon as possible:
def get_unicode(widget):
streng = widget.get()
try:
streng = streng.decode('utf-8')
except UnicodeEncodeError:
pass
return streng
So perhaps try this:
import Tkinter as tk
import tkMessageBox
import codecs
import itertools
import sys
alfabetet = (u"abcdefghijklmnopqrstuvwxyz"
u"\N{LATIN SMALL LETTER AE}"
u"\N{LATIN SMALL LETTER O WITH STROKE}"
u"\N{LATIN SMALL LETTER A WITH RING ABOVE}")
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r',encoding) as fil:
ordliste = set(line.rstrip(u'\n') for line in fil)
def get_unicode(widget):
streng = widget.get()
if isinstance(streng,str):
streng = streng.decode('latin-1')
return streng
def siord():
alfa=lagtabell()
try:
streng = get_unicode(ordinn)
ordene=finnord(streng,alfa)
if len(ordene) == 0:
# There are no words that match
tkMessageBox.showinfo('Dessverre..','Det er ingen ord som passer...')
else:
# Done: The words that fit the pattern
tkMessageBox.showinfo('Ferdig',
'Ordene som passer er:\n'+ordene.encode('utf-8'))
except Exception as err:
# There has been a mistake .. Check your word
print(repr(err))
tkMessageBox.showerror('ERROR','Det har skjedd en feil.. Sjekk ordet ditt.')
def finnord(streng,alfa):
liste = set()
for substitution in itertools.permutations(alfa,streng.count(u'*')):
s = streng
for ch in substitution:
s = s.replace(u'*',ch,1)
if s in ordliste:
liste.add(s)
liste = [streng]+list(liste)
return u','.join(liste)+u'.'
def lagtabell():
tinbox = get_unicode(bokstinn)
if not tinbox.isalpha():
alfa = alfabetet
else:
alfa = tinbox.lower()
return alfa
root = tk.Tk()
root.title('FeudHjelper av Martin Skow Røed')
root.geometry('400x250+450+200')
# root.iconbitmap('data/ikon.ico')
skrift1 = tk.Label(root,
text = '''\
Velkommen til FeudHjelper. Skriv inn de bokstavene du har, og erstatt ukjente med *.
F. eks: sl**ge
Det er kun lov til å bruke tre stjerner, altså tre ukjente bokstaver.''',
font = ('Verdana',8), wraplength=350)
skrift1.pack(pady = 5)
ordinn = tk.StringVar(None)
tekstboks = tk.Entry(root, textvariable = ordinn)
tekstboks.pack(pady = 5)
# What letters do you have? Eg "ahneki". Leave blank here if you want all the words.
skrift2 = tk.Label(root, text = '''Hvilke bokstaver har du? F. eks "ahneki". La det være blankt her hvis du vil ha alle ordene.''',
font = ('Verdana',8), wraplength=350)
skrift2.pack(pady = 10)
bokstinn = tk.StringVar(None)
tekstboks2 = tk.Entry(root, textvariable = bokstinn)
tekstboks2.pack()
knapp = tk.Button(text = 'Finn ord!', command = siord)
knapp.pack(pady = 10)
root.mainloop()

Categories