Regex with unicode and str - python

I have a list of regex and a replace function.
regex function
replacement_patterns = [(ur'\\u20ac', ur' euros'),(ur'\xe2\x82\xac', r' euros'),(ur'\b[eE]?[uU]?[rR]\b', r' euros'), (ur'\b([0-9]+)[eE][uU]?[rR]?[oO]?[sS]?\b',ur' \1 euros')]
class RegexpReplacer(object):
    """Apply an ordered list of regex substitutions to a piece of text.

    Each entry of ``patterns`` is a ``(regex, replacement)`` pair. The
    regexes are compiled once, case-insensitively and with ``re.UNICODE``.
    """

    def __init__(self, patterns=replacement_patterns):
        # Compile up front so replace() does not recompile on every call.
        self.patterns = [
            (re.compile(regex, re.UNICODE | re.IGNORECASE), repl)
            for regex, repl in patterns
        ]

    def replace(self, text):
        """Return ``text`` after applying every substitution in order.

        Later patterns see the output of earlier ones, so the order of
        ``patterns`` matters.
        """
        for pattern, repl in self.patterns:
            # The substitution count is not needed, so sub() replaces subn().
            text = pattern.sub(repl, text)
        return text
If I write the string as below:
string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
replacer = RegexpReplacer()
texto= replacer.replace(string)
I get perfect results.
But if I call the function when iterating over a JSON file I have just loaded, it does not work (no error but no replacement)
What seems to happen is that when I call the function over the typed variable the function receives a STR, and when I call it from the JSON iteration it receives a unicode.
My question is: why is my regex not working on the unicode input? Isn't it supposed to?

Maybe you need something like this
import re
regex = re.compile("^http://.+", re.UNICODE)
And if you need more than one, you can do like this
regex = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
Get the example
>>> r = re.compile("^http://.+", re.UNICODE | re.IGNORECASE)
>>> r.match('HTTP://ыыы')
<_sre.SRE_Match object at 0x7f572455d648>
Does this give the correct result?
>>> class RegexpReplacer(object):
... def __init__(self, patterns=replacement_patterns):
... self.patterns = [(re.compile(regex, re.UNICODE | re.IGNORECASE), repl) for (regex, repl) in patterns]
... def replace(self, text):
... s = text
... for (pattern, repl) in self.patterns:
... (s, count) = re.subn(pattern, repl, s)
... return s
...
>>> string='730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
>>> replacer = RegexpReplacer()
>>> texto= replacer.replace(string)
>>> texto
u'730 euros.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 euros\r\n\nPack 850 euros, reparaci\\u00f3n. \r\n\n'

If you want Unicode replacement patterns, you also need to be operating on Unicode strings. JSON should be returning Unicode as well.
Change the following by removing the doubled backslash (\\) and dropping the UTF-8 byte-sequence pattern (those bytes never appear in a Unicode string). Also, you compile with re.IGNORECASE, so there is no need for [eE], etc.:
replacement_patterns = [(ur'\u20ac', ur' euros'),(ur'\be?u?r\b', r' euros'), (ur'\b([0-9]+)eu?r?o?s?\b',ur' \1 euros')]
Make the following a Unicode string (add u):
string = u'730\u20ac.\r\n\n ropa surf ... 5,10 muy buen estado..... 170 \u20ac\r\n\nPack 850\u20ac, reparaci\u00f3n. \r\n\n'
Then it should operate on Unicode JSON as well.

Related

regular expressions: remove Greek words with capital letters

I want to delete greek words with capital letters, such as:
text = 'Ο Κώστας θέλει να ΠΑΙΞΕΙ ΑΎΡΙΟ ποδόσφαιρο στο σχολείο'
the output should be
text = 'Ο Κώστας θέλει να ποδόσφαιρο στο σχολείο'
I checked this one Regular expression : Remove words with Capital letters, but I don't know how to adapt the code to the Greek alphabet.
We can (by consulting a Unicode chart) see that the Greek letters are approximately in the range U+0370..U+1FFF, and then filter using the unicodedata module:
>>> import unicodedata
>>> greek_capital_chars = set(chr(cp) for cp in range(0x0370, 0x1FFF) if "GREEK CAPITAL" in unicodedata.name(chr(cp), ""))
{'Β', 'Χ', 'ᾛ', 'Ἁ', 'Ὼ', 'ᾜ', 'ᾫ', 'Ἂ', 'Ὰ', 'Ἑ', 'Ω', 'Ἤ', 'Ε', 'Ρ', 'Η', 'ᾏ', 'Ϳ', 'Ή', 'Ἣ', 'Ἵ', 'ᾋ', 'Ύ', 'ᾚ', 'Ή', 'Ϲ', 'Ί', 'Ὥ', 'Ύ', 'Ξ', 'Ὄ', 'Ο', 'Θ', 'Ϗ', 'Ϋ', 'Ͻ', 'ᾘ', 'Ὑ', 'Ώ', 'Ᾰ', 'ᾝ', 'Ἐ', 'Ὦ', 'Ά', 'Σ', 'Ὂ', 'Ἱ', 'Ὤ', 'Ͷ', 'Ὴ', 'Ό', 'Ψ', 'ῼ', 'Φ', 'Ἒ', 'Ὕ', 'ᾪ', 'Ἅ', 'Ῑ', 'Ἧ', 'Λ', 'Ἢ', 'Ϸ', 'Ἔ', 'Ί', 'Ἇ', 'Ἲ', 'Ὓ', 'Ζ', 'Τ', 'Ὗ', 'Ϊ', 'Ͽ', 'Μ', 'Ὀ', 'Ἄ', 'ᾊ', 'Κ', 'Γ', 'Ὶ', 'Ϻ', 'Ᾱ', 'ᾬ', 'Ώ', 'Ἳ', 'Ἥ', 'Ἦ', 'Ι', 'Ἃ', 'ᾌ', 'Ὁ', 'Έ', 'Δ', 'Ὡ', 'Ἆ', 'Ἰ', 'ϴ', 'Ͼ', 'Ῠ', 'ῌ', 'Ἓ', 'Ἕ', 'Έ', 'Ὃ', 'Ὠ', 'ᾈ', 'Ͱ', 'ᾼ', 'Ὢ', 'ᾙ', 'ᾞ', 'ᾎ', 'Ὸ', 'Ῥ', 'Ἀ', 'Ὣ', 'Ͳ', 'Ἶ', 'Ῐ', 'ᾮ', 'ᾍ', 'Ἡ', 'Ῡ', 'Ὧ', 'ᾉ', 'ᾩ', 'ᾯ', 'ᾭ', 'ᾟ', 'Ό', 'Α', 'Ὲ', 'Υ', 'Π', 'Ἴ', 'Ά', 'Ἷ', 'ᾨ', 'Ὅ', 'Ὺ', 'Ν', 'Ἠ'}
Then, you can form a regexp that matches words (continuous runs) of such characters. We'll also include Latin capital characters.
>>> import re
>>> import string
>>> chars_class = re.escape("".join(greek_capital_chars.union(string.ascii_uppercase)))
>>> r = re.compile(f"[{chars_class}]+")
>>> text = 'Ο Κώστας θέλει να ΠΑΙΞΕΙ ΑΎΡΙΟ ποδόσφαιρο στο σχολείο'
>>> r.sub("", text)
' ώστας θέλει να ποδόσφαιρο στο σχολείο'
As it is, the regex will of course also remove any capital letter; you may wish to do
>>> r = re.compile(f"[{chars_class}]{{2,}}")
>>> r.sub("", text)
'Ο Κώστας θέλει να ποδόσφαιρο στο σχολείο'
or similar instead, depending on your use case.

how to indicate raw string with regex() if my pattern come from another string?

I have a csv table from which I get my regex pattern, e.g. \bconden
Problem: I can't find a way to tell Python that this is a raw string.
How do I put r before a pattern when it comes from a string?
import re
a = 'de la matière condensée'
fromcsv = '\bconden'
print(re.search('r' + fromcsv, a))
result is None
You can use the str_to_raw function below to make a raw string out of an already declared plain string variable:
import re
a = 'de la matière condensée'
pattern = '\bconden'
# Maps each single character that a non-raw string literal produces from a
# backslash escape back to the two-character text of that escape.
# Every key must be exactly one character, because str_to_raw() looks
# characters up one at a time. Entries such as '\c', '\8' and '\9' are not
# valid escapes: they yield two-character keys that can never match and
# trigger invalid-escape warnings, so they are intentionally absent.
escape_dict = {
    '\a': r'\a',
    '\b': r'\b',
    '\f': r'\f',
    '\n': r'\n',
    '\r': r'\r',
    '\t': r'\t',
    '\v': r'\v',
    '\'': r'\'',
    '\"': r'\"',
    '\0': r'\0',
    '\1': r'\1',
    '\2': r'\2',
    '\3': r'\3',
    '\4': r'\4',
    '\5': r'\5',
    '\6': r'\6',
    '\7': r'\7',
}


def str_to_raw(s):
    """Return ``s`` with interpreted escape characters spelled out again.

    For example the backspace character produced by ``'\\b'`` in a normal
    string literal becomes the two characters backslash + ``b``, which the
    ``re`` module reads as a word boundary. Characters that are not keys of
    ``escape_dict`` are passed through unchanged.
    """
    return ''.join(escape_dict.get(c, c) for c in s)
print(re.search(r'\bconden', a))
print(re.search(str_to_raw(pattern), a))
Output:
<re.Match object; span=(14, 20), match='conden'>
<re.Match object; span=(14, 20), match='conden'>
note: I got escape_dict from this page.

accented characters in a regex with Python

This is my code
# -*- coding: utf-8 -*-
import json
import re
with open("/Users/paul/Desktop/file.json") as json_file:
file = json.load(json_file)
print file["desc"]
key="capacità"
result = re.findall("((?:[\S,]+\s+){0,3})"+key+"\s+((?:[\S,]+\s*){0,3})", file["desc"], re.IGNORECASE)
print result
This is the content of the file
{
"desc": "Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
}
My result is []
but what I want is result = "capacità"
You need to treat your string as a Unicode string, like this:
str = u"Frigocongelatore, capacit\u00e0 di 215 litri, h 122 cm, classe A+"
And as you can see if you print str.encode('utf-8') you'll get:
Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+
In the same way, you can make your regex string a Unicode or raw string with the u or r prefix, respectively.
You can use this function to display different encodings.
The default encoding on your editor should be UTF-8. Check your settings with sys.getdefaultencoding().
def find_context(word_, n_before, n_after, string_):
    """Find ``word_`` together with the words immediately around it.

    Args:
        word_: Text to locate. It is inserted into the regex as-is (not
            escaped), so regex metacharacters in it keep their meaning.
        n_before: Number of words that must directly precede ``word_``.
        n_after: Number of words that must directly follow ``word_``.
        string_: The text to search.

    Returns:
        The first matching span (context + word + context) as a string,
        or ``None`` when ``string_`` contains no such span.
    """
    import re

    # Raw strings keep \w and \W as regex classes instead of relying on
    # Python passing unknown string escapes through.
    before = r'\w+\W+' * n_before
    after = r'\W+\w+' * n_after
    match = re.search('(' + before + word_ + after + ')', string_)
    # A failed search yields None; return that instead of raising
    # AttributeError on the missing match object.
    return match.group(1) if match else None
s = "Frigocongelatore, capacità di 215 litri, h 122 cm, classe A+"
# find 0 words before and 3 after the word capacità
print(find_context('capacità',0,3,s) )
capacità di 215 litri
print(find_context(' capacit\u00e0',0,3,s) )
capacità di 215 litri

What is the regular expression for matching *text*?

What is the regex to match this is *some text*. but not this is \*another \*text. The regex is supposed to match the texts between the asterisks.
pattern = "\*(\w+(?:\s+\w+)*)\*"
re.findall(pattern, "this is *some text*.") // return 'some text'
re.findall(pattern, "this is \*another \*text") // return nothing
For replacing '*' with '$':
subpattern = "(\*(\w+(?:\s+\w+)*)\*)"
re.sub(subpattern, r"$\2$", "this is *some text*.") // return 'this is $some text$.'

Regex match words and end of string

2 Regex question
How can I match a word or 2 words in a subpattern ()?
How can i match a word or 2 words that's either followed by a specific word like "with" OR the end of the string $
I tried
(\w+\W*\w*\b)(\W*\bwith\b|$)
but it's definitely not working
edit:
I'm thinking of matching both "go to mall" and "go to", in a way that i can group "go to" in python.
Perhaps something like this?
>>> import re
>>> r = re.compile(r'(\w+(\W+\w+)?)(\W+with\b|\Z)')
>>> r.search('bar baz baf bag').group(1)
'baf bag'
>>> r.search('bar baz baf with bag').group(1)
'baz baf'
>>> r.search('bar baz baf without bag').group(1)
'without bag'
>>> r.search('bar with bag').group(1)
'bar'
>>> r.search('bar with baz baf with bag').group(1)
'bar'
Here's what I came up with:
import re
class Bunch(object):
    """Minimal attribute bag: ``Bunch(a=1).a == 1``.

    Used to turn a dict (for example ``match.groupdict()``) into an object
    whose keys are readable as attributes.
    """

    def __init__(self, **kwargs):
        # Expose every keyword argument as an instance attribute.
        self.__dict__.update(kwargs)

    def __repr__(self):
        # Sorted so the representation is stable regardless of insert order.
        fields = ', '.join(
            '%s=%r' % item for item in sorted(self.__dict__.items())
        )
        return '%s(%s)' % (type(self).__name__, fields)
match = re.compile(
flags = re.VERBOSE,
pattern = r"""
( (?!with) (?P<first> [a-zA-Z_]+ ) )
( \s+ (?!with) (?P<second> [a-zA-Z_]+ ) )?
( \s+ (?P<awith> with ) )?
(?![a-zA-Z_\s]+)
| (?P<error> .* )
"""
).match
s = 'john doe with'
b = Bunch(**match(s).groupdict())
print 's:', s
if b.error:
print 'error:', b.error
else:
print 'first:', b.first
print 'second:', b.second
print 'with:', b.awith
Output:
s: john doe with
first: john
second: doe
with: with
Tried it also with:
s: john
first: john
second: None
with: None
s: john doe
first: john
second: doe
with: None
s: john with
first: john
second: None
with: with
s: john doe width
error: john doe width
s: with
error: with
BTW: re.VERBOSE and re.DEBUG are your friends.
Regards,
Mick.

Categories