I just learnt from Format numbers as currency in Python that the Python module babel provides babel.numbers.format_currency to format numbers as currency. For instance,
from babel.numbers import format_currency
# Format the same amount for two currency/locale pairs; the trailing comments
# show the unicode strings reported for each call.
s = format_currency(123456.789, 'USD', locale='en_US') # u'$123,456.79'
s = format_currency(123456.789, 'EUR', locale='fr_FR') # u'123\xa0456,79\xa0\u20ac'
How about the reverse, from currency to numbers, such as $123,456,789.00 --> 123456789? babel provides babel.numbers.parse_number to parse local numbers, but I didn't find anything like parse_currency. So, what is the ideal way to parse local currency into numbers?
I went through Python: removing characters except digits from string.
# Way 1
# NOTE(review): the two-argument string.maketrans / str.translate(table, deletechars)
# form appears to target Python 2 — confirm before running on Python 3.
# NOTE(review): the name `all` shadows the built-in all().
import string
all=string.maketrans('','')
nodigs=all.translate(all, string.digits)
s = '$123,456.79'
n = s.translate(all, nodigs) # 12345679, lost `.`
# Way 2
# Delete every non-digit character; the decimal point is dropped as well.
import re
n = re.sub("\D", "", s) # 12345679
It doesn't take care of the decimal separator.
Remove all non-numeric characters, except for ., from a string (refer to here),
import re
# Way 1: delete everything that is not a digit or a decimal point.
# Inside a character class `|` is a literal, not alternation, so it must not
# be listed or pipe characters would survive the filter.
s = '$123,456.79'
n = re.sub(r"[^0-9.]", "", s) # 123456.79
# Way 2: the same filter as a pre-compiled pattern, reusable across calls.
non_decimal = re.compile(r'[^\d.]+')
s = '$123,456.79'
n = non_decimal.sub('', s) # 123456.79
It does process the decimal separator ..
But the above solutions don't work when coming to, for instance,
from babel.numbers import format_currency
# fr_FR output uses non-breaking spaces as separators and a comma as the decimal
# mark, so the ASCII-only filters above do not apply.
s = format_currency(123456.789, 'EUR', locale='fr_FR') # u'123\xa0456,79\xa0\u20ac'
new_s = s.encode('utf-8') # 123 456,79 €
As you can see, the format of currency varies. What is the ideal way to parse currency into numbers in a general way?
Below is a general currency parser that doesn't rely on the babel library.
import numpy as np
import re
def currency_parser(cur_str):
    """Parse a formatted currency string into a number rounded to 2 decimals.

    The decimal mark is guessed from position: a ``.`` or ``,`` found within
    the last three characters is treated as the decimal separator, and every
    other ``.``/``,`` is treated as a thousands separator and dropped.

    Args:
        cur_str: Text such as ``'EUR433,432.53'`` or ``'-1.876,85 SEK'``.

    Returns:
        The parsed amount as returned by ``np.round(value, 2)``.

    Raises:
        ValueError: If no parseable number remains after filtering.
    """
    # Some locale-aware formatters emit U+2212 (MINUS SIGN) instead of the
    # ASCII hyphen; normalise it first, otherwise the filter below would drop
    # it and a negative amount would silently come back positive.
    cur_str = cur_str.replace('\u2212', '-')
    # Keep only digits, the two possible separators and the sign.
    cur_str = re.sub("[^-0-9.,]", '', cur_str)
    # Anything before the last three characters can only hold thousands
    # separators, so strip them there and leave the tail untouched.
    head, tail = cur_str[:-3], cur_str[-3:]
    cur_str = re.sub("[.,]", '', head) + tail
    # A comma in the tail (with no dot) is a decimal comma.
    if '.' not in tail and ',' in tail:
        cur_str = cur_str.replace(',', '.')
    return np.round(float(cur_str), 2)
Here is a pytest script that tests the function:
import numpy as np
import pytest
import re
def currency_parser(cur_str):
    """Parse a formatted currency string into a number rounded to 2 decimals.

    The decimal mark is guessed from position: a ``.`` or ``,`` found within
    the last three characters is treated as the decimal separator, and every
    other ``.``/``,`` is treated as a thousands separator and dropped.

    Args:
        cur_str: Text such as ``'EUR433,432.53'`` or ``'-1.876,85 SEK'``.

    Returns:
        The parsed amount as returned by ``np.round(value, 2)``.

    Raises:
        ValueError: If no parseable number remains after filtering.
    """
    # Some locale-aware formatters emit U+2212 (MINUS SIGN) instead of the
    # ASCII hyphen; normalise it first, otherwise the filter below would drop
    # it and a negative amount would silently come back positive.
    cur_str = cur_str.replace('\u2212', '-')
    # Keep only digits, the two possible separators and the sign.
    cur_str = re.sub("[^-0-9.,]", '', cur_str)
    # Anything before the last three characters can only hold thousands
    # separators, so strip them there and leave the tail untouched.
    head, tail = cur_str[:-3], cur_str[-3:]
    cur_str = re.sub("[.,]", '', head) + tail
    # A comma in the tail (with no dot) is a decimal comma.
    if '.' not in tail and ',' in tail:
        cur_str = cur_str.replace(',', '.')
    return np.round(float(cur_str), 2)
# The decorator must start with `@`; written as a `#` comment it leaves the
# argument list below dangling as a syntax error.
@pytest.mark.parametrize('currency_str, expected', [
    ('.3', 0.30),
    ('1', 1.00),
    ('1.3', 1.30),
    ('43,324', 43324.00),
    ('3,424', 3424.00),
    ('-0.00', 0.00),
    ('EUR433,432.53', 433432.53),
    ('25.675,26 EUR', 25675.26),
    ('2.447,93 EUR', 2447.93),
    ('-540,89EUR', -540.89),
    ('67.6 EUR', 67.60),
    ('30.998,63 CHF', 30998.63),
    ('0,00 CHF', 0.00),
    ('159.750,00 DKK', 159750.00),
    ('£ 2.237,85', 2237.85),
    ('£ 2,237.85', 2237.85),
    ('-1.876,85 SEK', -1876.85),
    ('59294325.3', 59294325.30),
    ('8,53 NOK', 8.53),
    ('0,09 NOK', 0.09),
    ('-.9 CZK', -0.9),
    ('35.255,40 PLN', 35255.40),
    ('-PLN123.456,78', -123456.78),
    ('US$123.456,79', 123456.79),
    ('-PLN123.456,78', -123456.78),
    ('PLN123.456,79', 123456.79),
    ('IDR123.457', 123457),
    ('JP¥123.457', 123457),
    ('-JP\xc2\xa5123.457', -123457),
    ('CN\xc2\xa5123.456,79', 123456.79),
    ('-CN\xc2\xa5123.456,78', -123456.78),
])
def test_currency_parse(currency_str, expected):
    """Check currency_parser against one (input string, expected number) pair."""
    assert currency_parser(currency_str) == expected
Using babel
The babel documentation notes that number parsing is not fully implemented yet, but they have done a lot of work to get currency info into the library. You can use get_currency_name() and get_currency_symbol() to get currency details, and also all the other get_... functions to get the normal number details (decimal point, minus sign, etc.).
Using that information you can exclude from a currency string the currency details (name, sign) and groupings (e.g. , in the US). Then you change the decimal details into the ones used by the C locale (- for minus, and . for the decimal point).
This results in the following code (I added an object to keep some of the data, which may come in handy in further processing):
import re, os
from babel import numbers as n
from babel.core import default_locale
class AmountInfo(object):
    """Result of parsing a currency string.

    Attributes are plain and public:
        name: the currency's display name.
        symbol: the currency's symbol.
        value: the parsed amount, stored exactly as handed to the constructor.
    """

    def __init__(self, name, symbol, value):
        self.name = name
        self.symbol = symbol
        self.value = value

    def __repr__(self):
        # Without this, printing or logging an instance only shows an opaque
        # object address, which is unhelpful when inspecting parse results.
        return '{}(name={!r}, symbol={!r}, value={!r})'.format(
            type(self).__name__, self.name, self.symbol, self.value)
def parse_currency(value, cur, locale=None):
    """Reduce a locale-formatted currency string to a C-locale number string.

    The currency name, currency symbol, plus sign and grouping symbol are
    removed; the locale's minus sign becomes ``-`` and its decimal symbol
    becomes ``.``; finally all whitespace is dropped.

    Args:
        value: The formatted currency text.
        cur: Currency code passed to babel's currency lookups (e.g. 'USD').
        locale: Optional locale the text was formatted in. When omitted,
            babel's own default locale is used, as before.

    Returns:
        AmountInfo holding the currency name, the currency symbol and the
        normalised number as a string.
    """
    # Forward ``locale`` only when given, so that babel's default still
    # applies otherwise. Parsing with a different locale than the one used
    # for formatting would swap the meaning of ',' and '.'.
    loc = {} if locale is None else {'locale': locale}
    decp = n.get_decimal_symbol(**loc)
    plus = n.get_plus_sign_symbol(**loc)
    minus = n.get_minus_sign_symbol(**loc)
    group = n.get_group_symbol(**loc)
    name = n.get_currency_name(cur, **loc)
    symbol = n.get_currency_symbol(cur, **loc)
    # Remove the pieces of information that shall be obvious. The symbol goes
    # before the grouping symbol so a symbol containing it is removed whole.
    for token in (plus, name, symbol, group):
        value = value.replace(token, '')
    # Change the minus sign to a LOCALE=C minus...
    value = value.replace(minus, '-')
    # ...and the decimal mark to a LOCALE=C decimal point. This must happen
    # after the grouping symbol is gone, since the two may be '.' and ','.
    value = value.replace(decp, '.')
    # Just in case, remove extraneous spaces.
    value = re.sub(r'\s+', '', value)
    return AmountInfo(name, symbol, value)
#cur_loc = os.environ['LC_ALL']
# Take the locale from babel's default_locale() and report which one is in use.
cur_loc = default_locale()
print('locale:', cur_loc)
# Sample (formatted string, currency code) pairs, all formatted in cur_loc,
# covering positive and negative amounts for several currencies.
test = [ (n.format_currency(123456.789, 'USD', locale=cur_loc), 'USD')
, (n.format_currency(-123456.78, 'PLN', locale=cur_loc), 'PLN')
, (n.format_currency(123456.789, 'PLN', locale=cur_loc), 'PLN')
, (n.format_currency(123456.789, 'IDR', locale=cur_loc), 'IDR')
, (n.format_currency(123456.789, 'JPY', locale=cur_loc), 'JPY')
, (n.format_currency(-123456.78, 'JPY', locale=cur_loc), 'JPY')
, (n.format_currency(123456.789, 'CNY', locale=cur_loc), 'CNY')
, (n.format_currency(-123456.78, 'CNY', locale=cur_loc), 'CNY')
]
# Round-trip every sample: print the formatted text, then the parsed value and
# the currency name/symbol returned by parse_currency.
for v,c in test:
print('As currency :', c, ':', v.encode('utf-8'))
info = parse_currency(v, c)
print('As value :', c, ':', info.value)
print('Extra info :', info.name.encode('utf-8')
, info.symbol.encode('utf-8'))
The output looks promising (in US locale):
$ export LC_ALL=en_US
$ ./cur.py
locale: en_US
As currency : USD : b'$123,456.79'
As value : USD : 123456.79
Extra info : b'US Dollar' b'$'
As currency : PLN : b'-z\xc5\x82123,456.78'
As value : PLN : -123456.78
Extra info : b'Polish Zloty' b'z\xc5\x82'
As currency : PLN : b'z\xc5\x82123,456.79'
As value : PLN : 123456.79
Extra info : b'Polish Zloty' b'z\xc5\x82'
As currency : IDR : b'Rp123,457'
As value : IDR : 123457
Extra info : b'Indonesian Rupiah' b'Rp'
As currency : JPY : b'\xc2\xa5123,457'
As value : JPY : 123457
Extra info : b'Japanese Yen' b'\xc2\xa5'
As currency : JPY : b'-\xc2\xa5123,457'
As value : JPY : -123457
Extra info : b'Japanese Yen' b'\xc2\xa5'
As currency : CNY : b'CN\xc2\xa5123,456.79'
As value : CNY : 123456.79
Extra info : b'Chinese Yuan' b'CN\xc2\xa5'
As currency : CNY : b'-CN\xc2\xa5123,456.78'
As value : CNY : -123456.78
Extra info : b'Chinese Yuan' b'CN\xc2\xa5'
And it still works in different locales (Brazil is notable for using the comma as a decimal mark):
$ export LC_ALL=pt_BR
$ ./cur.py
locale: pt_BR
As currency : USD : b'US$123.456,79'
As value : USD : 123456.79
Extra info : b'D\xc3\xb3lar americano' b'US$'
As currency : PLN : b'-PLN123.456,78'
As value : PLN : -123456.78
Extra info : b'Zloti polon\xc3\xaas' b'PLN'
As currency : PLN : b'PLN123.456,79'
As value : PLN : 123456.79
Extra info : b'Zloti polon\xc3\xaas' b'PLN'
As currency : IDR : b'IDR123.457'
As value : IDR : 123457
Extra info : b'Rupia indon\xc3\xa9sia' b'IDR'
As currency : JPY : b'JP\xc2\xa5123.457'
As value : JPY : 123457
Extra info : b'Iene japon\xc3\xaas' b'JP\xc2\xa5'
As currency : JPY : b'-JP\xc2\xa5123.457'
As value : JPY : -123457
Extra info : b'Iene japon\xc3\xaas' b'JP\xc2\xa5'
As currency : CNY : b'CN\xc2\xa5123.456,79'
As value : CNY : 123456.79
Extra info : b'Yuan chin\xc3\xaas' b'CN\xc2\xa5'
As currency : CNY : b'-CN\xc2\xa5123.456,78'
As value : CNY : -123456.78
Extra info : b'Yuan chin\xc3\xaas' b'CN\xc2\xa5'
It is worth pointing out that babel has some encoding problems. That is because the locale files (in locale-data) use different encodings themselves. If you're working with currencies you're familiar with, that should not be a problem. But if you try unfamiliar currencies you might run into problems (I just learned that Poland uses iso-8859-2, not iso-8859-1).
Related
I have the following column which consists of email subject headers:
Subject
EXT || Transport enquiry
EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry
EXT || FW: Model - Jan
SV: [EXTERNAL] Calculations
What I want to achieve is:
Subject
Transport enquiry
0001 || Copy of enquiry
Model - Jan
Calculations
and for this I am using the below code which only takes into account the first regular expression that I am passing and ignoring the rest
def clean_subject_prelim(text):
# Apply four prefix-removal substitutions in sequence and return the result.
# NOTE(review): `^...$` anchors both ends, so this pattern only matches when the
# whole subject is exactly "EXT || " with nothing after it.
text = re.sub(r'^EXT \|\| $' , '' , text)
text = re.sub(r'EXT \|\| RE: EXTERNAL: RE:', '' , text)
text = re.sub(r'EXT \|\| FW:', '' , text)
# NOTE(review): same issue here — the trailing `$` requires the subject to end
# right after "[EXTERNAL]".
text = re.sub(r'^SV: \[EXTERNAL]$' , '' , text)
return text
df['subject_clean'] = df['Subject'].apply(lambda x: clean_subject_prelim(x))
Why is this not working? What am I missing here?
You can use
pattern = r"""(?mx)                                  # MULTILINE and VERBOSE modes on
^                                                    # start of string (or of a line)
(?:                                                  # non-capturing group start
    EXT\s*\|\|\s*(?:RE:\s*EXTERNAL:\s*RE:|FW:)?      # EXT || or EXT || RE: EXTERNAL: RE: or EXT || FW:
  |                                                  # or
    SV:\s*\[EXTERNAL]                                # SV: [EXTERNAL]
)                                                    # non-capturing group end
\s*                                                  # zero or more whitespaces
"""
# Strip the matched prefix (and the whitespace after it) from every subject.
df['subject_clean'] = df['Subject'].str.replace(pattern, '', regex=True)
See the regex demo.
Since the re.X ((?x)) is used, you should escape literal spaces and # chars, or just use \s* or \s+ to match zero/one or more whitespaces.
Get rid of the $ anchors and reorder the substitutions so that the longer, more specific prefixes are removed before the bare EXT || prefix. Like this:
import pandas as pd
import re
def clean_subject_prelim(text):
    """Remove the known "EXT || ..." / "SV: [EXTERNAL]" prefixes from a subject.

    Args:
        text: The raw e-mail subject line.

    Returns:
        The subject without the prefix and without surrounding whitespace.
    """
    # Longer, more specific prefixes go first so the bare "EXT ||" rule does
    # not consume part of them.
    prefixes = (
        r'EXT \|\| RE: EXTERNAL: RE:',
        r'EXT \|\| FW:',
        r'^EXT \|\|',
        r'^SV: \[EXTERNAL]',
    )
    for prefix in prefixes:
        text = re.sub(prefix, '', text)
    # The patterns stop right after the prefix, so the space that separated
    # it from the real subject is still there; drop it.
    return text.strip()
# Sample subjects: one for each prefix variant that the cleaning function handles.
data = {"Subject": [
"EXT || Transport enquiry",
"EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry",
"EXT || FW: Model - Jan",
"SV: [EXTERNAL] Calculations"]}
df = pd.DataFrame(data)
# Store the cleaned subject of every row in a new column.
df['subject_clean'] = df['Subject'].apply(lambda x: clean_subject_prelim(x))
I have a string that is actually a price. The price can come with any currency symbol (see currency_list), and I am trying to remove these currency symbols and return only the price.
So far I am able to do this for prefix and suffix currency symbols using the code below, and everything works up to here.
I just want to add one validation: if the symbol is neither a prefix nor a suffix but sits in between, like "200$434", then it should report that the format is not valid. I don't understand how this should be implemented.
currency_list = ['USD', 'UNITED STATES DOLLAR', '$', 'EUR', 'EURO', '€', 'GBP','BRITISH POUND', '£']
Normally input string can be
"$1212212"
"1212212EURO"
"1212212"
"1212212 BRITISH POUND"
need help to validate values like "1212$343" or "1212212EURO323.23"
Code:
# Remove each known currency symbol that occurs anywhere in the amount.
# NOTE(review): `data` is rebuilt from the original `amount` on every match, so
# only the removal done for the last matching symbol is kept.
for symb in currency_list:
if symb in amount:
data = amount.replace(symb, '')
After going through multiple blog post, I found this answer which gets the job done.
def validateCurrency(amount):
    """Strip a leading or trailing currency symbol from ``amount``.

    Only symbols listed in the module-level ``currency_list`` are recognised,
    and only at the very start or very end of the string.

    Args:
        amount: Price text such as ``'$1212212'`` or ``'1212212EURO'``.

    Returns:
        A message containing the price without its symbol, or an error
        message when no symbol is found at either end.
    """
    new_amount = None
    matched_len = 0
    for cur in currency_list:
        # Keep the longest matching symbol so that e.g. 'EURO' wins over
        # 'EUR' no matter how currency_list is ordered.
        if len(cur) <= matched_len:
            continue
        # Slice the affix off instead of str.replace(), which would remove
        # the first occurrence anywhere in the string.
        if amount.startswith(cur):
            new_amount = amount[len(cur):]
            matched_len = len(cur)
        elif amount.endswith(cur):
            new_amount = amount[:-len(cur)]
            matched_len = len(cur)
    if new_amount is None:
        return "Currency is not valid a string."
    return f"Price after removeing symbol is {new_amount}"
# print(validateCurrency('$1212212'))
You can use regex to achieve your purpose.
import re
currency_list = ['USD', 'UNITED STATES DOLLAR', '$', 'EUR', 'EURO', '€', 'GBP', 'BRITISH POUND', '£']
# Three groups: leading non-digits, the number, and whatever follows it.
# The fractional part is optional as a whole, so single-digit amounts match.
p = re.compile(r'(\D*)(\d+(?:\.\d+)?)(.*)')


def verify_or_get_amount(amount):
    """Validate a price string and extract its numeric part.

    The text before and after the number must each be empty or, once
    stripped of whitespace, a symbol from ``currency_list``.

    Args:
        amount: Price text such as ``'EURO 12.12'`` or ``'1212212 BRITISH POUND'``.

    Returns:
        The numeric part as a string when the input is valid, otherwise
        ``None``. The verdict is also printed.
    """
    match = p.search(amount)
    if match is None:
        # No digits at all: previously this raised AttributeError on None.
        print('invalid:', amount)
        return None
    first, mid, last = (part.strip() for part in match.groups())
    if (first and first not in currency_list) or (last and last not in currency_list):
        print('invalid:', amount)
        return None
    print('amount:', mid)
    return mid
# Exercise the validator with prefix, suffix, bare and malformed inputs.
for i in ['EURO123', 'EURO 123', 'EURO 123.', 'EURO .12', 'EURO 12.12', '$1212212', '1212212EURO', '1212212', '1212212 BRITISH POUND', '1212$343']:
verify_or_get_amount(i)
using regex:
import re
# '$' is a regex metacharacter, so it is stored pre-escaped. The raw string
# keeps the backslash without relying on Python passing through the invalid
# escape sequence '\$'.
currency_list = ['USD', 'UNITED STATES DOLLAR', r'\$', 'EUR', 'EURO', '€', 'GBP', 'BRITISH POUND', '£']
# One alternation reused for both the optional prefix and the optional suffix.
currencies = '|'.join(currency_list)
# Group 2 is the amount; the anchors reject symbols in the middle of the number.
c = re.compile(rf'^({currencies})? *(\d+(\.\d+)?) *({currencies})?$')
for i in ['$1212212', '1212212EURO', '1212212', '1212212 BRITISH POUND', '1212$343']:
    match_obj = c.match(i)
    if match_obj:
        print(match_obj.group(2))
    else:
        print('not found')
output :
1212212
1212212
1212212
1212212
not found
Explanation :
to see actual pattern : print(c.pattern) which gives :
^(USD|UNITED STATES DOLLAR|\$|EUR|EURO|€|GBP|BRITISH POUND|£)? *(\d+(\.\d+)?) *(USD|UNITED STATES DOLLAR|\$|EUR|EURO|€|GBP|BRITISH POUND|£)?$
I've escaped $ in the currency_list.
currencies = '|'.join(currency_list) for building possible prefixes or suffixes.
(\d+(\.\d+)?) is for matching price which accept float as well. (you can omit the (\.\d+) part)
The " *" that you see in the regex allows optional spaces between the number and the currency, for example for BRITISH POUND, which has a space after the number.
I am assuming you want a currency validation function
def validateCurrency(input):
    """Tell whether ``input`` is digits with one known currency symbol attached.

    Accepted shapes are ``<symbol><digits>`` and ``<digits><symbol>``, where
    the symbol is an entry of the module-level ``currency_list``. Whitespace
    around the symbol is ignored. A bare number is rejected.

    Args:
        input: The text to validate. (The name shadows the built-in but is
            kept because callers may pass it by keyword.)

    Returns:
        True when the text has one of the accepted shapes, otherwise False.
    """
    text = input.strip()
    if text.isdigit():
        return False
    # fullmatch makes the symbol and the digits account for the whole text,
    # replacing the manual length bookkeeping, and simply yields None (rather
    # than an IndexError) when nothing matches.
    match = re.fullmatch(r'(\D+?)(\d+)|(\d+)(\D+)', text)
    if match is None:
        return False
    prefix, _, _, suffix = match.groups()
    symbol = (prefix or suffix).strip()
    return symbol in currency_list
I'm trying to remove the trademark symbol (™), but only when it is not followed by another symbol. For instance, I might have ’, which is a badly encoded quotation mark ('), so in that case I don't want to remove the trademark symbol (™) and thereby break the pattern that I'm using to replace xx™ with a quotation mark.
# Per-sequence counters, updated by stats_change() for every replacement made.
# NOTE(review): the name shadows the built-in `dict` type.
dict = {};
# Byte sequence -> replacement text used by replace_chars().
chars = {
'\xe2\x84\xa2': '', # ™
'\xe2\x80\x99': "'", # ’
}
def stats_change(char, number):
    """Add ``number`` to the counter kept for ``char`` in the module-level ``dict``.

    A counter that does not exist yet starts from zero.
    """
    # dict.has_key() no longer exists in Python 3; .get() with a default
    # covers both the "new entry" and the "existing entry" case.
    dict[char] = dict.get(char, 0) + number
def replace_chars(match):
    """Substitution callback: count the matched sequence and return its replacement."""
    matched = match.group(0)
    # Record one more occurrence before handing back the mapped text.
    stats_change(matched, 1)
    replacement = chars[matched]
    return replacement
# Build one alternation from all keys of `chars` (each alternative is prefixed
# with a backslash), replace every match in `i` through replace_chars, and add
# the number of substitutions to the running total.
# NOTE(review): `i` and `count_matches` are defined outside this excerpt.
i, nmatches = re.subn("(\\" + '|\\'.join(chars.keys()) + ")", replace_chars, i)
count_matches += nmatches
Input: foo™ oof
Output: foo oof
Input: o’f oof
Output: o'f oof
Any suggestions ?
Consider the below mcve:
import re
import textwrap
import traceback
import unittest
def replace_words(content, replacements):
    """Replace whole identifier-like words in ``content``, case-sensitively.

    Args:
        content: The text to process.
        replacements: Mapping from a word to its replacement. Words that are
            not keys of the mapping are left unchanged.

    Returns:
        ``content`` with every mapped word replaced.
    """
    rc = re.compile(r"[A-Za-z_]\w*")

    def translate(match):
        word = match.group(0)
        return replacements.get(word, word)

    # The third positional argument of a compiled pattern's sub() is `count`,
    # not `flags`. Passing re.IGNORECASE | re.MULTILINE there capped the
    # number of replacements at 10. No flags are needed: the pattern has no
    # anchors and the lookup is meant to be case-sensitive.
    return rc.sub(translate, content)
# Test case that feeds two sample texts through a replacement function and
# compares the output with hand-written expected texts.
class class_name(unittest.TestCase):
# Build three parallel lists (replacement maps, input texts, expected outputs);
# index 0 and index 1 each form one scenario.
def setUp(self):
self.replacements = [
{
'PLUS': '"+"',
'DASH': '"-"',
'BANG': '"!"',
'TILDE': '"~"',
'STAR': '"*"',
'SLASH': '"/"',
'PERCENT': '"%"',
'LEFT_PAREN': '"("',
'RIGHT_PAREN': '")"'
}, {
"IF": "fi",
"FOO": "oof",
"BAR": "rab",
"OP_FOO": "oof_op"
}
]
self.texts = [
textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
LEFT_PAREN expression RIGHT_PAREN
unary_operator :
PLUS
DASH
BANG
TILDE
multiplicative_expression :
unary_expression
multiplicative_expression STAR unary_expression
multiplicative_expression SLASH unary_expression
multiplicative_expression PERCENT unary_expression\
"""),
textwrap.dedent("""\
IF identifier IDENTIFIER FOO BAR BARycentric
OP_FOO
""")
]
self.expected_results = [
textwrap.dedent("""\
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression\
"""),
textwrap.dedent("""\
fi identifier IDENTIFIER oof rab BARycentric
oof_op
""")
]
# Run `f(text, replacements)` for both scenarios and compare with the expected text.
def _tester(self, f):
replacements = self.replacements
expected_results = self.expected_results
texts = self.texts
self.assertEqual(f(texts[0], replacements[0]), expected_results[0])
self.assertEqual(f(texts[1], replacements[1]), expected_results[1])
# Check replace_words against both scenarios.
def test_replace_words(self):
self._tester(replace_words)
if __name__ == "__main__":
unittest.main()
The replace_words function attempts to search for and replace case-sensitive whole words in a given text using a dictionary of replacements. However, the code above fails at the line self.assertEqual(f(texts[0], replacements[0]), expected_results[0]), and I don't know why.
So the question would be, how do you find and replace case sensitive whole words using a replacements dictionary in python?
You can use re.sub and re.findall:
import re
def regex_string(d, to_lower = False):
    """Build an alternation that matches the keys of ``d`` as whole words.

    Args:
        d: Mapping whose keys are the words to match.
        to_lower: When true, each key contributes two alternatives: its
            lower-cased form followed by the key as written.

    Returns:
        A regex source string of ``\\b...\\b`` alternatives joined by ``|``.
    """
    def whole_word(key):
        # Escape the key so regex metacharacters in it are matched literally.
        return r'\b{}\b'.format(re.escape(key))

    if not to_lower:
        return '|'.join(whole_word(key) for key in d)
    return '|'.join(
        alternative
        for key in d
        for alternative in (whole_word(key.lower()), whole_word(key))
    )
replacements = {
'PLUS': '"+"',
'DASH': '"-"',
'BANG': '"!"',
'TILDE': '"~"',
'STAR': '"*"',
'SLASH': '"/"',
'PERCENT': '"%"',
'LEFT_PAREN': '"("',
'RIGHT_PAREN': '")"'
}
# Step 1: swap every matched word for a '{}' placeholder.
replaced = re.sub(regex_string(replacements, True), '{}', content)
# Step 2: fill the placeholders, in order, with the replacement for each match.
# NOTE(review): str.format gives braces a special meaning, so this looks unsafe
# if `content` itself can contain '{' or '}' — confirm against real input.
final_result = replaced.format(*[replacements.get(i, i) for i in re.findall(regex_string(replacements, True), content)])
Output (case 1):
variable_identifier :
IDENTIFIER
primary_expression :
foo1
foo2
foo3
"(" expression ")"
unary_operator :
"+"
"-"
"!"
"~"
multiplicative_expression :
unary_expression
multiplicative_expression "*" unary_expression
multiplicative_expression "/" unary_expression
multiplicative_expression "%" unary_expression
Output (case 2):
fi identifier IDENTIFIER oof rab BARycentric
oof_op
Or, even shorter:
replaced = re.sub(regex_string(replacements, True), lambda x:replacements.get(x.group(), x.group()), content)
I need to replace some special characters from user input for different platform (i.e. Linux and Windows) using Python. Here is my code:
# On POST requests, read the submitted 'react' field into rname1.
# NOTE(review): `request` is defined outside this excerpt — presumably a web
# framework request object; confirm.
if request.method == 'POST':
rname1 = request.POST.get('react')
Here I am getting the user input via the POST method. I need to remove the following characters from the user input (if there are any).
1- Escape or filter special characters for windows, ( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `
2- Escape or filter special characters for Linux, { } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `
The special characters are given above. Here I need to remove for both Linux and Windows.
Python strings have a built in method translate for substitution/deletion of characters. You need to build a translation table and then call the function.
import sys
# Test the prefix, not a substring: "darwin" (macOS) also contains "win" and
# would otherwise be given the Windows character set.
if sys.platform.startswith("win"):
    # Raw strings keep the lone backslash without an invalid "\ " escape.
    special = r"""( ) < > * ‘ = ? ; [ ] ^ ~ ! . ” % # / \ : + , `""".split()
else:
    special = r"""{ } ( ) < > * ‘ = ? ; [ ] $ – # ~ ! . ” % / \ : + , `""".split()
# Mapping a character to None makes str.translate delete it.
trans_dict = {character: None for character in special}
trans_table = str.maketrans(trans_dict)
print("Lo+=r?e~~m ipsum dol;or sit!! amet, consectet..ur ad%".translate(trans_table))
Will print Lorem ipsum dolor sit amet consectetur ad.
If you want to use a replacement character instead of deleting, then replace None above with that character. You can also build a translation table with specific substitutions, e.g. `{"a": "m", "b": "n", ...}`.
Edit: The above snippet is indeed in Python3. In Python2 (TiO) it's easier to delete characters:
>>> import sys
>>> import string
>>> if "win" in sys.platform:
... special = """()<>*'=?;[]^~!%#/\:=,`"""
... else:
... special = """{}()<>*'=?;[]$-#~!."%/\:+"""
...
>>> s = "Lo+r?e~~/\#<>m ips()u;m"
>>> string.translate(s, None, special)
'Lorem ipsum'
Note that I've substituted ‘ with ' and similarly replaced ” with " because I think you're only dealing with ascii strings.