I'm trying to remove the trademark symbol (™), but only when it's not followed by any other symbol. For instance, I might have ’, which is a badly encoded quotation mark ('); in that case I don't want to remove the trademark symbol (™), because doing so would break the pattern that I'm using to replace xx™ with a quotation mark.
# Tally of replacements performed, keyed by the matched sequence.
# NOTE(review): this name shadows the built-in `dict`, and `has_key` below
# exists only in Python 2.
dict = {};
# Maps each sequence to search for to the text that replaces it.
chars = {
'\xe2\x84\xa2': '', # ™
'\xe2\x80\x99': "'", # ’
}
# Add `number` to the tally kept for `char`, creating the entry on first use.
def stats_change(char, number):
if dict.has_key(char):
dict[char] = dict[char]+number
else:
dict[char] = number # Add new entry
# Callback for re.subn: count the matched sequence and return its replacement.
def replace_chars(match):
char = match.group(0)
stats_change(char,1)
return chars[char]
# Builds one alternation from the keys of `chars`, putting a backslash in front
# of each key, and replaces every match via replace_chars.
# NOTE(review): `i` and `count_matches` are not defined in this snippet;
# presumably they come from surrounding code.
i, nmatches = re.subn("(\\" + '|\\'.join(chars.keys()) + ")", replace_chars, i)
count_matches += nmatches
Input: foo™ oof
Output: foo oof
Input: o’f oof
Output: o'f oof
Any suggestions ?
Related
I have the following column which consists of email subject headers:
Subject
EXT || Transport enquiry
EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry
EXT || FW: Model - Jan
SV: [EXTERNAL] Calculations
What I want to achieve is:
Subject
Transport enquiry
0001 || Copy of enquiry
Model - Jan
Calculations
For this I am using the code below, but it seems to take only the first regular expression into account and ignore the rest:
# Remove known routing prefixes from an email subject and return the result.
def clean_subject_prelim(text):
# Anchored with both ^ and $: matches only a subject that consists of exactly
# "EXT || " (apart from an optional trailing newline).
text = re.sub(r'^EXT \|\| $' , '' , text)
# Unanchored literal prefixes; the space that follows them is left in place.
text = re.sub(r'EXT \|\| RE: EXTERNAL: RE:', '' , text)
text = re.sub(r'EXT \|\| FW:', '' , text)
# Anchored with both ^ and $: matches only a subject that consists of exactly
# "SV: [EXTERNAL]" (apart from an optional trailing newline).
text = re.sub(r'^SV: \[EXTERNAL]$' , '' , text)
return text
# Apply the cleaner to every value of the Subject column.
df['subject_clean'] = df['Subject'].apply(lambda x: clean_subject_prelim(x))
Why this is not working, what am I missing here?
You can use
# Verbose-mode pattern that matches a routing prefix at the start of a line,
# together with any whitespace that follows it.
pattern = r"""(?mx)         # MULTILINE and VERBOSE modes on
^                           # start of a line
(?:                         # non-capturing group start
    EXT\s*\|\|\s*(?:RE:\s*EXTERNAL:\s*RE:|FW:)?  # EXT || or EXT || RE: EXTERNAL: RE: or EXT || FW:
  |                         # or
    SV:\s*\[EXTERNAL]       # SV: [EXTERNAL]
)                           # non-capturing group end
\s*                         # zero or more whitespaces
"""
# Replace every match with the empty string; regex=True makes pandas treat
# `pattern` as a regular expression rather than a literal string.
df['subject_clean'] = df['Subject'].str.replace(pattern, '', regex=True)
See the regex demo.
Since the re.X ((?x)) is used, you should escape literal spaces and # chars, or just use \s* or \s+ to match zero/one or more whitespaces.
Get rid of the $ anchor in the first and last expressions, and reorder the substitutions so that the longer, more specific prefixes are removed before the bare EXT || prefix. Like this:
import pandas as pd
import re
def clean_subject_prelim(text):
    """Strip the routing prefix from an email subject line.

    The longer ``EXT || ...`` prefixes are removed before the bare ``EXT ||``
    prefix; otherwise the bare pattern would consume the start of the longer
    ones and leave ``RE: EXTERNAL: RE:`` / ``FW:`` behind.  Each pattern also
    swallows the whitespace that follows the prefix, so the cleaned subject
    does not start with a stray space.

    Args:
        text: The raw subject string.

    Returns:
        The subject with any recognised prefix (and the whitespace after it)
        removed.
    """
    text = re.sub(r'EXT \|\| RE: EXTERNAL: RE:\s*', '', text)
    text = re.sub(r'EXT \|\| FW:\s*', '', text)
    text = re.sub(r'^EXT \|\|\s*', '', text)
    text = re.sub(r'^SV: \[EXTERNAL]\s*', '', text)
    return text
# Sample subjects, one for each prefix variant that appears in the question.
data = {
    "Subject": [
        "EXT || Transport enquiry",
        "EXT || RE: EXTERNAL: RE: 0001 || Copy of enquiry",
        "EXT || FW: Model - Jan",
        "SV: [EXTERNAL] Calculations",
    ],
}
df = pd.DataFrame(data)
# The function is passed directly; each Subject value is handed to it in turn.
df['subject_clean'] = df['Subject'].apply(clean_subject_prelim)
So I have an assignment about classes in Python. My output is almost the same as the desired output, but mine prints with parentheses and commas, as a tuple (I don't even know how it ended up in a tuple). Nowhere do I specify it to be a tuple. What did I do wrong?
class buttons:
    """Plain record for a text button: its label, side padding and border character."""

    def __init__(self, word, spaces, border):
        # Store the three constructor arguments unchanged on the instance.
        self.word, self.spaces, self.border = word, spaces, border
# Build the CANCEL button, print its specification, then draw it.
word = "CANCEL"
spaces = 10
border = 'x'
b1 = buttons(word, spaces, border)
print('CANCEL Button Specification:')
print('Button Name:', b1.word)
# Width of the top/bottom rows: border + left padding + label + right padding + border.
b1_bord = 1+b1.spaces+len(b1.word)+b1.spaces+1
print('Number of the border characters for the top and the bottom:', b1_bord)
print('Number of spaces between the left side border and the first character of the button \nname:', b1.spaces)
print('Number of spaces between the right side border and the last character of the button \nname:', b1.spaces)
print('Characters representing the borders:', b1.border)
# emp is the empty string, so emp*n is '' for any n and produces no padding.
emp = ''
print(b1.border*b1_bord)
# The single replacement field holds comma-separated expressions, which form
# one tuple; the f-string therefore prints the tuple's repr (parentheses,
# quotes and commas).  The left padding here uses the module-level `spaces`
# rather than b1.spaces.
print(f'{b1.border,(emp*spaces),b1.word,(emp*b1.spaces),b1.border}')
print(b1.border*b1_bord)
print("=======================================================")
# Build the Notify button, print its specification, then draw it.
b2 = buttons("Notify",3, '!')
print('NOTIFY Button Specification:')
print('Button Name:', b2.word)
# Width of the top/bottom rows: border + left padding + label + right padding + border.
b2_bord = 1+b2.spaces+len(b2.word)+b2.spaces+1
print('Number of the border characters for the top and the bottom:', b2_bord)
print('Number of spaces between the left side border and the first character of the button \nname:', b2.spaces)
print('Number of spaces between the right side border and the last character of the button \nname:', b2.spaces)
print('Characters representing the borders:', b2.border)
print(b2.border*b2_bord)
# Comma-separated expressions inside one replacement field form a tuple, so
# this prints the tuple's repr instead of a padded row.
print(f'{b2.border,(emp*b2.spaces),b2.word,(emp*b2.spaces),b2.border}')
print(b2.border*b2_bord)
print("=======================================================")
# Build the SAVE PROGRESS button, print its specification, then draw it.
b3 = buttons('SAVE PROGRESS', 5, '$')
print('SAVE PROGRESS Button Specification:')
print('Button Name:', b3.word)
# Width of the top/bottom rows: border + left padding + label + right padding + border.
b3_bord = 1+b3.spaces+len(b3.word)+b3.spaces+1
print('Number of the border characters for the top and the bottom:', b3_bord)
print('Number of spaces between the left side border and the first character of the button \nname:', b3.spaces)
print('Number of spaces between the right side border and the last character of the button \nname:', b3.spaces)
print('Characters representing the borders:', b3.border)
print(b3.border*b3_bord)
# Comma-separated expressions inside one replacement field form a tuple, so
# this prints the tuple's repr instead of a padded row.
print(f'{b3.border,(emp*b3.spaces),b3.word,(emp*b3.spaces),b3.border}')
print(b3.border*b3_bord)
The output I get:
xxxxxxxxxxxxxxxxxxxxxxxxxxxx
('x', '', 'CANCEL', '', 'x')
xxxxxxxxxxxxxxxxxxxxxxxxxxxx
!!!!!!!!!!!!!!
('!', '', 'Notify', '', '!')
!!!!!!!!!!!!!!
$$$$$$$$$$$$$$$$$$$$$$$$$
('$', '', 'SAVE PROGRESS', '', '$')
$$$$$$$$$$$$$$$$$$$$$$$$$
The output I need:
xxxxxxxxxxxxxxxxxxxxxxxxxxxx
x CANCEL x
xxxxxxxxxxxxxxxxxxxxxxxxxxxx
!!!!!!!!!!!!!!
! Notify !
!!!!!!!!!!!!!!
$$$$$$$$$$$$$$$$$$$$$$$$$
$ SAVE PROGRESS $
$$$$$$$$$$$$$$$$$$$$$$$$$
You can do this
def make_btn(word, space, border):
    """Return a three-line text button.

    The label is centred in a field that is ``2 * space`` characters wider
    than the label, with ``border`` added at both ends.  The rows above and
    below repeat ``border`` once per character of that middle row.
    """
    inner_width = len(word) + space * 2
    label_row = ''.join((border, word.center(inner_width), border))
    rule = border * len(label_row)
    return '\n'.join((rule, label_row, rule))


# Draw the three buttons from the question.
for spec in (('CANCEL', 10, 'x'), ('Notify', 3, '!'), ('SAVE PROGRESS', 5, '$')):
    print(make_btn(*spec))
Firstly,
emp = ''
should be:
emp = ' '
And secondly,
print(f'{b1.border(emp*b1.spaces),b1.word(emp*b1.spaces),b1.border}')
should be:
# Both paddings come from the button's own `spaces` attribute, not the module-level `spaces`.
print(b1.border + (emp * b1.spaces) + b1.word + (emp * b1.spaces) + b1.border)
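You are passing a formatted string literal (f-string) to print, which is unnecessary in this case; a plain print call is enough. Also, use '+' instead of ',' to join the strings: inside the braces of an f-string, comma-separated values form a tuple, which is why the output shows parentheses, quotes and commas.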
Repeat the print statement for b2 and b3.
I have a long dictionary which looks like this:
name = 'Barack.'
name_last = 'Obama!'
street_name = "President Streeet?"
list_of_slot_names = {'name':name, 'name_last':name_last, 'street_name':street_name}
I want to remove the punctuation from every slot (name, name_last, ...).
I could do it this way:
name = name.translate(str.maketrans('', '', string.punctuation))
name_last = name_last.translate(str.maketrans('', '', string.punctuation))
street_name = street_name.translate(str.maketrans('', '', string.punctuation))
Do you know a shorter (more compact) way to write this?
Result:
>>> print(name, name_last, street_name)
>>> Barack Obama President Streeet
Use a loop / dictionary comprehension
# Build the translation table once instead of once per dictionary entry.
strip_punctuation = str.maketrans('', '', string.punctuation)
{k: v.translate(strip_punctuation) for k, v in list_of_slot_names.items()}
You can either assign this back to list_of_slot_names if you want to overwrite existing values or assign to a new variable
You can also then print via
print(*list_of_slot_names.values())
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
# Strip leading/trailing '.', '?' and '!' from every slot that is set; None slots are skipped.
cleaned = [str_.strip('.?!') for str_ in (name, name_last, empty_slot, street_name) if str_ is not None]
# Unpack the list so the values print space-separated rather than as a list repr.
print(*cleaned)
-> Barack Obama President Streeet
Unless you also want to remove them from the middle. Then do this
import re
name = 'Barack.'
name_last = 'Obama!'
empty_slot = None
street_name = "President Streeet?"
# Delete every run of '.', '?' or '!' wherever it occurs; slots that are None are skipped.
print([
    re.sub('[.?!]+', "", slot)
    for slot in filter(lambda value: value is not None, (name, name_last, empty_slot, street_name))
])
import re, string
s = 'hell:o? wor!d.'
clean = re.sub(rf"[{string.punctuation}]", "", s)
print(clean)
output
hello world
There are four keywords: title, blog, tags, state
Excess keyword occurrences are being removed from their respective matches.
Example:
blog: blog state title tags and returns state title tags and instead of
blog state title tags and
The sub function should be matching .+ after it sees blog:, so I don't know why it treats blog as an exception to .+
Regex:
re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
Code:
# Pull "keyword: value" header lines (title/blog/tags/state) out of a post,
# collect them in kwargs, wrap the remaining text as an HTML paragraph, and
# print kwargs.
def n15():
import re
a = """blog: blog: fooblog
state: private
title: this is atitle bun
and text"""
kwargs = {}
# re.sub callback: store the header's value under the matching kwargs key and
# return '' so the header line is removed from the text.
def matcher(string):
# The value is derived by deleting the text of groups 2-5 from the whole match.
# str.replace removes every occurrence, so a keyword (or ": ") that also occurs
# inside the value is deleted as well, e.g. "fooblog" loses its "blog".
v = string.group(1).replace(string.group(2), '').replace(string.group(3), '').replace(string.group(4), '').replace(string.group(5), '')
if string.group(3) == 'title':
kwargs['title'] = v
elif string.group(3) == 'blog':
kwargs['blog_url'] = v
elif string.group(3) == 'tags':
kwargs['comma_separated_tags'] = v
elif string.group(3) == 'state':
kwargs['post_state'] = v
return ''
a = re.sub(r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s).+(\n|$))', matcher, a)
# Turn what is left of the text into an HTML paragraph: newlines become <br />,
# carriage returns are dropped and double quotes are backslash-escaped.
a = a.replace('\n', '<br />')
a = a.replace('\r', '')
a = a.replace('"', r'\"')
a = '<p>' + a + '</p>'
kwargs['body'] = a
# Python 2 print statement.
print kwargs
Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'foo', 'title': 'this is a bun'}
Edit:
Desired Output:
{'body': '<p>and text</p>', 'post_state': 'private', 'blog_url': 'fooblog', 'title': 'this is atitle bun'}
replace(string.group(3), '')
is replacing all occurrences of 'blog' with '' .
Rather than try to replace all the other parts of the matched string, which will be hard to get right, I suggest capture the string you actually want in the original match.
r'((^|\n|\s|\b)(title|blog|tags|state)(\:\s)(.+)(\n|$))'
which has () around the .+ to capture that part of the string, then
v = match.group(5)
at the start of matcher.
What is the regex to match this is *some text*. but not this is \*another \*text. The regex is supposed to match the texts between the asterisks.
# "(?<!\\)" rejects an opening asterisk that is escaped with a backslash;
# the group captures the words between the two asterisks.
pattern = r"(?<!\\)\*(\w+(?:\s+\w+)*)\*"
re.findall(pattern, "this is *some text*.")  # returns ['some text']
re.findall(pattern, r"this is \*another \*text")  # returns []
For replacing '*' with '$':
# Group 1 is the whole *...* span and group 2 the text between the asterisks;
# "(?<!\\)" skips an opening asterisk that is escaped with a backslash.
subpattern = r"((?<!\\)\*(\w+(?:\s+\w+)*)\*)"
re.sub(subpattern, r"$\2$", "this is *some text*.")  # returns 'this is $some text$.'