Removing words with special characters "\" and "/" - python

While analysing tweets, I run into "words" that contain either \ or / (possibly more than once in a single "word"). I would like to remove such words completely, but I cannot quite nail it.
This is what I tried:
# Sample inputs.
# NOTE(review): these are ordinary (non-raw) string literals, so the \r in
# `sen` is a carriage-return character and \\ is a single backslash.
sen = 'this is \re\store and b\\fre'
sen1 = 'this i\s /re/store and b//fre/'
# One or more word characters, exactly one backslash, one or more word
# characters: a word character is required on BOTH sides of the backslash.
slash_back = r'(?:[\w_]+\\[\w_]+)'
# Same shape for a run of forward slashes; a slash at the very start or end
# of a token is therefore not part of the match.
slash_fwd = r'(?:[\w_]+/+[\w_]+)'
# A run of a-z / '-' that is bounded by whitespace (or string edges), with at
# most one trailing punctuation mark allowed by the lookahead. This matches
# the plain words, i.e. the ones that contain no slash at all.
slash_all = r'(?<!\S)[a-z-]+(?=[,.!?:;]?(?!\S))'
# Delete every match of the respective pattern.
strt = re.sub(slash_back,"",sen)
strt1 = re.sub(slash_fwd,"",sen1)
strt2 = re.sub(slash_all,"",sen1)
# Python 2 print statements.
print strt
print strt1
print strt2
I would like to get:
this is and
this i\s and
this and
However, I receive:
and
this i\s / and /
i\s /re/store b//fre/
To add: in this scenario the "word" is a string separated either by spaces or punctuation signs (like a regular text)

How's this? I added some punctuation examples:
import re

# Sample sentences; raw strings keep every backslash literal.
sen = r'this is \re\store and b\\fre'
sen1 = r'this i\s /re/store and b//fre/'
sen2 = r'this is \re\store, and b\\fre!'
sen3 = r'this i\s /re/store, and b//fre/!'

# Each pattern consumes optional leading whitespace followed by one "word"
# (word characters) that contains at least one slash of the given kind.
slash_back = r'\s*(?:[\w_]*\\(?:[\w_]*\\)*[\w_]*)'
slash_fwd = r'\s*(?:[\w_]*/(?:[\w_]*/)*[\w_]*)'
slash_all = r'\s*(?:[\w_]*[/\\](?:[\w_]*[/\\])*[\w_]*)'

# Apply every (pattern, sentence) pair in the order the results are printed.
strt, strt1, strt2, strt3, strt4, strt5 = (
    re.sub(pattern, "", sentence)
    for pattern, sentence in (
        (slash_back, sen),
        (slash_fwd, sen1),
        (slash_all, sen1),
        (slash_back, sen2),
        (slash_fwd, sen3),
        (slash_all, sen3),
    )
)

for cleaned in (strt, strt1, strt2, strt3, strt4, strt5):
    print(cleaned)
Output:
this is and
this i\s and
this and
this is, and!
this i\s, and!
this, and!

One way you could do it without re is with join and a comprehension.
# Raw strings: without the r prefix, \r would be a carriage return and \s an
# invalid escape sequence, so the samples would not contain the text shown.
sen = r'this is \re\store and b\\fre'
sen1 = r'this i\s /re/store and b//fre/'


def remove_back(s):
    """Return s without the whitespace-separated words that contain a backslash.

    The remaining words are re-joined with single spaces.
    """
    return ' '.join(i for i in s.split() if '\\' not in i)


def remove_forward(s):
    """Return s without the whitespace-separated words that contain a forward slash.

    The remaining words are re-joined with single spaces.
    """
    return ' '.join(i for i in s.split() if '/' not in i)
>>> print(remove_back(sen))
this is and
>>> print(remove_forward(sen1))
this i\s and
>>> print(remove_back(remove_forward(sen1)))
this and

Related

Line split is not functioning as intended

I am trying to get this code to split one at a time, but it is not functioning as expected:
# Build acronyms_dict from text_line (both are defined outside this snippet):
# the first field of each line is the acronym, the rest is its meaning.
for line in text_line:
# Splits at the first literal space character only; other whitespace such as
# a tab is not treated as a separator by split(' ', 1).
one_line = line.split(' ',1)
# Lines that did not split into two parts are skipped.
if len(one_line) > 1:
acro = one_line[0].strip()
meaning = one_line[1].strip()
# A repeated acronym gets the new meaning appended after ', '.
if acro in acronyms_dict:
acronyms_dict[acro] = acronyms_dict[acro] + ', ' + meaning
else:
acronyms_dict[acro] = meaning
Remove the ' ' from the str.split. The file is using tabs to delimit the acronyms:
import requests

# Download the acronym list. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP error
# page from being parsed as if it were the data file.
data_site = requests.get(
    "https://raw.githubusercontent.com/priscian/nlp/master/OpenNLP/models/coref/acronyms.txt",
    timeout=30,
)
data_site.raise_for_status()

text_line = data_site.text.split("\n")
acronyms_dict = {}
for line in text_line:
    # With no separator argument, split() breaks on any run of whitespace,
    # so tab-delimited lines are handled as well.
    one_line = line.split(maxsplit=1)  # <-- remove the ' '
    if len(one_line) > 1:
        acro = one_line[0].strip()
        meaning = one_line[1].strip()
        # A repeated acronym gets the new meaning appended after ", ".
        if acro in acronyms_dict:
            acronyms_dict[acro] = acronyms_dict[acro] + ", " + meaning
        else:
            acronyms_dict[acro] = meaning
print(acronyms_dict)
Prints:
{
'24KHGE': '24 Karat Heavy Gold Electroplate',
'2B1Q': '2 Binary 1 Quaternary',
'2D': '2-Dimensional',
...

How to add a space in hangman game

I'm trying to make a game, where a song name is picked from a file, and the title is replaced with underscores (apart from the first letter)
However, I'm not sure how to add a space into it, as some song names are more than one word. This is what I have currently:
# Picks a random line from songnamefile.txt, splits it on '-' into a song
# name and an artist, masks characters of the song name with "_" and prints
# the result together with the artist.
def QuizStart():
# NOTE(review): the file object returned by open() is never closed explicitly.
line = random.choice(open('songnamefile.txt').readlines())
# NOTE(review): the result of this split is discarded; the next line repeats it.
line.split('-')
# Unpacking into two names requires the line to contain exactly one '-'.
# Any line terminator kept by readlines() stays attached to `artist`.
songname, artist = line.split('-')
underscoresong = songname
i=0
# Indices 1 .. len(songname)-1, i.e. every position except the first character.
song_name = range(1,len(songname))
for i in song_name:
# NOTE(review): this tests the range object `song_name`, not the character
# songname[i]; a string is never a member of a range of ints, so the
# condition is always False.
if ' ' in song_name:
# The for loop rebinds i on every pass, so this increment does not change
# which index is visited next.
i=i+1
else:
# str.replace substitutes EVERY occurrence of that character in the title,
# not only the one at index i (this can include the first letter and spaces).
underscoresong = underscoresong.replace(songname[i],"_")
i=i+1
print(underscoresong, ' - ', artist)
It would be good to include the expected output for given example inputs.
You can just multiply an array containing the placeholder character n times. e.g.:
songname = 'My blue submarine'
# One '_' placeholder for every character after the first.
placeholders = ['_'] * (len(songname) - 1)
underscoresong = ''.join([songname[0], *placeholders])
print(underscoresong)
Output:
M________________
That will add the first character and then the underscore for as long as the songname is, minus one (for the first character). The join converts it to a string.
Or if you want to preserve spaces:
# Mask everything after the first character, but let spaces through unchanged.
masked_tail = [' ' if c == ' ' else '_' for c in songname[1:]]
underscoresong = ''.join([songname[0]] + masked_tail)
print(underscoresong)
Output:
M_ ____ _________
Or if you want to also preserve the single quote:
songname = "God's Plan-Drake"
# Characters that stay visible in the masked title.
visible = {' ', "'"}
masked_tail = [c if c in visible else '_' for c in songname[1:]]
underscoresong = ''.join([songname[0]] + masked_tail)
print(underscoresong)
Output:
G__'_ __________
You could also use regular expressions:
import re
songname = "God's Plan-Drake"
# After the first character, anything that is neither a space nor an
# apostrophe is replaced by "_".
first_char, remainder = songname[0], songname[1:]
underscoresong = first_char + re.sub(r"[^ ']", '_', remainder)
print(underscoresong)
Output:
G__'_ __________

String Operation on captured group in re Python

I have a string:
str1 = "abc = def"
I want to convert it to:
str2 = "abc = #Abc#"
I am trying this:
re.sub("(\w+) = (\w+)",r"\1 = %s" % ("#"+str(r"\1").title()+"#"),str1)
but it returns: (without the string operation done)
"abc = #abc#"
What is the possible reason .title() is not working?
How to use string operation on the captured group in python?
You can see what's going on with the help of a little function:
import re

str1 = "abc = def"


def fun(m):
    """Print the value that was passed in and hand it back unchanged."""
    print("In fun(): " + m)
    return m


# fun() runs here, while the replacement template is being assembled and
# before re.sub() is called, so the value it receives is the literal text \1.
replacement = r"\1 = %s" % ("#" + fun(r"\1") + "#")
str2 = re.sub(r"(\w+) = (\w+)", replacement, str1)
Which yields
In fun(): \1
So what you are basically doing is calling the string method on the literal text \1 (not on its substitute!), which obviously leaves \1 unchanged. The \1 is only replaced with the captured content later, after your call to str.title() has already run.
Go with a lambda function as proposed by @Rakesh.
Try using lambda.
Ex:
import re
str1 = "abc = def"
# re.sub() calls the lambda with the match object, so .title() operates on
# the text that was actually captured rather than on the literal "\1".
# The pattern is a raw string so that \w reaches the regex engine as written
# instead of being treated as an (invalid) string escape.
str2 = re.sub(
    r"(?P<one>\w+) = (\w+)",
    lambda match: '{0} = #{1}#'.format(match.group('one'), match.group('one').title()),
    str1,
)
print(str2)
Output:
abc = #Abc#

Removing Spaces in Python not working

I am trying to remove spaces.
I have tried everything from previous threads including re.sub
Code:
# Read a phrase, lower-case it, and report whether it reads the same reversed.
wordinput = (input("Input:\n"))
wordinput = wordinput.lower()
# NOTE(review): each of the three calls below starts again from `wordinput`,
# and each assignment overwrites the previous one, so only the last call
# (removing ".") is reflected in `cleanword`.
cleanword = wordinput.replace(" ","")
cleanword = wordinput.replace(",","")
cleanword = wordinput.replace(".","")
# Reversed copy of cleanword.
revword = cleanword [::-1]
print(cleanword)
print(revword)
print("Output:")
if (cleanword == revword):
print('"The word ' + wordinput + ' is a palindrome!"')
else:
print('"Unfortunately the word ' + wordinput + ' is not a palindrome. :(')
Output:
Input:
mr owl ate my metal worm
mr owl ate my metal worm
mrow latem ym eta lwo rm
Output:
"Unfortunately the word mr owl ate my metal worm is not a palindrome. :(
The problem you are having is here:
cleanword = wordinput.replace(" ","")
cleanword = wordinput.replace(",","")
cleanword = wordinput.replace(".","")
You are not saving the results of the previous replace.
Try:
cleanword = wordinput.replace(" ", "").replace(",", "").replace(".", "")
@StephenRauch explains your problem well.
But here is a better way to implement your logic:
chars = ',. '
wordinput = 'mr owl ate my metal worm '
# A translation table that maps a code point to None makes translate() drop
# that character.
drop_table = {ord(ch): None for ch in chars}
cleanword = wordinput.translate(drop_table)
# 'mrowlatemymetalworm'
Did you try something like:
import re
cleanword = re.sub(r'\W+', '', wordinput.lower())
wordinput = (input("Input:\n"))
cleanword=''.join([e for e in wordinput.lower() if e not in ", ."])
You could try this comprehension
Maybe you should try this one:
# input() is the Python 3 spelling; raw_input() exists only in Python 2 and
# raises NameError on Python 3.
wordinput = input("Input:\n")
# Lower-case the phrase and drop commas, full stops and spaces.
cleanword = ''.join([x for x in wordinput.lower() if x not in (',', '.', ' ')])
# A palindrome equals its own reverse.
if cleanword == cleanword[::-1]:
    print('"The word ' + wordinput + ' is a palindrome!"')
else:
    print('"The word ' + wordinput + ' is not a palindrome!"')
After the first replace, every subsequent replace must be called on cleanword (the updated string) instead of wordinput. You can try the following:
wordinput = input("Input:\n").lower()
# Start from the lower-cased input and keep reassigning `cleanword`, so that
# every removal builds on the result of the previous one.
cleanword = wordinput
for unwanted in (" ", ",", "."):
    cleanword = cleanword.replace(unwanted, "")
revword = "".join(reversed(cleanword))
print(cleanword)
print(revword)
print("Output:")
if cleanword != revword:
    print('"Unfortunately the word ' + wordinput + ' is not a palindrome. :(')
else:
    print('"The word ' + wordinput + ' is a palindrome!"')

Changing words in a string to capitalize a text file

In order to fix a bunch of all-uppercase text files, I have written a script that:
Lowers all characters and capitalizes the first word of each line and the first word after a period.
Capitalizes all words that are in a list of city and country names (from another text file)
def lowit(line):
    """Lower-case line and capitalize the first character of each sentence.

    Sentences are the pieces obtained by splitting on '. ' (full stop plus
    space); they are re-joined with the same separator.

    Args:
        line: The text to convert.

    Returns:
        The converted text. An empty sentence (empty input, or text that ends
        in '. ') is passed through unchanged.
    """
    sentences = line.lower().split('. ')
    # Slicing with [:1] yields '' for an empty sentence, where indexing with
    # [0] would raise IndexError.
    return '. '.join(
        sentence[:1].capitalize() + sentence[1:] for sentence in sentences
    )
def capcico(line, allKeywords):
    """Capitalize every space-separated token of line that names a keyword.

    A token is looked up in allKeywords after removing ',', '.' and ';' from
    it. Nothing else is removed, so any other attached character (a trailing
    newline, for instance) remains part of the token and prevents a match.
    Matching tokens are replaced by their str.capitalize() form; all other
    tokens are kept as they are.

    Args:
        line: The text to process.
        allKeywords: Container of keywords to look the tokens up in.

    Returns:
        The tokens joined back together with single spaces.
    """
    rebuilt = []
    for token in line.split(' '):
        bare = token.replace(',', '').replace('.', '').replace(';', '')
        rebuilt.append(token.capitalize() if bare in allKeywords else token)
    return ' '.join(rebuilt)
def main():
    """Convert ulm.txt line by line and write the result to fixed.txt.

    The keyword list is read from allist.txt, one keyword per line. Every
    line of ulm.txt is passed through lowit() and then capcico(), and the
    result is written to fixed.txt preceded by a newline.
    """
    # `with` closes each file even if an error occurs while processing.
    with open('allist.txt', 'r') as f:
        allKeywords = f.read().split('\n')
    with open('ulm.txt', 'r') as fileinput, open('fixed.txt', 'w') as dfile:
        for line in fileinput:
            low_line = lowit(line)
            dfile.write('\n' + capcico(low_line, allKeywords))


if __name__ == '__main__':
    main()
It works, but the problem is that it doesn't capitalize a city/country if there is more than one on the same line:
TOWN IN WUERTTEMBERG, GERMANY.
changes to:
Town in Wuerttemberg, germany.
Any ideas as to what's wrong?
TNX
It is because "germany" is really "germany\n".
Strip the EOL off the word...
words = words.replace(',', '')
words = words.replace('.', '')
words = words.replace(';', '')
# Add in this line to strip the EOL
words = words.rstrip('\r\n')
# Input
with open("ulm.txt") as source:
    fileinput = source.read()

# Input lower
filow = fileinput.lower()

# Keywords: one per line in allist.txt
with open("allist.txt") as keyword_file:
    allKeywords = keyword_file.read().split("\n")
for kw in allKeywords:
    name = kw.strip()
    if not name:
        # A blank line would turn into replace("", ...), which inserts the
        # replacement between every character of the text.
        continue
    filow = filow.replace(name.lower(), name.capitalize())

# Dots: upper-case the first non-blank character after every full stop.
# Only that one character is touched, so keywords capitalized above keep
# their capital letters.
fidots = filow.split(".")
for i, d in enumerate(fidots):
    body = d.lstrip()
    if body:
        leading = d[:len(d) - len(body)]
        fidots[i] = leading + body[0].upper() + body[1:]

# Result
result = ".".join(fidots)
with open("fixed.txt", "w") as dfile:
    dfile.write(result)

Categories