I want to clean my reviews data. Here's my code:

import re
import string

def processData(data):
    data = data.lower()  # casefold
    data = re.sub(r'<[^>]*>', ' ', data)  # remove any html tags
    data = re.sub(r'#([^\s]+)', r'\1', data)  # replace #word with word
    remove = string.punctuation
    remove = remove.replace("'", "")  # don't remove '
    p = r"[{}]".format(remove)  # build the character-class pattern
    data = re.sub(p, "", data)
    data = re.sub(r'[\s]+', ' ', data)  # collapse additional whitespace
    pp = re.compile(r"(.)\1{1,}", re.DOTALL)  # pattern for repeated characters
    data = pp.sub(r"\1\1", data)  # keep at most two repetitions
    return data
This code almost works well, but there is still a problem.
For the sentence "she work in public-service", I get "she work in publicservice".
The problem is that no whitespace is left where the punctuation was removed.
I want the sentence to come out as "she work in public service".
Can you help me fix my code?
I think you want this: instead of deleting the punctuation, replace it with a space:
>>> import re, string
>>> st = 'she works in public-service'
>>> re.sub(r'[{}]'.format(string.punctuation), ' ', st)
'she works in public service'
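If you want to fold this back into your processData while still keeping apostrophes, a minimal sketch might look like this (I've dropped the hashtag and repeated-character steps here to keep it short; the extra re.sub just collapses the doubled spaces afterwards):

import re
import string

def processData(data):
    data = data.lower()
    data = re.sub(r'<[^>]*>', ' ', data)                          # strip html tags
    remove = string.punctuation.replace("'", "")                  # keep apostrophes
    data = re.sub(r"[{}]".format(re.escape(remove)), " ", data)   # punctuation -> space
    data = re.sub(r'\s+', ' ', data).strip()                      # collapse the doubled spaces
    return data

print(processData("she work in public-service"))  # she work in public service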
I have sentences (log lines) that look like this: ['GET http://10.0.0.0:1000/ HTTP/X.X']
I want to have them in this form:
['GET', 'http://10.0.0.0:1000/', 'HTTP/X.X']
but that's not what I get. I've used this code:
import re
import nltk

sentences = ['GET http://10.0.0.0:1000/ HTTP/X.X']
rx = re.compile(r'\b(\w{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})\s+(\w+)\W+(\d{1,3}(?:\.\d{1,3}){3})(?:\s+\S+){2}\s+\[([^][\s]+)\s+([+\d]+)]\s+"([A-Z]+)\s+(\S+)\s+(\S+)"\s+(\d+)\s+(\d+)\s+\S+\s+"([^"]*)')
words = []
for sent in sentences:
    m = rx.search(sent)
    if m:
        words.append(list(m.groups()))
    else:
        words.append(nltk.word_tokenize(sent))
print(words)
I get as output:
[['GET', 'http', ':', '//10.0.0.0:1000/', 'HTTP/X.X']]
Does anyone know where the error is, or why it's not working as I want?
Thank you
import re

sentences = ['GET http://10.0.0.0:1000/ HTTP/X.X']
words = []
for sent in sentences:
    words.append(sent.split(' '))
print(words)

Can you use a simple space split? Your long regex is written for full access-log lines, so it never matches these short strings; the code then falls back to nltk.word_tokenize, which splits the URL at the colon and gives you the output you are seeing.
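You can see the colon split directly; this reproduces the inner list from the output in your question:

>>> import nltk
>>> nltk.word_tokenize('GET http://10.0.0.0:1000/ HTTP/X.X')
['GET', 'http', ':', '//10.0.0.0:1000/', 'HTTP/X.X']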
Looks like you want to split on a space. So:
sentences = ['GET http://10.0.0.0:1000/ HTTP/X.X']
words = [x.split(" ") for x in sentences]
print(words)
I wrote the code below. My sentences are tweets. I want to remove all emojis from my list, but my emoji pattern does not work. Why?
I also want to remove usernames. A username appears at the beginning of a sentence, but sometimes the code keeps it and sometimes it removes it. My punctuation removal does not work either, so I commented it out. How can I fix that?
import spacy, re
from nltk.corpus import stopwords

nlp = spacy.load('en')
stop_words = [w.lower() for w in stopwords.words()]

def sanitize(input_string):
    """ Sanitize one string """
    # Remove emoji
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    string = emoji_pattern.sub(r'', input_string)  # No emoji
    # Normalize to lowercase
    string = input_string.lower()
    # Spacy tokenizer
    string_split = [token.text for token in nlp(string)]
    # In case the string is empty
    if not string_split:
        return ''
    # Remove user
    # Assuming the user is the first word and contains an @
    if '@' in string_split[0]:
        del string_split[0]
    # Join back to string
    string = ' '.join(string_split)
    # Remove @ and #
    for punc in '":!@#':
        string = string.replace(punc, '')
    # Remove 't.co/' links
    string = re.sub(r'http//t.co\/[^\s]+', '', string, flags=re.MULTILINE)
    # Removing stop words
    string = ' '.join([w for w in string.split() if w not in stop_words])
    # Punctuation
    # string = [''.join(w for w in string.split() if w not in string.punctuation) for w in string]
    return string
tweet_list = ['@cosmetic_candy I think a lot of people just enjoy being a pain in the ass on there',
    'Best get ready sunbed and dinner with nana today :)',
    '@hardlyin70 thats awesome!',
    'Loving this weather',
    '“@danny_boy_37: Just seen an absolute idiot in shorts! Be serious!” Desperado gentleman',
    '@SamanthaOrmerod trying to resist a hardcore rave haha! Resisting towns a doddle! Posh dance floor should wear them in quite easy xx',
    '59 days until #Beyonce!!! Wooo #jfracassini #cannotwait',
    'That was the dumbest tweet I ever seen',
    'Oh what to do on this fine sunny day?',
    '@Brooke_C_X hows the fish ? Hope they r ok. Xx',
    '@Jbowe_ 😠',
    'Or this @louise_munchi',
    '@guy_clifton your diary is undoubtedly busier than mine, but feel free to check ',
    'Willy🐟👌⚽']

list_sanitized = [sanitize(string) for string in tweet_list]
list_sanitized[:50]
I'm drawing on some other SO answers here:
removing textual emojis: https://stackoverflow.com/a/61758471/42346
removing graphical emojis: https://stackoverflow.com/a/50602709/42346
This will also remove any Twitter username wherever it appears in the string.
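As a side note, the reason your own emoji pattern seemed to do nothing is that the very next line throws its result away: after string = emoji_pattern.sub(r'', input_string) you lowercase input_string instead of string. The one-line fix inside your function would be:

string = emoji_pattern.sub(r'', input_string)  # no emoji
string = string.lower()                        # lowercase the stripped string, not input_string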
import re
import emoji
import spacy
import stop_words

nlp = spacy.load('en_core_web_sm')
stopwords = [w.lower() for w in stop_words.get_stop_words('en')]

emoticon_string = r"""
(?:
  [<>]?
  [:;=8]                     # eyes
  [\-o\*\']?                 # optional nose
  [\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
  |
  [\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
  [\-o\*\']?                 # optional nose
  [:;=8]                     # eyes
  [<>]?
)"""

def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)

def sanitize(string):
    """ Sanitize one string """
    # remove graphical emoji
    string = give_emoji_free_text(string)
    # remove textual emoji
    string = re.sub(emoticon_string, '', string, flags=re.VERBOSE)
    # normalize to lowercase
    string = string.lower()
    # spacy tokenizer
    string_split = [token.text for token in nlp(string)]
    # in case the string is empty
    if not string_split:
        return ''
    # join back to string
    string = ' '.join(string_split)
    # remove user
    # assuming user has @ in front
    string = re.sub(r"""(?:@[\w_]+)""", '', string)
    # remove @ and #
    for punc in '":!@#':
        string = string.replace(punc, '')
    # remove 't.co/' links
    string = re.sub(r'http//t.co\/[^\s]+', '', string, flags=re.MULTILINE)
    # removing stop words
    string = ' '.join([w for w in string.split() if w not in stopwords])
    return string
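You can then run it over the sample tweets from your question:

list_sanitized = [sanitize(s) for s in tweet_list]  # tweet_list: the list of tweets defined in the question
print(list_sanitized)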
I am using the following code to replace the strings in words with words[0] in the given sentences.
import re

sentences = ['industrial text minings', 'i love advanced data minings and text mining']
words = ["data mining", "advanced data mining", "data minings", "text mining"]

start_terms = sorted(words, key=lambda x: len(x), reverse=True)
start_re = "|".join(re.escape(item) for item in start_terms)

results = []
for sentence in sentences:
    for terms in words:
        if terms in sentence:
            result = re.sub(start_re, words[0], sentence)
            results.append(result)
            break
print(results)
My expected output is as follows:
['industrial text minings', 'i love data mining and data mining']
However, what I am getting is:
['industrial data minings', 'i love data mining and data mining']
In the first sentence, "text minings" is not in words. However, the words list contains "text mining", so the condition "text mining" in "industrial text minings" is True. After the replacement, "text mining" becomes "data mining" with the trailing 's' left in place. I want to avoid such situations.
Therefore, I am wondering if there is a way to use an if condition in re.sub to check whether the next character is a space: if it is, do the replacement, otherwise don't.
I am also happy with other solutions that could resolve my issue.
I modified your code a bit:

# Using Python 3.6.1
import re

sentences = ['industrial text minings and data minings and data', 'i love advanced data mining and text mining as data mining has become a trend']
words = ["data mining", "advanced data mining", "data minings", "text mining", "data", "text"]

# Sort by length
start_terms = sorted(words, key=len, reverse=True)

results = []
# Loop through sentences
for sentence in sentences:
    # Loop through sorted words to replace
    result = sentence
    for term in start_terms:
        # Use exact word matching
        exact_regex = r'\b' + re.escape(term) + r'\b'
        # Replace matches with a blank space (to avoid priority conflicts)
        result = re.sub(exact_regex, " ", result)
    # Replace inserted blank spaces with "data mining"
    blank_regex = r'^\s(?=\s)|(?<=\s)\s$|(?<=\s)\s(?=\s)'
    result = re.sub(blank_regex, words[0], result)
    results.append(result)

# Print sentences
print(results)
Output:
['industrial data mining minings and data mining and data mining', 'i love data mining and data mining as data mining has become a trend']
The regex can be a bit confusing, so here's a quick breakdown:
\bword\b matches exact phrases/words, since \b is a word boundary.
^\s(?=\s) matches a space at the beginning followed by another space.
(?<=\s)\s$ matches a space at the end preceded by another space.
(?<=\s)\s(?=\s) matches a space with a space on both sides.
For more info, look up the syntax for positive lookbehind (?<=...) and positive lookahead (?=...).
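For example, after the term loop has run on the second sentence, the intermediate string is roughly 'i love   and   as   has become a trend'; applying blank_regex then fills each gap:

import re
blank_regex = r'^\s(?=\s)|(?<=\s)\s$|(?<=\s)\s(?=\s)'
print(re.sub(blank_regex, 'data mining', 'i love   and   as   has become a trend'))
# i love data mining and data mining as data mining has become a trend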
You can use a word boundary \b to surround your whole regex:
start_re = "\\b(?:" + "|".join(re.escape(item) for item in start_terms) + ")\\b"
Your regex will become something like:
\b(?:data mining|advanced data mining|data minings|text mining)\b
(?:...) denotes a non-capturing group.
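For instance, with the first sentence from your question the boundary now blocks the partial match (a quick check using the same words list and words[0] replacement as in your code):

import re

words = ["data mining", "advanced data mining", "data minings", "text mining"]
start_terms = sorted(words, key=lambda x: len(x), reverse=True)
start_re = "\\b(?:" + "|".join(re.escape(item) for item in start_terms) + ")\\b"
print(re.sub(start_re, words[0], 'industrial text minings'))
# industrial text minings   (unchanged, because "text mining" here is not followed by a word boundary)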
I have a text file where important phrases are marked with special symbols. To be exact, they start with <highlight> and end with <\highlight>.
For example,
"<highlight>machine learning<\highlight> is gaining more popularity, so do <highlight>block chain<\highlight>."
In this sentence, the important phrases are delimited by <highlight> and <\highlight>.
I need to remove the <highlight> and <\highlight> markers and replace the space joining the words between them with an underscore. That is, "<highlight>machine learning<\highlight>" should become "machine_learning". The whole sentence after processing will be "machine_learning is gaining more popularity, so do block_chain".
Try this:
>>> import re
>>> text = "<highlight>machine learning<\\highlight> is gaining more popularity, so do <highlight>block chain<\\highlight>."
>>> re.sub(r"<highlight>(.*?)<\\highlight>", lambda x: x.group(1).replace(" ", "_"), text)
'machine_learning is gaining more popularity, so do block_chain.'
There you go:

import re

txt = "<highlight>machine learning<\\highlight> is gaining more popularity, so do <highlight>block chain<\\highlight>."
words = re.findall(r'<highlight>(.*?)<\\highlight', txt)
for w in words:
    txt = txt.replace(w, w.replace(' ', '_'))
txt = txt.replace('<highlight>', '')
txt = txt.replace('<\\highlight>', '')
print(txt)
I am trying to preprocess a string with a lemmatizer and then remove the punctuation and digits. I am using the code below to do this. I am not getting any error, but the text is not preprocessed appropriately. Only the stop words are removed; the lemmatizing does not work, and punctuation and digits also remain.
from nltk.stem import WordNetLemmatizer
import string
import nltk
tweets = "This is a beautiful day16~. I am; working on an exercise45.^^^45 text34."
lemmatizer = WordNetLemmatizer()
tweets = lemmatizer.lemmatize(tweets)
data=[]
stop_words = set(nltk.corpus.stopwords.words('english'))
words = nltk.word_tokenize(tweets)
words = [i for i in words if i not in stop_words]
data.append(' '.join(words))
corpus = " ".join(str(x) for x in data)
p = string.punctuation
d = string.digits
table = str.maketrans(p, len(p) * " ")
corpus.translate(table)
table = str.maketrans(d, len(d) * " ")
corpus.translate(table)
print(corpus)
The final output I get is:
This beautiful day16~ . I ; working exercise45.^^^45 text34 .
And expected output should look like:
This beautiful day I work exercise text
No, your current approach does not work: you must pass one word at a time to the lemmatizer/stemmer, otherwise those functions have no way to interpret your string (they expect individual words, not a whole sentence). Also note that str.translate() returns a new string, so corpus.translate(table) does nothing unless you assign the result back to corpus, which is why the punctuation and digits remain.
import re
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
__stop_words = set(nltk.corpus.stopwords.words('english'))

def clean(tweet):
    cleaned_tweet = re.sub(r'([^\w\s]|\d)+', '', tweet.lower())
    return ' '.join([lemmatizer.lemmatize(i, 'v')
                     for i in cleaned_tweet.split() if i not in __stop_words])
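Applied to the tweet from your question it gives something like this (the exact output can vary slightly with your NLTK stop-word list):

print(clean("This is a beautiful day16~. I am; working on an exercise45.^^^45 text34."))
# beautiful day work exercise text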
Alternatively, you can use a PorterStemmer, which chops words down to their stems rather than doing dictionary-based lemmatisation, so it needs no part-of-speech context.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
And, call the stemmer like this:
stemmer.stem(i)
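Putting that together, a sketch of the same function with the stemmer swapped in (clean_stemmed is just an illustrative name; it reuses re and __stop_words from the snippet above):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def clean_stemmed(tweet):
    cleaned_tweet = re.sub(r'([^\w\s]|\d)+', '', tweet.lower())
    return ' '.join([stemmer.stem(i) for i in cleaned_tweet.split() if i not in __stop_words])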
I think this is what you're looking for, but do this prior to calling the lemmatizer, as the commenter noted.
>>> import re
>>> s = "This is a beautiful day16~. I am; working on an exercise45.^^^45 text34."
>>> s = re.sub(r'[^A-Za-z ]', '', s)
>>> s
'This is a beautiful day I am working on an exercise text'
To process a tweet properly you can use the following code:

import re
import string
import nltk

def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """ Normalizes case and handles punctuation
    Inputs:
        text: str: raw text
        lemmatizer: an instance of a class implementing the lemmatize() method
            (the default argument is of type nltk.stem.wordnet.WordNetLemmatizer)
    Outputs:
        list(str): tokenized text
    """
    bcd = []
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text1 = text.lower()
    text1 = re.sub(pattern, "", text1)
    text1 = text1.replace("'s ", " ")
    text1 = text1.replace("'", "")
    text1 = text1.replace("—", " ")
    table = str.maketrans(string.punctuation, 32 * " ")
    text1 = text1.translate(table)
    geek = nltk.word_tokenize(text1)
    abc = nltk.pos_tag(geek)
    output = []
    for value in abc:
        value = list(value)
        if value[1][0] == "N":
            value[1] = 'n'
        elif value[1][0] == "V":
            value[1] = 'v'
        elif value[1][0] == "J":
            value[1] = 'a'
        elif value[1][0] == "R":
            value[1] = 'r'
        else:
            value[1] = 'n'
        output.append(value)
    abc = output
    for value in abc:
        bcd.append(lemmatizer.lemmatize(value[0], pos=value[1]))
    return bcd
Here I have used pos_tag and mapped the tags to WordNet POS codes (only N, V, J, R; everything else is treated as a noun). This will return a tokenized and lemmatized list of words.
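For example, on the tweet from the question (the exact lemmas depend on your NLTK tagger version, so treat the output as illustrative):

print(process("This is a beautiful day16~. I am; working on an exercise45.^^^45 text34."))
# something like ['this', 'be', 'a', 'beautiful', 'day16', 'i', 'be', 'work', 'on', 'an', 'exercise45', '45', 'text34']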