import re
import spacy
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.corpus import wordnet

inputfile = open('inputfile.txt', 'r')
String = inputfile.read()
nlp = spacy.load('en_core_web_sm')

def candidate_name_extractor(input_string, nlp):
    input_string = str(input_string)
    doc = nlp(input_string)
    # Extract entities
    doc_entities = doc.ents
    # Subset to person type entities
    doc_persons = filter(lambda x: x.label_ == 'PERSON', doc_entities)
    doc_persons = filter(lambda x: len(x.text.strip().split()) >= 2, doc_persons)
    doc_persons = list(map(lambda x: x.text.strip(), doc_persons))
    print(doc_persons)
    # Assuming that the first PERSON entity with at least two tokens is the candidate's name
    candidate_name = doc_persons[0]
    return candidate_name

if __name__ == '__main__':
    names = candidate_name_extractor(String, nlp)
    print(names)
I want to extract the candidate's name from a text file, but it returns the wrong value. When I remove list() around map(), the map doesn't work either and gives an error.
import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.corpus import wordnet

String = 'Ravana was killed in a war'

Sentences = nltk.sent_tokenize(String)
Tokens = []
for Sent in Sentences:
    Tokens.append(nltk.word_tokenize(Sent))
Words_List = [nltk.pos_tag(Token) for Token in Tokens]

Nouns_List = []
for List in Words_List:
    for Word in List:
        if re.match('NN.*', Word[1]):
            Nouns_List.append(Word[0])

Names = []
for Nouns in Nouns_List:
    if not wordnet.synsets(Nouns):
        Names.append(Nouns)

print(Names)
Check this code. I am getting Ravana as output.
EDIT:
I used a few sentences from my resume to create a text file, and gave it as input to my program. Only the changed portion of the code is shown below:
import io
File = io.open("Documents\\Temp.txt", 'r', encoding = 'utf-8')
String = File.read()
String = re.sub('[/|.|#|%|\d+]', '', String)
And it is returning all the names that are not in the wordnet corpus, like my name, my house name, my place, and my college's name and place.
From the word list obtained after part-of-speech tagging, extract all the words having a noun tag using a regular expression:
Nouns_List = []
for Word in nltk.pos_tag(Words_List):
    if re.match('NN.*', Word[1]):
        Nouns_List.append(Word[0])
For each word in the Nouns_List, check whether it is an English word. This can be done by checking whether synsets are available for that word in wordnet:
from nltk.corpus import wordnet
Names = []
for Nouns in Nouns_List:
    if not wordnet.synsets(Nouns):
        # Not an English word
        Names.append(Nouns)
Since Indian names are unlikely to be entries in an English dictionary, this can be a possible method for extracting them from a text.
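Putting the two steps together, a minimal self-contained sketch could look like this (the function name extract_possible_names is just illustrative, and it assumes the punkt, averaged_perceptron_tagger, and wordnet NLTK data packages are already downloaded):
import re
import nltk
from nltk.corpus import wordnet

def extract_possible_names(text):
    # POS-tag every word in every sentence
    tagged = []
    for sent in nltk.sent_tokenize(text):
        tagged.extend(nltk.pos_tag(nltk.word_tokenize(sent)))
    # Keep only noun tags (NN, NNS, NNP, NNPS)
    nouns = [word for word, tag in tagged if re.match('NN.*', tag)]
    # Nouns with no WordNet synsets are treated as possible names
    return [noun for noun in nouns if not wordnet.synsets(noun)]

print(extract_possible_names('Ravana was killed in a war'))  # expected: ['Ravana']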
I am trying to remove Japanese stopwords from a text corpus from twitter.
Unfortunately the frequently used nltk does not contain Japanese, so I had to figure out a different way.
This is my MWE:
import urllib
from urllib.request import urlopen
import MeCab
import re
# slothlib
slothlib_path = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
sloth_file = urllib.request.urlopen(slothlib_path)
# stopwordsiso
iso_path = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
iso_file = urllib.request.urlopen(iso_path)
stopwords = [line.decode("utf-8").strip() for line in iso_file]
stopwords = [ss for ss in stopwords if not ss==u'']
stopwords = list(set(stopwords))
text = '日本語の自然言語処理は本当にしんどい、と彼は十回言った。'
tagger = MeCab.Tagger("-Owakati")
tok_text = tagger.parse(text)
ws = re.compile(" ")
words = [word for word in ws.split(tok_text)]
if words[-1] == u"\n":
    words = words[:-1]
ws = [w for w in words if w not in stopwords]
print(words)
print(ws)
Successfully completed: it does give out the original tokenized text as well as the one without stopwords:
['日本語', 'の', '自然', '言語', '処理', 'は', '本当に', 'しんどい', '、', 'と', '彼', 'は', '十', '回', '言っ', 'た', '。']
['日本語', '自然', '言語', '処理', '本当に', 'しんどい', '、', '十', '回', '言っ', '。']
There are still 2 issues I am facing though:
a) Is it possible to take 2 stopword lists into account, namely iso_file and sloth_file, so that a word is removed if it is a stopword in either iso_file or sloth_file? (I tried to change line 14 to
stopwords = [line.decode("utf-8").strip() for line in zip('iso_file','sloth_file')]
but received an error because tuple attributes may not be decoded.)
b) The ultimate goal would be to generate a new text file in which all stopwords are removed.
I have created this MWE:
### first clean twitter csv
import pandas as pd
import re
import emoji
df = pd.read_csv("input.csv")
def cleaner(tweet):
    tweet = re.sub(r"#[^\s]+","",tweet) #Remove #username
    tweet = re.sub(r"(?:\#|http?\://|https?\://|www)\S+|\\n","", tweet) #Remove http links & \n
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    return tweet
df['text'] = df['text'].map(lambda x: cleaner(x))
df['text'].to_csv(r'cleaned.txt', header=None, index=None, sep='\t', mode='a')
### remove stopwords
import urllib
from urllib.request import urlopen
import MeCab
import re
# slothlib
slothlib_path = "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
sloth_file = urllib.request.urlopen(slothlib_path)
#stopwordsiso
iso_path = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
iso_file = urllib.request.urlopen(iso_path)
stopwords = [line.decode("utf-8").strip() for line in iso_file]
stopwords = [ss for ss in stopwords if not ss==u'']
stopwords = list(set(stopwords))
with open("cleaned.txt",encoding='utf8') as f:
    cleanedlist = f.readlines()
cleanedlist = list(set(cleanedlist))
tagger = MeCab.Tagger("-Owakati")
tok_text = tagger.parse(cleanedlist)
ws = re.compile(" ")
words = [word for word in ws.split(tok_text)]
if words[-1] == u"\n":
    words = words[:-1]
ws = [w for w in words if w not in stopwords]
print(words)
print(ws)
While it works for the simple input text in the first MWE, for the MWE I just posted I get the error:
in method 'Tagger_parse', argument 2 of type 'char const *'
Additional information:
Wrong number or type of arguments for overloaded function 'Tagger_parse'.
Possible C/C++ prototypes are:
MeCab::Tagger::parse(MeCab::Lattice *) const
MeCab::Tagger::parse(char const *)
for this line: tok_text = tagger.parse(cleanedlist)
So I assume I will need to make amendments to the cleanedlist?
I have uploaded the cleaned.txt on github for reproducing the issue:
[txt on github][1]
Also: how would I be able to get the tokenized list that excludes stopwords back into a text format like cleaned.txt? Would it be possible to create a df of ws for this purpose?
Or might there even be a simpler way?
Sorry for the long request, I tried a lot and tried to make it as easy as possible to understand what I'm driving at :-)
Thank you very much!
[1]: https://gist.github.com/yin-ori/1756f6236944e458fdbc4a4aa8f85a2c
It sounds like you want to:
combine two lists of stopwords
save text that has had stopwords removed
For problem 1, if you have two lists you can make them into one list with full_list = list1 + list2. You can then make them into a set after that.
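For example, a minimal sketch reusing the iso_file and sloth_file handles from your code (assuming both URLs return one stopword per line as plain text):
# Decode and strip each downloaded list, then combine and deduplicate.
iso_stopwords = [line.decode("utf-8").strip() for line in iso_file]
sloth_stopwords = [line.decode("utf-8").strip() for line in sloth_file]

full_list = iso_stopwords + sloth_stopwords   # problem 1: one combined list
stopwords = set(s for s in full_list if s)    # drop empty lines, deduplicate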
The reason you are getting the MeCab error is probably that you are passing a list to parse, which expects a string. (What MeCab wrapper are you using? I have never seen that particular error.) As a note, you should pass each individual tweet to MeCab, instead of the combined text of all tweets, something like:
tokenized = [tagger.parse(tweet) for tweet in cleanedlist]
That should resolve your problem.
Saving text with stopwords removed is just the same as any text file.
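For instance, a rough sketch building on your own variables (cleanedlist, tagger, stopwords); the output filename cleaned_no_stopwords.txt is just an illustrative choice:
# Tokenize each tweet, drop stopwords, and write one cleaned line per tweet.
with open("cleaned_no_stopwords.txt", "w", encoding="utf-8") as out:
    for tweet in cleanedlist:
        tokens = tagger.parse(tweet).split()
        kept = [t for t in tokens if t not in stopwords]
        out.write(" ".join(kept) + "\n")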
As a separate point...
Stopword lists are not very useful in Japanese because if you're using something like MeCab you already have part of speech information. So you should use that instead to throw out verb endings, function words, and so on.
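As a rough illustration only (this assumes the first comma-separated field of node.feature is the coarse part of speech, which holds for the common ipadic and unidic dictionaries, and the POS whitelist here is just an example):
import MeCab

def content_words(text):
    # Walk MeCab's node list and keep only nouns, verbs and adjectives,
    # dropping particles, auxiliaries, punctuation, etc.
    tagger = MeCab.Tagger()
    keep_pos = ('名詞', '動詞', '形容詞')
    words = []
    node = tagger.parseToNode(text)
    while node:
        if node.feature.split(',')[0] in keep_pos:
            words.append(node.surface)
        node = node.next
    return words

print(content_words('日本語の自然言語処理は本当にしんどい、と彼は十回言った。'))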
Also removing stopwords is probably actively unhelpful if you're using any modern NLP methods, see the spaCy preprocessing FAQ.
I am very new to Python and I am trying to extract emotions from the sentences in a DataFrame using SenticNet.
This is my code, but it's not correct, and I don't know what's wrong:
import pandas as pd
from senticnet.senticnet import SenticNet

def emotion_list1(text):
    Emotion_list = []
    Emotion = pd.DataFrame(columns=['Emotion'])
    sn = SenticNet()
    for elemnt in text:
        for word in elemnt:
            try:
                Emotion_list.append(sn.moodtags(word))
            except:
                pass
    Emotion = Emotion.append(pd.Series(Emotion_list), ignore_index=True)
    return Emotion

dfe = pd.DataFrame()
clean_text_list = df['translated'].values
words_list = [text.split() for text in clean_text_list]
dfe = emotion_list1(words_list)
Are you facing any specific errors? I am able to extract the emotions using sn.moodtags() from a sentence.
# import
from senticnet.senticnet import SenticNet
from nltk.tokenize import word_tokenize
# define sentinet()
sn = SenticNet()
# create empty list to store results
emotion_list = []
# tokenize text
# you can use word_tokenize() from the nltk library to tokenize your text
text = 'love hate python'
tokenized_text = word_tokenize(text)
# loop through the tokenized text, get the mood tags for each word, and append them to the list
for word in tokenized_text:
    emotion_list.append(sn.moodtags(word))
# print
print(emotion_list)
This outputs:
[['#joy', '#eagerness'], ['#pleasantness', '#fear'], ['#pleasantness', '#fear']]
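If you then want this per row of a DataFrame, a hedged sketch (assuming a 'translated' column as in your snippet, and skipping words SenticNet does not know, which would otherwise raise an exception, as in your try/except):
import pandas as pd
from senticnet.senticnet import SenticNet
from nltk.tokenize import word_tokenize

sn = SenticNet()

def emotions_for_text(text):
    # Collect the mood tags of every word SenticNet recognizes.
    emotions = []
    for word in word_tokenize(text):
        try:
            emotions.extend(sn.moodtags(word))
        except Exception:
            pass  # word not in SenticNet
    return emotions

df = pd.DataFrame({'translated': ['love hate python', 'I love pizza']})
df['Emotion'] = df['translated'].apply(emotions_for_text)
print(df)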
I'm a new student of natural language processing and I have a task regarding simple corpus analysis. Given an input file (MovieCorpus.txt) we are assigned to compute the following statistics:
Number of sentences, tokens, types (lemmas)
Distribution of sentence length, types, POS
import nltk
import spacy as sp
from nltk import word_tokenize

# Setting spaCy model
nlp = sp.load('en_core_web_sm')

# Movie Corpus
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

# Tokenize, POS, Lemma
tokens = []
lemma = []
pos = []
for doc in nlp.pipe(read_data):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

ls = len(read_data)
print("The amount of sentences is %d:" % ls)
lt = len(tokens)
print("The amount of tokens is %d:" % lt)
ll = len(lemma)
print("The amount of lemmas is %d:" % ll)
This is my attempt at answering those questions, but since the file is very large (>300,000 sentences) it takes forever to analyze. Is there anything I did wrong? Should I rather use NLTK instead of spaCy?
import pandas as pd
import nltk
from nltk import word_tokenize

# Movie Corpus
with open('MovieCorpus.txt', 'r') as f:
    read_data = f.read().splitlines()

df = pd.DataFrame({"text": read_data})  # Assuming your data has no header
data = df.head(10).copy()  # work on the first 10 rows while testing

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

data['lemma'] = data.text.apply(lemmatize_text)
data["tokens"] = data.text.apply(nltk.word_tokenize)
data["posR"] = data.tokens.apply(lambda x: nltk.pos_tag(x))
tags = [[tag for word, tag in _] for _ in data["posR"].to_list()]
data["pos"] = tags
print(data)
From here on you should be able to do all other tasks by yourself.
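For instance, the requested distributions could be tallied from the columns above with a sketch like this (simple Counter-based counts over the DataFrame built in the answer):
from collections import Counter

# Distribution of sentence length (in tokens)
sentence_lengths = data["tokens"].apply(len)
print(sentence_lengths.value_counts().sort_index())

# Distribution of types (lemma frequencies)
lemma_counts = Counter(lemma for lemmas in data["lemma"] for lemma in lemmas)
print(lemma_counts.most_common(10))

# Distribution of POS tags
pos_counts = Counter(tag for tags in data["pos"] for tag in tags)
print(pos_counts)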
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

def get_luhn_summary(text):
    summ = list()
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, 10):
        summ.append(str(sentence))
    return summ

summaryA_luhn = get_luhn_summary(textA)
It always returns the original string. I am confused because I am following the documentation to a T.
The summarization is done by sentence count: you asked for 10 sentences in get_luhn_summary, so if the input text has 10 or fewer sentences, the summarizer simply returns all of them, i.e. the original text.
import nltk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
LANGUAGE = "english"
SENTENCES_COUNT = 2
nltk.download('punkt')
parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)
The code above reads sentences from the file document.txt and summarizes them down to the number of sentences you specify in SENTENCES_COUNT.
So if document.txt has 10 sentences and you set SENTENCES_COUNT = 2 you will get a summarization of two sentences.
You can also simply swap out:
parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
with:
text = "This is the string to parse. Hopefully it will be more than one sentence. Like so!"
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
if you want to parse from a string instead of a file.
I am trying to remove punctuation from a list of words. I am new to Python programming, so if someone could help, that would be great. The purpose of this is email spam classification. Previously I had joined the words after checking whether punctuation was present, but this gave me single characters rather than whole words. After changing it to get whole words, this is what I have below, so now I am trying to remove the punctuation, since it won't work the same way as before.
import os
import string
from collections import Counter
from os import listdir # return all files and folders in the directory
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# used for importing the lingspam dataset
def importLingspamDataset(dir):
    allEmails = [] # for storing the emails once read
    fileNames = []
    for file in listdir(dir):
        f = open((dir + '/' + file), "r") # used for opening the file in read only format
        fileNames.append(file)
        allEmails.append(f.read()) # appends the read emails to the emails array
        f.close()
    return allEmails, fileNames

def importEnronDataset(dir):
    allEmails = [] # for storing the emails once read
    fileNames = []
    for file in listdir(dir):
        f = open((dir + '/' + file), "r") # used for opening the file in read only format
        fileNames.append(file)
        allEmails.append(f.read()) # appends the read emails to the emails array
        f.close()
    return allEmails, fileNames

# used to remove punctuation from the emails as this is of no use for detecting spam
def removePunctuation(cleanedEmails):
    punc = set(string.punctuation)
    for word, line in enumerate(cleanedEmails):
        words = line.split()
        x = [''.join(c for c in words if c not in string.punctuation)]
        allWords = []
        allWords += x
    return allWords

# used to remove stopwords i.e. words of no use in detecting spam
def removeStopwords(cleanedEmails):
    removeWords = set(stopwords.words('english')) # sets all the stopwords to be removed
    for stopw in removeWords: # for each word in remove words
        if stopw not in removeWords: # if the word is not in the stopwords to be removed
            cleanedEmails.append(stopw) # add this word to the cleaned emails
    return(cleanedEmails)
# function to return words to their root form - allows simplicity
def lemmatizeEmails(cleanedEmails):
    lemma = WordNetLemmatizer() # to be used for returning each word to its root form
    lemmaEmails = [lemma.lemmatize(i) for i in cleanedEmails] # lemmatize each word in the cleaned emails
    return lemmaEmails

# function to allow a systematic process of eliminating the undesired elements within the emails
def cleanAllEmails(cleanedEmails):
    cleanPunc = removePunctuation(cleanedEmails)
    cleanStop = removeStopwords(cleanPunc)
    cleanLemma = lemmatizeEmails(cleanStop)
    return cleanLemma
def createDictionary(email):
    allWords = []
    allWords.extend(email)
    dictionary = Counter(allWords)
    dictionary.most_common(3000)
    word_cloud = WordCloud(width=400, height=400, background_color='white',
                           min_font_size=12).generate_from_frequencies(dictionary)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    word_cloud.to_file('test1.png')

def featureExtraction(email):
    emailFiles = []
    emailFiles.extend(email)
    featureMatrix = np.zeros((len(emailFiles), 3000))

def classifyLingspamDataset(email):
    classifications = []
    for name in email:
        classifications.append("spmsg" in name)
    return classifications
# Lingspam dataset
trainingDataLingspam, trainingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/train-mails") # extract the training emails from the dataset
#testingDataLingspam, testingLingspamFilename = importLingspamDataset("spam-non-spam-dataset/test-mails") # extract the testing emails from the dataset
trainingDataLingspamClean = cleanAllEmails(trainingDataLingspam)
#testingDataLingspamClean = cleanAllEmails(testingDataLingspam)
#trainClassifyLingspam = classifyLingspamDataset(trainingDataLingspam)
#testClassifyLingspam = classifyLingspamDataset(testingDataLingspam)
trainDictionary = createDictionary(trainingDataLingspamClean)
#createDictionary(testingDataLingspamClean)
#trainingDataEnron, trainingEnronFilename = importEnronDataset("spam-non-spam-dataset-enron/bigEmailDump/training/")
Based on your question, I assume that you have a list of emails and that, for each email, you would like to remove the punctuation marks. This answer is based on the first revision of the code you posted.
import string

def removePunctuation(emails):
    # I am using a list comprehension here to iterate over the emails.
    # For each iteration, translate the email to remove the punctuation marks.
    # Translate only allows a translation table as an argument.
    # This is why str.maketrans is used to create the translation table.
    cleaned_emails = [email.translate(str.maketrans('', '', string.punctuation))
                      for email in emails]
    return cleaned_emails

if __name__ == '__main__':
    # Assuming cleanedEmails is a list of emails,
    # I am substituting cleanedEmails with emails.
    # I used cleanedEmails as the result.
    emails = ["This is a, test!", "This is another##! \ntest"]
    cleaned_emails = removePunctuation(emails)
    print(cleaned_emails)
input: ["This is a, test!", "This is another##! \ntest"]
output: ['This is a test', 'This is another \ntest']
EDIT:
The issue was resolved after a conversation with the OP. The OP was having an issue with WordCloud, and the solution I provided is working. I managed to guide the OP through getting WordCloud working. The OP is now fine-tuning the results of the WordCloud.