What am I doing wrong? I already defined it, but it keeps saying it's not defined.
# Pre-process the comments
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    stop_words.update(["a", "an", "and", "are", "as", "at", "be", "by",
                       "for", "from", "has", "he", "in", "is", "it", "its", "of", "on",
                       "that", "the", "to", "was", "were", "will", "with"])
    words = [word for word in words if word not in stop_words]
    # Lemmatize the remaining words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

comments_processed = [preprocess_text(comment) for comment in comments]
# Perform sentiment analysis on the comments
sentiments = []
for comment in comments:
    sentiment = TextBlob(comment).sentiment.polarity
    sentiments.append(sentiment)
# Identify the top 3 best and worst things about the product
positive_features = {}
negative_features = {}
for i in range(len(comments)):
    comment = comments[i]
    sentiment = sentiments[i]
    words = preprocess_text(comment)
    for word in words:
        if sentiment > 0:
            if word in positive_features:
                positive_features[word] += 1
            else:
                positive_features[word] = 1
        elif sentiment < 0:
            if word in negative_features:
                negative_features[word] += 1
            else:
                negative_features[word] = 1

top_positive_features = sorted(positive_features, key=positive_features.get, reverse=True)[:3]
top_negative_features = sorted(negative_features, key=negative_features.get, reverse=True)[:3]
# Visualize the results using word clouds
positive_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(positive_features)
negative_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(negative_features)
What is wrong here?
NameError                                 Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1612\1814734049.py in <module>
     63
     64 # Visualize the results using word clouds
---> 65 positive_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(positive_features)
     66
     67 negative_cloud = WordCloud(width=800, height=800, background_color='white', stopwords=stop_words, min_font_size=10).generate_from_frequencies(negative_features)

NameError: name 'stop_words' is not defined
Your stop_words is defined only inside the function preprocess_text(), so its scope is limited to that function. By the time the WordCloud lines run at module level, the name stop_words no longer exists, hence the NameError.
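One minimal fix, sketched below, is to build the stop word set once at module level so that both preprocess_text() and the WordCloud calls can see it. This assumes the same re, word_tokenize, and WordNetLemmatizer imports your snippet already relies on:

from nltk.corpus import stopwords

# Defined at module level, so it is visible inside preprocess_text()
# and in the WordCloud(...) calls further down.
stop_words = set(stopwords.words("english"))
stop_words.update(["a", "an", "and", "are", "as", "at", "be", "by",
                   "for", "from", "has", "he", "in", "is", "it", "its",
                   "of", "on", "that", "the", "to", "was", "were", "will", "with"])

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text)
    # Reuse the module-level stop_words instead of rebuilding it on every call
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]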
Related
I am using spaCy to lemmatize and parse a list of sentences. The data are contained in an Excel file.
I would like to write a function that allows me to return different lemmas of my sentences,
for example returning only lemmas with a specific tag ("VERB", or "VERB" + "ADJ").
This is my code:
import spacy
from spacy.lang.fr import French
from spacy_lefff import LefffLemmatizer, POSTagger

nlp = spacy.load("fr_core_news_sm")
nlp = spacy.load('fr')
parser = French()

path = 'Gold.xlsx'
my_sheet = "Gold"
df = read_excel(path, sheet_name=my_sheet)

def tokenizeTexte(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append((tok.lemma_.lower(), tok.tag_, tok.pos_))
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in stopwords]
    return tokens

df['Preprocess_verbatim'] = df.apply(lambda row: tokenizeTexte(row['verbatim']), axis=1)
print(df)
df.to_excel('output.xlsx')
I would like to be able to return all lemmas with, for example, the "VERB", "ADJ", or "ADV" tag, and then modify it to return all the lemmas.
I also wish to return different combinations of lemmas ("PRON" + "VERB" + "ADJ").
How can I do that with spaCy?
This is what I obtain with my code:
id ... Preprocess_verbatim
0 463 ... [(ce, , ), (concept, , ), (résoudre, , ), (que...
1 2647 ... [(alors, , ), (ça, , ), (vouloir, , ), (dire, ...
2 5391 ... [(ça, , ), (ne, , ), (changer, , ), (rien, , )...
3 1120 ... [(sur, , ), (le, , ), (station, , ), (de, , ),
tok.tag_ and tok.pos_ do not appear; do you know why?
My file (an example of my data):
id verbatim
14 L'économe originellement est donc celui qui a la responsabilité, pour des personnes d'une maison, d'une unité d'organisation donnée .
25 De leur donner des rations de ressources au temps opportun.
56 Contrairement à l'idée qu'on se fait l'économe n'est pas axé sur le capital, c'est-à-dire sur l'action de capitaliser, mais sur les individus d'une unité organisation, c'est-à-dire sur l'action de partager, de redistribuer d'une façon juste et opportune des ressources aux différents membre
First, I think your model isn't working correctly because you're defining the nlp object twice. I believe you only need it once. I am also not sure what parser is doing and I'm not sure you need it. For this code, I would use something like the following:
nlp = spacy.load("fr_core_news_sm")
doc = nlp(sample)
tokens = [tok for tok in doc]
Then, doc is a spacy Doc object, and tokens is a list of spaCy Token objects. From here, the loop that iterates over your tokens would work.
If you want to do the POS selection in your existing preprocessing function, I think you only need to change one line in your loop:
for tok in tokens:
    if tok.pos_ in ("VERB", "ADJ", "ADV"):
        lemmas.append((tok.lemma_.lower(), tok.tag_, tok.pos_))
This will only add tokens with those specific parts of speech to your lemmas list.
I also noticed another issue in your code on this line further down:
tokens = [tok for tok in tokens if tok not in stopwords]
At this point tok is your tuple of (lemma, tag, pos), so unless your list of stopwords contains tuples of that same format, rather than just the lemmas or tokens you want to exclude, this step will not exclude anything.
Putting it all together, you'd have something like this, which would return a list of tuples of (lemma, tag, pos) if the POS is correct:
import spacy

nlp = spacy.load("fr_core_news_sm")
stopwords = ["here", "are", "some", "stopwords"]

def tokenizeTexte(sample):
    doc = nlp(sample)
    lemmas = []
    for tok in doc:
        if tok.pos_ in ("VERB", "ADJ", "ADV"):
            lemmas.append((tok.lemma_.lower(), tok.tag_, tok.pos_))
    tokens = [(lemma, tag, pos) for (lemma, tag, pos) in lemmas if lemma not in stopwords]
    return tokens
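Applied to your DataFrame, usage would look roughly like this (a sketch that assumes pandas is available and the sheet has the 'verbatim' column from your example):

import pandas as pd

df = pd.read_excel('Gold.xlsx', sheet_name='Gold')
# Run the POS-filtered lemmatizer over every row of the 'verbatim' column
df['Preprocess_verbatim'] = df['verbatim'].apply(tokenizeTexte)
df.to_excel('output.xlsx')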
My intent is to capitalize words based on POS tags, which I could achieve with the help of the link below:
How can I best determine the correct capitalization for a word?
I am trying to achieve similar results using spaCy:
def truecase(doc):
    truecased_sents = []  # list of truecased sentences
    tagged_sent = token.tag_([word.lower() for token in doc])
    normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
    return string
It throws this error:
tagged_sent = token.tag_([word.lower() for token in doc])
NameError: global name 'token' is not defined
How do I declare token as global and solve this issue? Is my approach correct?
import spacy, re

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'autonomous cars shift insurance liability toward manufacturers.')

tagged_sent = [(w.text, w.tag_) for w in doc]
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
normalized_sent[0] = normalized_sent[0].capitalize()
string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
print(string)
Output:
Autonomous Cars shift Insurance Liability toward Manufacturers.
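To get the function shape the question asks for, the same steps can be wrapped into a truecase(doc) helper. This is a minimal sketch assuming doc is a spaCy Doc; the NN/NNS tag check and the punctuation regex are taken directly from the snippets above:

import re
import spacy

nlp = spacy.load('en_core_web_sm')

def truecase(doc):
    # Pair each token's text with its fine-grained POS tag
    tagged_sent = [(w.text, w.tag_) for w in doc]
    # Capitalize singular/plural nouns (NN, NNS), leave other tokens as-is
    normalized_sent = [w.capitalize() if t in ("NN", "NNS") else w for (w, t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    # Re-join and drop the space left before punctuation
    return re.sub(r" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))

print(truecase(nlp(u'autonomous cars shift insurance liability toward manufacturers.')))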
I am currently programming an Artificial Intelligence in Python, with some basic code from ELIZA. I will improve on the code once I get it working. My problem is that when I run the program and enter a query to the computer, there is no response. My code is below.
import string

# OSWALD v1.0
switch = [
    ["I need \(.*\)",
     ["Why do you need %1?",
      "Would it REALLY help you to get %1?",
      "Are you sure you need %1?"]]
    # There is more code with responses.
]

gPats = {
    "am"    : "are",
    "was"   : "were",
    "i"     : "you",
    "i'd"   : "you would",
    "i've"  : "you have",
    "i'll"  : "you will",
    "my"    : "your",
    "are"   : "am",
    "you've": "I have",
    "you'll": "I will",
    "your"  : "my",
    "yours" : "mine",
    "you"   : "me",
    "me"    : "you",
}

s = input
gKeys = map(lambda x: regex.compile(x[0]), gPats)
gValues = map(lambda x: x[1], gPats)

print("Hello, mortal. My name is Oswald. What would you like to talk about?")

while s == input:
    try: s = input(">")

def translate(str, dict):
    words = string.split(string.lower(str))
    keys = dict.keys();
    for i in range(0, len(words)):
        if words[i] in keys:
            words[i] = dict[words[i]]
    return print(switch)

def respond(str, keys, values):
    for i in range(0, len(keys)):
        if input == input:
            respnum = whrandom.randint(0, len(values[word])-1)
            resp = values[i][respnum]
            pos = string.find(resp, '%')
            print(string.find(resp, '%'))
            while pos > -1:
                num = string.atoi(resp[pos+1:pos+2])
                resp = resp[:pos] + \
                    translate(keys[i].group(num), gReflections) + \
                    resp[pos+2:]
                pos = string.find(resp, '%')
            if resp[-2:] == '?.': resp = resp[:-2] + '.'
            if resp[-2:] == '??': resp = resp[:-2] + '?'
            print(string.find(resp, '%'))
I'm trying to write a program that allows the user to input a word and then finds all words of length 4 or greater hidden within that word that appear in a word text file. So far my code can only detect the words in the user-inputted word whose letters aren't jumbled. For example, if I type in houses, the output will show house, houses, ho, us, use, uses. It should also recognize hose, hoses, shoe, shoes, hues, etc.
I know itertools is the simplest solution but I want to use a different method using only loops, dictionaries, and lists.
Here is my code so far:
def main():
    filename = open('dictionary.txt').readlines()
    word_list = []
    for line in filename:
        word_list.append(line.strip())

    print 'Lets Play Words within a Word!\n'
    word = raw_input('Enter a word: ')
    words_left = 0

    for words in word_list:
        letters = list(words)
        if words in word:
            print words
            words_left += 1
        else:
            False
The output format I'm trying to create should look like so:
Lets play Words within a Word!
Enter a word: exams #user inputted word
exams --- 6 words are remaining
> same #user types in guess
Found! # prints 'Found!' if above word is found in the dictionary.txt file
exams --- 5 words are remaining
> exam
Found!
exams --- 4 words are remaining
> mesa
Found!
exams --- 3 words are remaining
> quit() #if they type this command in the game will end
So my question is: after entering the base word (in the example it's EXAMS), how do I determine the total number of words hidden within that word, check whether the user's guesses are in the text file, and print whether each guess was found?
something like this should work...
wordlist = [list of words]
solutionlist = []

userword = [userword[i] for i in range(len(userword))]

for word in wordlist:
    inword = True
    letters = [word[j] for j in range(len(word))]
    for letter in set(letters):
        if letters.count(letter) > userword.count(letter):
            inword = False
            break
    if inword:
        solutionlist.append(word)

for line in solutionlist:
    print line
This works:
# read from file in actual implementation
all_words = [
    "foo", "bar", "baz", "hose", "hoses", "shoe", "shoes", "hues", "house",
    "houses", "ho", "us", "use", "uses", "shoe", "same", "exam", "mesa", "mass"]

RETAIN_ORDERING = False

def matches(inp, word):
    if inp[0] == word[0]:
        return (
            True if len(word) == 1 else
            False if len(inp) == 1 else
            matches(inp[1:], word[1:]))
    else:
        return matches(inp[1:], word) if len(inp) >= 2 else False

# with sorting enabled, "houses" will also match "shoe"; otherwise not
def maybe_sort(x):
    return x if RETAIN_ORDERING else ''.join(sorted(x))

inp = raw_input("enter a word: ")
results = [word for word in all_words if matches(maybe_sort(inp), maybe_sort(word))]
print results
Output:
$ python matches.py
enter a word: houses
['hose', 'hoses', 'shoe', 'shoes', 'hues', 'house', 'houses', 'ho', 'us', 'use', 'uses', 'shoe']
$ python matches.py
enter a word: exams
['same', 'exam', 'mesa']
If you want to avoid matches like shoe where the ordering of letters is not the same as in the input, just set RETAIN_ORDERING = True.
A naive implementation (using collections.Counter):
>>> all_words = ['foo', 'bar', 'baz', 'hose', 'hoses', 'shoe', 'shoes', 'hues', 'house', 'houses', 'ho', 'us', 'use', 'uses', 'shoe', 'same', 'exam', 'mesa', 'mass']
>>> def find_hidden(user_input):
        from collections import Counter
        user_word_counts = Counter(user_input)
        for word in all_words:
            isvalid = True
            for letter, count in Counter(word).iteritems():
                if user_word_counts[letter] == 0 or user_word_counts[letter] < count:
                    isvalid = False
                    break
            if isvalid: yield word
>>> list(find_hidden("houses"))
['hose', 'hoses', 'shoe', 'shoes', 'hues', 'house', 'houses', 'ho', 'us', 'use', 'uses', 'shoe']
>>> list(find_hidden("exams"))
['same', 'exam', 'mesa']
Or, using permutations:
>>> all_words = ['foo', 'bar', 'baz', 'hose', 'hoses', 'shoe', 'shoes', 'hues', 'house', 'houses', 'ho', 'us', 'use', 'uses', 'shoe', 'same', 'exam', 'mesa', 'mass']
>>> def permutations(s, n):  # could use itertools.permutations
        if n == 1:
            for c in s:
                yield c
        for i in range(len(s)):
            for p in permutations(s[:i]+s[i+1:], n-1):
                yield s[i] + p
>>> def find_hidden(input_str):
        for word_len in range(2, len(input_str)+1):
            for word in permutations(input_str, word_len):
                if word in all_words:
                    yield word
>>> set(find_hidden("houses"))
set(['use', 'hose', 'shoes', 'houses', 'house', 'us', 'hues', 'hoses', 'uses', 'ho', 'shoe'])
>>> set(find_hidden("exams"))
set(['mesa', 'exam', 'same'])
I need to create a program that removes punctuation, some specific words, and duplicates, and returns the remaining words with their respective line numbers. I also need to keep track of the duplicates. For instance:
Python IDLE
Indexer: type in lines, finish with a . at start of line only
It is a briskly blowing wind that blows
from the north, the North of my youth.
The wind is cold too, colder than the
winds of yesteryear.
.
The index is:
brisk 1
blow 1
wind 1, 3, 4
north 2
youth 2
cold 3
yesteryear 4
The problem: I need to keep track of the line numbers of the remaining words and also of their duplicates, and I have not been able to do that.
from string import *

stopWords = [ "a", "i", "it", "am", "at", "on", "in", "to", "too", "very", \
              "of", "from", "here", "even", "the", "but", "and", "is", "my", \
              "them", "then", "this", "that", "than", "though", "so", "are" ]

endings = [ "es", "ed", "er", "ly" ]

punctuation = [ ".", ",", ":", ";", "!", "?", "&", "'" ]

unindexed_sentence = raw_input("type in lines, finish with a . at start of line only").lower()

# removing duplicates.
def unique_string(l):
    ulist = []
    ulist2 = []
    [ulist.append(x) for x in l if x not in ulist]
    [ulist2.append(x)]
    global ulist2
    return ulist

unindexed_sentence = ' '.join(unique_string(unindexed_sentence.split()))

unindexed_sentence1 = split(unindexed_sentence, "\n")

list_unindexed = []

# splitting
i = 0
while i < len(unindexed_sentence1):
    list_unindexed += [split(unindexed_sentence1[i])]
    i += 1

countline = 0
i = 0
while i < len(list_unindexed):
    j = 0
    while j < len(list_unindexed[i]):
        if list_unindexed[i][j][0] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:0]
        if list_unindexed[i][j][-1] in punctuation:
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-1] == "s":
            list_unindexed[i][j] = list_unindexed[i][j][:-1]
        if list_unindexed[i][j][-2:] in endings:
            list_unindexed[i][j] = list_unindexed[i][j][:-2]
        if list_unindexed[i][j][-3:] == "ing":
            list_unindexed[i][j] = list_unindexed[i][j][:-3]
        if list_unindexed[i][j] in stopWords:
            del list_unindexed[i][j]
        else:
            j += 1
    i += 1
    countline += 1

def new_line(n):
    split(n, "\n")
    count = 1
    if n[-1] == "\n":
        count += 1
    return count

string1 = str(list_unindexed)
string2 = str(string1)
string2 = '\n'.join(unique_string(string2.split()))

print string2
Is this your homework?
Here are some tips:
Don't do from string import *. You don't need it.
Use data.splitlines() to get a list of lines.
Use enumerate() to get an index along with each line, e.g. for i, line in enumerate(data.splitlines()).
Use a dictionary to keep track of all words. Each value can be a list or a set of line numbers.
Don't remove duplicates initially. You can do that later using dictionaries or sets.
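A minimal sketch putting those tips together (Python 2 to match the question; the stop word list and the suffix stripping are simplified stand-ins for the question's full logic, not a complete solution):

stopWords = ["a", "i", "it", "is", "that", "the", "of", "from", "my", "too"]

def index_text(data):
    index = {}  # word -> set of line numbers where it occurs
    for lineno, line in enumerate(data.splitlines(), start=1):
        for word in line.lower().split():
            word = word.strip(".,:;!?&'")            # strip surrounding punctuation
            for ending in ("ing", "es", "ed", "er", "ly", "s"):
                if word.endswith(ending):             # very crude stemming
                    word = word[:-len(ending)]
                    break
            if word and word not in stopWords:
                index.setdefault(word, set()).add(lineno)
    return index

data = "It is a briskly blowing wind that blows\nfrom the north, the North of my youth."
index = index_text(data)
for word in sorted(index):
    print word, ', '.join(str(n) for n in sorted(index[word]))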