NLP: Replicating a sentiment function for pandas dataframes - python

I am writing a function to count the number of positive and negative words inside a dataframe.
I am using a dictionary like this:
import re

lmdict = {'Negative': ['closes', 'buoyed', 'underpinned', 'volatile'],
          'Positive': ['up', 'strong', 'higher', 'versatile']}
and a negate list to classify negation words:
negate = ["havent", "isnt", "mightnt", "mustnt",
          "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't",
          "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "wasnt",
          "werent", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
          "wouldn't"]
The function I was using, which works for individual news articles, is the following:
def negated(word):
    """
    Determine if preceding word is a negation word
    """
    if word.lower() in negate:
        return True
    else:
        return False

def tone_count_with_negation_check(dict, article):
    """
    Count positive and negative words with negation check. Account for simple negation only for positive words.
    Simple negation is taken to be observations of one of negate words occurring within three words
    preceding a positive words.
    """
    pos_count = 0
    neg_count = 0
    pos_words = []
    neg_words = []
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())
    word_count = len(input_words)
    for i in range(0, word_count):
        if input_words[i] in dict['Negative']:
            neg_count += 1
            neg_words.append(input_words[i])
        if input_words[i] in dict['Positive']:
            if i >= 3:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]) or negated(input_words[i - 3]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 2:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 1:
                if negated(input_words[i - 1]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 0:
                pos_count += 1
                pos_words.append(input_words[i])
    print('The results with negation check:', end='\n\n')
    print('The # of positive words:', pos_count)
    print('The # of negative words:', neg_count)
    print('The list of found positive words:', pos_words)
    print('The list of found negative words:', neg_words)
    print('\n', end='')
    results = [word_count, pos_count, neg_count, pos_words, neg_words]
    return results

tone_count_with_negation_check(lmdict, article)
But I want to use this function on a dataframe like this:
articles = pd.DataFrame({'articles': ['S Africa s JSE closes firmer buoyed by industrial stocks',
                                      'Sasol says HEPS up percent current economic climate volatile',
                                      'Sasol Delivers Strong Financial Results Underpinned by Higher Crude Oil and Product Demand',
                                      'South Africa s Sasol H earnings up on higher oil']})
Can you help me to do this?
Thank you!
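Since the question is unanswered here, a minimal sketch of one way to do it: call the existing function on every row with pandas.Series.apply and unpack the returned list into new columns. The column names below (word_count, pos_count, and so on) are just illustrative choices, not something from the original post.

import pandas as pd

# Sketch: assumes lmdict, negate, tone_count_with_negation_check and the
# articles DataFrame are defined exactly as above.
results = articles['articles'].apply(
    lambda text: tone_count_with_negation_check(lmdict, text))

# Each result is [word_count, pos_count, neg_count, pos_words, neg_words];
# spread the lists into separate (illustratively named) columns.
articles[['word_count', 'pos_count', 'neg_count',
          'pos_words', 'neg_words']] = pd.DataFrame(results.tolist(),
                                                    index=articles.index)
print(articles[['word_count', 'pos_count', 'neg_count']])

The function will still print its per-article summary for every row; those print calls could be removed if only the new columns are needed.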

Related

cs50 readability pset6 python - Index calculation Issue

An index calculation issue in my pset6 readability.
This is my code and I think it is correct, but the grade for every paragraph is wrong because the index calculation is wrong, even though the letter, word, and sentence counts are correct. So where is the problem? Someone help me, please.
from cs50 import get_string
import math

# Loop to count the letters
letters = 0
words = 0
sentences = 0

# Prompt user for some text.
text = get_string("Text: ")

# Loop to count the letters(s) of a paragraph
def count_letters():
    global letters
    for i in range(len(text)):
        if text[i].lower() or text[i].upper():
            letters += 1
            # pass
    print(letters)
    return letters

count_letters()

# Loop to count the words of the paragraph.
def count_words():
    global words
    for i in range(len(text)):
        if text[i].isspace():
            words += 1
        # TODO
        if text[i].isspace() and text[i + 1].isspace():
            words -= 1
    print(words + 1)
    return words + 1

count_words()

# Loop to count sentences of the paragraph.
def count_sentences():
    global sentences
    for i in range(len(text)):
        if text[i] == "." or text[i] == "!" or text[i] == "?":
            sentences += 1
    print(sentences)
    return sentences

count_sentences()

# Calc the index of the grades
def indexOfGrade():
    global letters
    global words
    global sentences
    l = letters / words * 100
    s = sentences / words * 100
    index = round(0.0588 * l - 0.296 * s - 15.8)
    print(letters / words)
    print(l)
    print(s)
    print(index)
    # grades
    if index >= 16:
        print("Grade 16+")
    elif index < 1:
        print("Before Grade 1")
    else:
        print(f"Grade {index}")

indexOfGrade()
Yahooo, after a lot of time I figured out your mistake.
def count_words():
    global words
    for i in range(len(text)):
        if text[i].isspace():
            words += 1
        # TODO
        if text[i].isspace() and text[i + 1].isspace():
            words -= 1
    print(words + 1)
    return words + 1  # This part is wrong
The last line, the return, will only return the value; it will not change the value of the variable words. Therefore, you need to correct it as follows:
def count_words():
    global words
    for i in range(len(text)):
        if text[i].isspace():
            words += 1
        # TODO
        if text[i].isspace() and text[i + 1].isspace():
            words -= 1
    print(words + 1)
    words += 1
    return words
Also, the letters, words, and sentences variables may optionally be floats rather than ints; otherwise Python may drop the decimal part while calculating the index, so the round may not work as expected.
Also, I have run check50 on my device and all the results were green (correct).
Also, the check if text[i].isspace() and text[i + 1].isspace(): was wrong, so you need to delete that part completely.
Therefore, here is the final answer with the required changes.
from cs50 import get_string
import math

# Loop to count the letters
letters = float(0)
words = float(0)
sentences = float(0)

# Prompt user for some text.
text = get_string("Text: ")

# Loop to count the letters(s) of a paragraph
def count_letters():
    global letters
    for i in range(len(text)):
        if (text[i] >= 'a' and text[i] <= 'z') or (text[i] >= 'A' and text[i] <= 'Z'):
            letters += 1
            # pass
    # print(letters)
    return letters

count_letters()

# Loop to count the words of the paragraph.
def count_words():
    global words
    for i in range(len(text)):
        if text[i].isspace():
            words += 1
        # TODO
        # if text[i].isspace() and text[i + 1].isspace():
        #     pass
    # print(words + 1)
    words += 1
    return words + 1

count_words()

# Loop to count sentences of the paragraph.
def count_sentences():
    global sentences
    for i in range(len(text)):
        if text[i] == "." or text[i] == "!" or text[i] == "?":
            sentences += 1
    # print(sentences)
    return sentences

count_sentences()

# Calc the index of the grades
def indexOfGrade():
    global letters
    global words
    global sentences
    # print(letters)
    # print(words)
    # print(sentences)
    l = 100 * letters / words
    # print(l)
    s = sentences / words * 100
    # print(s)
    index = round(0.0588 * l - 0.296 * s - 15.8)
    # print(0.0588 * l - 0.296 * s - 15.8)
    # print(index)
    # grades
    if index >= 16:
        print("Grade 16+")
    elif index < 1:
        print("Before Grade 1")
    else:
        print(f"Grade {index}")

indexOfGrade()
Note: you may optionally remove all the comment statements.
Link to my check50 result
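As an aside (not from either post above), the three counts can also be written more compactly with built-in string methods; this is just an alternative sketch:

# Alternative sketch, assuming `text` comes from get_string as above.
letters = sum(c.isalpha() for c in text)     # count alphabetic characters only
words = len(text.split())                    # split() collapses repeated whitespace
sentences = sum(c in ".!?" for c in text)    # count sentence-ending punctuation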

Python: Match phrases in dictionary values to a sentence (the dictionary key) and output based on match

I have a dictionary, where each key is a sentence, and the values are particular words or phrases in that sentence.
for example:
dict1 = {'it is lovely weather and it is kind of warm': ['lovely weather', 'it is kind of warm'],
         'and the weather is rainy and cold': ['rainy and cold'],
         'the temperature is ok': ['temperature']}
I would like my output to be each sentence tagged based on whether or not the phrase is in the dictionary values.
In this example, the output would be (where 0 is not in the values, and 1 is in the values)
*
it 0
is 0
lovely weather 1 (combined because it's a phrase)
and 0
it is kind of warm 1 (combined because it's a phrase)
*
and 0
the 0
weather 0
is 0
rainy and cold 1 (combined because it's a phrase)
...(and so on)...
I can get something like this to work, but only by hard coding the number of words in the phrase:
for k, v in dict1.items():
    words_in_val = v.split()
    if len(words_in_val) == 1:
        words = k.split()
        for each_word in words:
            if v == each_word:
                print(each_word + '\t' + '1')
            else:
                print(each_word + '\t' + '0')
    if len(words_in_val) == 2:
        words = k.split()
        for index, item in enumerate(words[:-1]):
            if words[index] == words_in_val[0]:
                if words[index + 1] == words_in_val[1]:
                    words[index] = ' '.join(words_in_val)
                    words.remove(words[index + 1])
    ....something like this...
My issue is that I can see it starts to get messy, and in theory I can have an unlimited number of words in the phrase I want to match, although it's usually fewer than 10.
Would someone have a better idea of how to do this?
So this is how I would do it:
from collections import defaultdict

dict1 = {'it is lovely weather and it is kind of warm': ['it is kind of', 'it is kind'],
         'and the weather is rainy and cold': ['rainy and cold'],
         'the temperature is ok': ['temperature']}

def tag_sentences(dict):
    id = 1
    tagged_results = []
    for sentence, phrases in dict.items():
        words = sentence.split()
        phrases_split = [phrase.split() for phrase in phrases]
        positions_keeper = {}
        sentence_results = [(word, 0) for word in words]
        for word_index, word in enumerate(words):
            for index, phrase in enumerate(phrases_split):
                position = positions_keeper.get(index, 0)
                if phrase[position] == word:
                    if len(phrase) > position + 1:
                        positions_keeper[index] = position + 1
                    else:
                        for i in range(len(phrase)):
                            sentence_results[word_index - i] = (sentence_results[word_index - i][0], id)
                        id = id + 1
                else:
                    positions_keeper[index] = 0
        tagged_results.append(sentence_results)
    return tagged_results

def print_tagged_results(tagged_results):
    for tagged_result in tagged_results:
        memory = 0
        memory_sentence = ""
        for result, id in tagged_result:
            if memory != 0 and memory != id:
                print(memory_sentence + "1")
                memory_sentence = ""
            if id == 0:
                print(result, 0)
            else:
                memory_sentence += result + " "
            memory = id
        if memory != 0:
            print(memory_sentence + "1")

tagged_results = tag_sentences(dict1)
print_tagged_results(tagged_results)
This is basically doing the following:
First I build a tagged list in the format [(it, 0), (is, 0), (lovely, 0), ...].
In the tagged list, 0 means the word is not part of a group, and any other integer groups words together (words with tag 1 belong together, words with tag 2 belong together, and so on).
I iterate through each word and mark it if it matches the beginning of a phrase, or if I am already partway through matching the current phrase.
If the word completes a phrase, I mark it and all the words that matched this phrase before it with the same id.
If the phrase is not complete yet, I keep the position and continue with the next word.
In the end I have a tagged list in the format [(it, 0), (is, 0), (lovely, 1), ..., (kind, 2), (of, 2), ...].
It won't work if a phrase is a subphrase of another phrase, but you never mentioned in your example how it should behave in that situation.
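For what it's worth, here is a rough alternative sketch of my own (not part of the answer above): greedily match the longest phrase that starts at each word position, so phrases of any length are handled without hard-coding.

def tag_sentence(sentence, phrases):
    # Greedy left-to-right matcher: prefer the longest phrase at each position.
    words = sentence.split()
    phrase_words = sorted((p.split() for p in phrases), key=len, reverse=True)
    output = []
    i = 0
    while i < len(words):
        match = next((pw for pw in phrase_words if words[i:i + len(pw)] == pw), None)
        if match:
            output.append((' '.join(match), 1))   # whole phrase, tagged 1
            i += len(match)
        else:
            output.append((words[i], 0))          # single word, tagged 0
            i += 1
    return output

for sentence, phrases in dict1.items():
    for token, tag in tag_sentence(sentence, phrases):
        print(token, tag)

With the question's original dict1 this should print the desired output shown above; overlapping phrases are resolved by preferring the longer one.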

How can I find if this word is a substitution or an insertion? (Python)

For example, my word is "brain"; we need to create a function that checks whether another word is a substitution or an insertion.
For example, substitutions can be: train grain blain bryin ... Insertions: barain bryain ...
# I am not sure I did this the right way or am missing something. Is there any faster way?
# string0 is the pattern
def subsitutions(string0, string1):
    sublist = []
    for i in range(len(string0)):
        sublist.append(string0[:i] + string0[i + 1:])
    for i in range(len(string1)):
        if string1[:i] + string1[i + 1:] in sublist:
            print("This word is subsitutions")
            break

def insertions(string0, string1):
    if len(string0) + 1 == len(string1):
        for i in range(len(string1)):
            if string1[:i] + string1[i + 1:] == string0:
                print("This word is insertions")
                break
set.difference would easily do this for you:
>>> set('brian') - set('bryan')
{'i'}
>>> set('brian') - set('byrian')
set()
Here's a dirty example of this check:
def check_sub_or_ins(pattern, string):
    if set(pattern) - set(string):
        print("This word is subsitutions")
    else:
        print("This word is insertions")
You will need to check the length of each string though, as just using set won't work for all strings.
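For illustration only, a rough sketch of what that combined length-and-set check might look like (this is a heuristic, not a full edit-distance test):

def classify(pattern, word):
    # Same length and at least one pattern character missing -> substitution
    if len(word) == len(pattern) and set(pattern) - set(word):
        print("This word is a substitution")
    # One character longer and every pattern character present -> insertion
    elif len(word) == len(pattern) + 1 and not set(pattern) - set(word):
        print("This word is an insertion")
    else:
        print("Neither (by this rough test)")

classify("brain", "bryin")   # substitution
classify("brain", "barain")  # insertion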
See if this helps
word = "brain"
word_length = len(word)
words_to_check = ["train", "grain", "blain", "bryin", "barain", "bryain"]
for word_to_check in words_to_check:
boolean_list = [character in word for character in word_to_check]
character_occurences = sum(boolean_list)
if character_occurences == word_length - 1:
print("{0} is substitution".format(word_to_check))
elif character_occurences >= word_length - 1 and len(boolean_list) > word_length:
print("{0} is insertion".format(word_to_check))

calling another function of list of str to new function list of list

I'm trying to use my previous function calculate_total_sentiment(tweet1), which sorts a tweet into one of the categories "very positive", "positive", "neutral", "negative" and "very negative".
Now I want to write a new function, group_by_sentiment, which takes in a list of strings, and I want to categorize them into the same categories as calculate_total_sentiment.
How do I sort them like that? Below is my code for calculate_total_sentiment, which works; I'm trying to figure out how to use it in this new function so it sorts into the same categories.
An example of group_by_sentiment would be:
group_by_sentiment(['sad', '#notok'])
[[], [], [], [], ['sad', 'notok']]
where the last list contains all the very negative tweets and the first list the very positive ones.
def calculate_total_sentiment(tweet1):
    total = negative_word_score(tweet) + positive_word_score(tweet) + \
        positive_hashtag_score(tweet) + negative_hashtag_score(tweet) + \
        emoticon_score(tweet)
    if total > 2:
        return ("Very Positive")
    elif total > 0:
        return ("Positive")
    elif total == 0:
        return ("Neutral")
    elif total > -3:
        return ("Negative")
    else:
        return ("Very Negative")

def group_by_sentiment(L):
    very_negative = []
    negative = []
    neutral = []
    positive = []
    very_positive = []
    output = [very_positive, positive, neutral, negative, very_negative]
    for char in range(len(L)):
        if calculate_total_sentiment(tweet1) == 'Very Positive':
            output[0].append(very_positive)
        elif calculate_total_sentiment(tweet1) == 'Positive':
            output[1].append(positive)
        elif calculate_total_sentiment(tweet1) == 'Neutral':
            output[2].append(neutral)
        elif calculate_total_sentiment(tweet1) == 'Negative':
            output[3].append(negative)
        elif calculate_total_sentiment(tweet1) == 'Very Negative':
            output[4].append(very_negative)
    return output
I'm not sure I understand your question fully, but is there any reason you aren't using a dictionary (or, in this particular case, a defaultdict)?
from collections import defaultdict

def group_by_sentiment(L):
    output = defaultdict(list)
    for tweet in L:
        output[calculate_total_sentiment(tweet)].append(tweet)
    return output
>>> group_by_sentiment(['sad', '#notok'])
defaultdict(list, {'Very Negative': ['sad', '#notok']})
You obviously have a number of other issues with your code, like tweet not being defined in calculate_total_sentiment because the argument is called tweet1.
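If you really need the list-of-lists format from your example (very positive first, very negative last), you could build it directly instead; a minimal sketch, assuming calculate_total_sentiment works as described:

def group_by_sentiment_as_lists(L):
    # Order matches the expected output: very positive first, very negative last.
    categories = ['Very Positive', 'Positive', 'Neutral', 'Negative', 'Very Negative']
    output = [[] for _ in categories]
    for tweet in L:
        output[categories.index(calculate_total_sentiment(tweet))].append(tweet)
    return output

# e.g. group_by_sentiment_as_lists(['sad', '#notok'])
# might give [[], [], [], [], ['sad', '#notok']] if both score as very negative.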

Python data validation for words

Hello, I have a problem making my validation that forces the user to enter 5 words; currently, if they enter a space, they can get through as well. Here is my code:
cntr = 0
for x in range(len(this.val)):
    if this.val[x] == ' ' and not this.val[x - 1] == ' ' and x != 0:
        cntr = cntr + 1
if cntr < 4:
    error(res.Q3error)
words = sentence.split()
if len(words) == 5:
    print('good')
Or perhaps add an "is_word" test to apply to words using all(), in case you want to reject numbers or something.
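A minimal sketch of that idea (the is_word rule here, purely alphabetic tokens, is just an illustrative assumption):

def is_word(token):
    # Illustrative rule: a "word" is purely alphabetic (rejects numbers, empty strings, etc.)
    return token.isalpha()

def has_five_words(sentence):
    words = sentence.split()            # split() ignores repeated spaces
    return len(words) == 5 and all(is_word(w) for w in words)

print(has_five_words("one two three four five"))   # True
print(has_five_words("one two  3  four five"))     # False (contains a number)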
