I am trying to run sentiment analysis on a selection of a data set, but every time I do I get this error: KeyError: 0
For reference, this is the code I am working with:
OC = df[df["text"].str.contains("Obamacare")]
from textblob import TextBlob
import re
def clean_tweet(tweet):
return " ".join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
def analize_sentiment(tweet):
analysis = TextBlob(clean_tweet(tweet))
if analysis.sentiment.polarity > 0:
return 1
elif analysis.sentiment.polarity == 0:
return 0
else:
return -1
df["sentiment"] = np.array([ analize_sentiment(tweet) for tweet in df["text"]])
pos_tweets = [tweet for index, tweet in enumerate(OC['text']) if OC['sentiment'][index] > 0]
neu_tweets = [ tweet for index, tweet in enumerate(OC['text']) if OC['sentiment'][index] == 0]
neg_tweets = [ tweet for index, tweet in enumerate(OC['text']) if OC['sentiment'][index] < 0]
It's after I try to run the pos_tweets, neu_tweets, neg_tweets that I keep getting Key Error: 0
I'm not sure what you're enumerating or why that is part of a sentiment analysis. This is how I did it...
def clean(tweet):
return " ".join(re.sub("(#[A-Za-z0-9]+) | ([^0-9A-Za-z\t]) | (w+:\/\/\s+)", " ", tweet).split())
def sentiment_analysis(tweet):
analysis = TextBlob(clean(tweet))
if analysis.sentiment.polarity > 0:
return 1
elif analysis.sentiment.polarity == 0:
return 0
else:
return -1
df["sentiment"] = np.array([sentiment_analysis(tweet) for tweet in
df["text"]])
df["OC"] = df.text.str.contains("obamacare", case = False)
df2 = df.loc[df["OC"] == True]
df2.sentiment.value_counts()
I basically just ran your list comprehension against the entire df, and then parsed.
Related
I'm using this code to calculate pivot points.
def pivots_low(osc, LBR, LBL):
pivots = []
for i in range(len(osc) - LBR):
pivots.append(0)
pivot = True
if i > LBL:
for j in range(1, LBR + 1):
if osc[i] >= osc[i + j]:
pivot = False
for j in range(1, LBL + 1):
if osc[i] > osc[i - j]:
pivot = False
if pivot is True:
pivots[len(pivots) - 1] = osc[i]
for i in range(LBR):
pivots.append(0)
return pivots
This returns an array with 0's where there's no pivots and the value of the pivot if there is one.
When Comparing the results to TradingView (downloaded csv with pivot points), the only time it matches exactly is when lookback left and right are both 5. Otherwise it deviates in the number of total pivots and the location of some.
But using this code to calculate pivot highs:
def pivots_high(osc, LBR, LBL):
pivots = []
for i in range(len(osc)-LBR):
pivots.append(0)
pivot = True
if i > LBL:
for j in range(1,LBL + 1):
if osc[i] < osc[i-j]:
pivot = False
for j in range(1,LBR + 1):
if osc[i] <= osc[i+j]:
pivot = False
if pivot is True:
pivots[len(pivots)-1] = osc[i]
for i in range(LBR):
pivots.append(0)
return pivots
the results are perfect regardless of lookback values. But the code is almost exactly the same besides comparison.
What is going wrong here? This is day 3 of having this problem and I just cant fix it
To Reproduce:
Load Data:
Full_Data = pd.read_csv(file)
use this simple function to check matches between calculated pivots and TradingView pivots.
def match_pivs(data, pivs_h, pivs_l): //Data is a DataFrame loaded from tradingview csv
global lblh
global lbrh
global lbll
global lbrl
start = lbrh
if lbrl > lbrh:
start = lbrl
match_h = 0
tot_hd = 0
tot_hp = 0
match_l = 0
tot_ld = 0
tot_lp = 0
for i in range(start, len(data)):
if data['PivHigh'][i] != 0 and pivs_h[i-lbrh] != 0:
match_h += 1
if data['PivLow'][i] != 0 and pivs_l[i-lbrl] != 0:
match_l += 1
if data['PivHigh'][i] != 0:
tot_hd += 1
if data['PivLow'][i] != 0:
tot_ld += 1
if pivs_h[i] != 0:
tot_hp += 1
if pivs_l[i] != 0:
tot_lp += 1
print('PivsLow ' + str(tot_lp))
print('DataLows ' + str(tot_ld))
print('MatchesL ' + str(match_l))
print('PivsHigh ' + str(tot_hp))
print('DataHighs ' + str(tot_hd))
print('MatchesH ' + str(match_h))
and to get csv from TradingView:
//#version=5
indicator("Data Script", overlay=true, max_labels_count=500)
leftLenL = input.int(title="Pivot Low", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
rightLenL = input.int(title="/", defval=10, minval=1, inline="Pivot Low", group=lengthGroupTitle)
leftLenH = input.int(title="Pivot High", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
rightLenH = input.int(title="/", defval=10, minval=1, inline="Pivot High", group=lengthGroupTitle)
ph = ta.pivothigh(leftLenH, rightLenH)
pl = ta.pivotlow(leftLenL, rightLenL)
if not na(ph)
plth := ph
else
plth := 0.0
if not na(pl)
pltl := pl
else
pltl := 0.0
plot(plth, 'PivHigh')
plot(pltl, 'PivLow')
then just download csv with this script loaded.
Run program with these three lines:
pl = pivots_low(Full_Data['low'], lbll, lbrl)
ph = pivots_high(Full_Data['high'], lbrh, lblh)
match_pivs(Full_Data, ph, pl)
Finally found a way.
I still have no idea why that code does not work but I've made a different way that seems to be doing the job 100% to tradingview data.
def checkhl(data_back, data_forward, hl):
if hl == 'high' or hl == 'High':
ref = data_back[len(data_back)-1]
for i in range(len(data_back)-1):
if ref < data_back[i]:
return 0
for i in range(len(data_forward)):
if ref <= data_forward[i]:
return 0
return 1
if hl == 'low' or hl == 'Low':
ref = data_back[len(data_back)-1]
for i in range(len(data_back)-1):
if ref > data_back[i]:
return 0
for i in range(len(data_forward)):
if ref >= data_forward[i]:
return 0
return 1
def pivot(osc, LBL, LBR, highlow)
left = []
right = []
for i in range(len(osc)):
pivots.append(0.0)
if i < LBL + 1:
left.append(osc[i])
if i > LBL:
right.append(osc[i])
if i > LBL + LBR:
left.append(right[0])
left.pop(0)
right.pop(0)
if checkhl(left, right, highlow):
pivots[i - LBR] = osc[i - LBR]
return pivots
then just do:
pivots_low = pivot(data, lbl, lbr, 'low')
pivots_high = pivot(data, lbl, lbr, 'high')
All the pivots will be in the actual position that they occur, not lbr bars after, otherwise the value will be 0.0
I'm not sure if this is efficient or not but it seems to work.
I am trying to do a Sentiment Analysis on tweets using BERT and HuggingFace transformer library. I am facing the following error and I am unsure how to solve it despite searching for possible solutions online.
TypeError: tokenize_data() got an unexpected keyword argument 'batched'
Here are the relevant parts of the codes:
path = "/content/drive/MyDrive/CSV/test.csv"
df = pd.read_csv(path)
df.head()
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# Convert Sentiment into an integers
def transform_labels(label):
label = label['Sentiment']
num = 0
if label == 'Positive':
num = 0
elif label == 'Negative':
num = 1
elif label == 'Neutral':
num = 2
elif label == 'Extremely Positive':
num = 3
elif label == 'Extremely Negative':
num = 4
return {'labels': num}
# Tokenize the tweets
def tokenize_data(example):
return tokenizer(example['OriginalTweet'], padding='max_length')
df = df.apply(tokenize_data, batched=True)
# drop unwanted columns
remove_columns = ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
df = df.apply(transform_labels, remove_columns=remove_columns)
def simulate_end_season(Parameters, Team,response):
# Update result data in dicts - faster than updating a DataFrame.
points=defaultdict(int)
wins=defaultdict(int)
losses=defaultdict(int)
draws=defaultdict(int)
goals_for=defaultdict(int)
goals_against=defaultdict(int)
Max=10
games_played = 0
# Simulate all the games in a season
for match in response["matches"]:
home_team=match['homeTeam']['name']
away_team=match['awayTeam']['name']
if match['status']=='FINISHED':
home_goals=match['score']['fullTime']['homeTeam']
away_goals=match['score']['fullTime']['awayTeam']
else :
GameResult = SimulateMatch(home_team, away_team, Parameters, Team)
home_goals = GameResult[0]
away_goals =GameResult[1]
# Update the points and win/draw/loss statistics.
if home_goals > away_goals:
points[home_team] += 3
wins[home_team] += 1
losses[away_team] += 1
elif home_goals == away_goals:
points[home_team] += 1
points[away_team] += 1
draws[home_team] += 1
draws[away_team] += 1
else:
points[away_team] += 3
wins[away_team] += 1
losses[home_team] += 1
# Update the goals.
goals_for[home_team] += home_goals
goals_against[home_team] += away_goals
goals_for[away_team] += away_goals
goals_against[away_team] += home_goals
# Return the table as a DataFrame (needs to be sorted on points and goal diff).
# Build the empty table
empty_rows = np.zeros((len(Team),7), dtype=int)
season_table = pd.DataFrame(empty_rows, index=Team,columns=['points', 'wins', 'draws', 'losses', 'goals for', 'goals against', 'goal diff'])
# Fill in the table
for team in Team:
values_list = [points[team], wins[team], draws[team], losses[team], goals_for[team], goals_against[team]]
season_table.loc[team, ['points', 'wins', 'draws', 'losses', 'goals for', 'goals against']] = values_list
# Calculate the goal diff.
season_table.loc[:, 'goal diff']= season_table.loc[:, 'goals for'] - season_table.loc[:, 'goals against']
season_table=season_table.sort_values(['points', 'goal diff'], ascending=[False, False])
rank=[k for k in range(1,len(Team)+1)]
season_table.insert(1,"rank",rank,True)
season_table=season_table.sort_index()
return season_table
def simulate_n_seasons(Team, Parameters, response,n,Max = 10):
start = time.time()
team_position=np.zeros((len(Team),n),dtype=int)
GoalsFor =np.zeros((len(Team),n))
GoalsAgainst= np.zeros((len(Team),n))
Losses = np.zeros((len(Team),n))
Wins = np.zeros((len(Team),n))
Draws = np.zeros((len(Team),n))
Points = np.zeros((len(Team),n))
Name= Team.sort()
with concurrent.futures.ProcessPoolExecutor() as executor:
results= [executor.submit(simulate.simulate_end_season,Params, Team, response) for k in range(n)]
i=0
for f in concurrent.futures.as_completed(results):
season_table = f.result()
team_position[:,i]=list(season_table['rank'])
GoalsFor[:,i]=list(season_table['goals for'])
GoalsAgainst[:,i]=list(season_table['losses'])
Losses[:,i]= list(season_table['losses'])
Draws[:,i]= list(season_table['draws'])
Wins[:,i]= list(season_table['wins'])
Points[:,i]=list(season_table['points'])
i+=1
end = time.time()
Name= sorted(Team)
print(end - start)
return Name,team_position,GoalsFor,GoalsAgainst,Points,Wins,Draws,Losses
Hello everyone, I am using concurrent.futures in order to do multiproscessing for my project in python. I have read that i need to import the function used for multiprocessing from another file, i can run the function simulate_n_seasons on Google colab but on Spyder or Pycharm, i got the following error message which is in the title.
What should i do ?
Thanks for your help.
I am running a sentiment analysis on a csv file and I am receiving this error message. I have tried a few things to resolve it and have not been successful. Any help would be greatly appreciated! Thank you!
Here is my code:
def sentimentAFINN(text):
words = pattern_split.split(text.lower())
sentiments = len(list(map(lambda word: afinn.get(word, 0), words)))
if sentiments:
sentiment = float(sum(sentiments))/math.sqrt(len(sentiments))
else:
sentiment = 0
return sentiment
def sentimentDisplayValue(sentimentScore):
if sentimentScore > 0.1:
return "Positive"
elif sentimentScore < -0.1:
return "Negative"
else:
return "Neutral"
totals = defaultdict(int)
for (index, row) in data.iterrows():
text = row['comment']
text_munged = munger(text)
sentimentScore = sentimentAFINN(text_munged)
sentimentDisplay = sentimentDisplayValue(sentimentScore)
totals[sentimentDisplay] = totals[sentimentDisplay] + 1
pt.add_row([text_munged, sentimentScore, sentimentDisplay])
print (pt)
print (totals)
This is my error message:
TypeError Traceback (most recent call last)
<ipython-input-73-b20887003b41> in <module>
4 text = row['LikelyToReferComment']
5 text_munged = munger(text)
----> 6 sentimentScore = sentimentAFINN(text_munged)
7 sentimentDisplay = sentimentDisplayValue(sentimentScore)
8 totals[sentimentDisplay] = totals[sentimentDisplay] + 1
<ipython-input-72-f95f79f94b60> in sentimentAFINN(text)
29 sentiments = len(list(map(lambda word: afinn.get(word, 0), words)))
30 if sentiments:
---> 31 sentiment = float(sum(sentiments))/math.sqrt(len(sentiments))
32
33 else:
TypeError: 'int' object is not iterable
Your sentiments variable is an int since its the value returned by len(). You are trying to call sum() and len() on sentiments. Both sum() and len() expect an iterable datatype.
You can change your sentimentAFINN() like this
def sentimentAFINN(text):
words = pattern_split.split(text.lower())
# save your list in sentiments
sentiments = list(map(lambda word: afinn.get(word, 0), words))
# now you check length of sentiments and return accordingly
return float(sum(sentiments))/math.sqrt(len(sentiments)) if len(sentiments) > 0 else 0
I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator
def entropy(data):
entries = len(data)
labels = {}
for feat in data:
label = feat[-1]
if label not in labels.keys():
labels[label] = 0
labels[label] += 1
entropy = 0.0
for key in labels:
probability = float(labels[key])/entries
entropy -= probability * log(probability,2)
return entropy
def split(data, axis, val):
newData = []
for feat in data:
if feat[axis] == val:
reducedFeat = feat[:axis]
reducedFeat.extend(feat[axis+1:])
newData.append(reducedFeat)
return newData
def choose(data):
features = len(data[0]) - 1
baseEntropy = entropy(data)
bestInfoGain = 0.0;
bestFeat = -1
for i in range(features):
featList = [ex[i] for ex in data]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
newData = split(data, i, value)
probability = len(newData)/float(len(data))
newEntropy += probability * entropy(newData)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeat = i
return bestFeat
def majority(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def tree(data,labels):
classList = [ex[-1] for ex in data]
if classList.count(classList[0]) == len(classList):
return classList[0]
if len(data[0]) == 1:
return majority(classList)
bestFeat = choose(data)
bestFeatLabel = labels[bestFeat]
theTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [ex[bestFeat] for ex in data]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
theTree[bestFeatLabel][value] = tree(split/(data, bestFeat, value),subLabels)
return theTree
So what I did after this is the following:
infile=open("SData.csv","r")
data=infile.read()
tree(data)
The error which I got is "1 argument is missing" which is the label which I have to define and this is where I don't know what I have to put. I tried the variable for which I have to make the decision tree but it doesn't work:
tree(data,MinTemp)
Here I get an error "MinTemp is not defined".
Please help me out and let me know what I should do to have a look at the tree.
Following is the part of data and I want to generate a tree for MinTemp
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak