ModuleNotFoundError: No module named 'sympy' after pip installing and uninstalling sympy numerous times - python

When running the code for an NHL score prediction model, I received this error:
Traceback (most recent call last):
  File "C:\PythonApps\main.py", line 6, in <module>
    from fontTools.misc.symfont import y
  File "C:\PythonApps\venv\lib\site-packages\fontTools\misc\symfont.py", line 4, in <module>
    import sympy as sp
ModuleNotFoundError: No module named 'sympy'
I have tried pip installing and uninstalling sympy numerous times and have made no progress. Here is the full code; please criticize me to the fullest so I can learn.
from datetime import datetime
import os
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'
import season as season
from fontTools.misc.symfont import y
from nhl import schedule_url
import joblib
import numpy as np
import requests
from sklearn.metrics import mean_absolute_error, r2_score
import sys
print(sys.executable)
# Set API key
api_key = '93a99fe5a13c4157b8caf96d524c5158'
# Set endpoint URLs
base_url = 'https://statsapi.web.nhl.com'
roster_url = base_url + '/api/v1/teams/{}/roster'
teams_url = base_url + '/api/v1/teams'
# Fetch player IDs from API
team_id = 10 # for example, the Edmonton Oilers
r = requests.get(roster_url.format(team_id))
roster_data = r.json()
player_ids = {player['person']['fullName']: player['person']['id'] for player in roster_data['roster']}
# Fetch team IDs from API
r = requests.get(teams_url)
teams_data = r.json()
team_ids = {team['name']: team['id'] for team in teams_data['teams']}
# Define function to get team stats
def get_team_stats(team_id, game_date):
    # Get game IDs for team and date
    params = {'teamId': team_id, 'date': game_date}
    r = requests.get(schedule_url, params=params)
    game_ids = [game['gamePk'] for game in r.json()['dates'][0]['games']]
    # Get game stats for team
    stats = []
    for game_id in game_ids:
        url = base_url + '/api/v1/game/{}/boxscore'.format(game_id)
        r = requests.get(url)
        game_stats = r.json()
        # Get team's stats for game
        team_stats = \
            game_stats['teams']['away' if game_stats['teams']['away']['team']['id'] == team_id else 'home'][
                'teamStats']['teamSkaterStats']
        stats.append([
            team_stats['goals'],
            team_stats['pim'],
            team_stats['shots'],
            team_stats['powerPlayPercentage'],
            team_stats['powerPlayGoals'],
            team_stats['powerPlayOpportunities'],
            team_stats['faceOffWinPercentage'],
            team_stats['blocked'],
            team_stats['takeaways'],
            team_stats['giveaways'],
            team_stats['hits'],
        ])
    # Return average of stats
    return np.mean(stats, axis=0)
# Define function to get player stats
def get_player_stats(player_id, season):
    url = base_url + '/api/v1/people/{}/stats?stats=statsSingleSeason&season={}'.format(player_id, season)
    r = requests.get(url)
    stats = r.json()['stats'][0]['splits'][0]['stat']
    return [stats['timeOnIce'],
            stats['assists'],
            stats['goals'],
            stats['shots'],
            stats['hits'],
            stats['powerPlayGoals'],
            stats['powerPlayAssists'],
            stats['penaltyMinutes'],
            stats['faceOffPct'],
            stats['faceOffWins'],
            stats['faceoffTaken'],
            stats['takeaways'],
            stats['giveaways'],
            stats['shortHandedGoals'],
            stats['shortHandedAssists'],
            stats['blocked'],
            stats['plusMinus'],
            stats['evenTimeOnIce'],
            stats['powerPlayTimeOnIce'],
            stats['shortHandedTimeOnIce']
            ]
# Load training data
X_train = np.load('X.npy')
y_train = np.load('y.npy')
# Process data
X = np.array([team_stats + get_player_stats(player_id, season) for player_id in player_ids.values()])
# Save training data
np.save('X.npy', X)
np.save('y.npy', y)
# Create an array for X.npy
X = np.array([team_stats + get_player_stats(player_id, season) for player_id in player_ids.values()])
# Save training data
np.save('X.npy', X)
np.save('y.npy', y)
# Split data into training and test sets
split_idx = int(0.8 * X_train.shape[0])
X_test = X_train[split_idx:]
y_test = y_train[split_idx:]
X_train = X_train[:split_idx]
y_train = y_train
# Train MLPRegressor model
from sklearn.neural_network import MLPRegressor
model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=500, alpha=0.0001, solver='adam',
                     verbose=10, random_state=21, tol=0.000000001)
model.fit(X_train, y_train)
# Evaluate model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
y_pred = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred))
print('R2 score:', r2_score(y_test, y_pred))
# Save model
joblib.dump(model, 'nhl_score_predictor.joblib')
print('Model saved')
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')
# Get user input
team_name = input('Enter team name: ')
season = input('Enter season (yyyy-yyyy format): ')
player_name = input('Enter player name: ')
# Get team and player stats
team_stats = get_team_stats(team_ids[team_name], today)
player_stats = get_player_stats(player_ids[player_name], season)
stats = team_stats + player_stats
# Define function to predict score
def predict_score(X):
    model = joblib.load('nhl_score_predictor.joblib')
    return int(model.predict([X])[0])
# Predict
predicted_score = predict_score([team_stats])
print('Predicted score:', predicted_score)
# Save model
joblib.dump(model, 'nhl_score_predictor.joblib')
print('Model saved')
# Get today's date
today = datetime.now().strftime('%Y-%m-%d')
# Get user input
team_name = input('Enter team name: ')
season = input('Enter season (yyyy-yyyy format): ')
player_name = input('Enter player name: ')
# Get team and player stats
print("Getting team and player stats...")
team_stats = get_team_stats(team_ids[team_name], today)
player_stats = get_player_stats(player_ids[player_name], season)
stats = team_stats + player_stats
print("Team stats: ", team_stats)
print("Player stats: ", player_stats)
# Define function to predict score
print("Defining function to predict score...")
def predict_score(X):
    model = joblib.load('nhl_score_predictor.joblib')
    return int(model.predict([X])[0])
# Predict
print("Predicting score...")
predicted_score = predict_score([team_stats])
print('Predicted score:', predicted_score)
# Save model
print("Saving model...")
joblib.dump(model, 'nhl_score_predictor.joblib')
print('Model saved')
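Judging from the traceback, sympy is only pulled in by the line from fontTools.misc.symfont import y, which looks like an accidental auto-import rather than something the model needs; deleting that import would remove the sympy dependency entirely. If sympy really is wanted, a minimal sketch for making sure pip installs it into the same interpreter that runs the script (a common cause of this error inside a venv) might look like this; the subprocess call is only an illustration of running python -m pip:

import subprocess
import sys

# The interpreter actually executing this script (it should point into C:\PythonApps\venv).
print(sys.executable)

# Install sympy into exactly that interpreter, bypassing whichever "pip" is first on PATH.
subprocess.check_call([sys.executable, "-m", "pip", "install", "sympy"])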

Related

"TypeError: getattr(): attribute name must be string" getattr() does not call the models from the folder

I am having trouble understanding the error below. I am adding the source code and the traceback here. The code is supposed to load the necessary models, such as brits, rits, rits_i, etc. However, it is not working the way it should. Here is the source code; the getattr() call is near the end:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import numpy as np
import time
import utils
import models
import argparse
import data_loader
import pandas as pd
import ujson as json
from sklearn import metrics
from ipdb import set_trace
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=1000)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--model', type=str)
parser.add_argument('--hid_size', type=int)
parser.add_argument('--impute_weight', type=float)
parser.add_argument('--label_weight', type=float)
args = parser.parse_args()
def train(model):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    data_iter = data_loader.get_loader(batch_size=args.batch_size)
    for epoch in range(args.epochs):
        model.train()
        run_loss = 0.0
        for idx, data in enumerate(data_iter):
            data = utils.to_var(data)
            ret = model.run_on_batch(data, optimizer, epoch)
            run_loss += ret['loss'].item()
            print('\r Progress epoch {}, {:.2f}%, average loss {}'.format(epoch, (idx + 1) * 100.0 / len(data_iter), run_loss / (idx + 1.0)), evaluate(model, data_iter))
def evaluate(model, val_iter):
    model.eval()
    labels = []
    preds = []
    evals = []
    imputations = []
    save_impute = []
    save_label = []
    for idx, data in enumerate(val_iter):
        data = utils.to_var(data)
        ret = model.run_on_batch(data, None)
        # save the imputation results which is used to test the improvement of traditional methods with imputed values
        save_impute.append(ret['imputations'].data.cpu().numpy())
        save_label.append(ret['labels'].data.cpu().numpy())
        pred = ret['predictions'].data.cpu().numpy()
        label = ret['labels'].data.cpu().numpy()
        is_train = ret['is_train'].data.cpu().numpy()
        eval_masks = ret['eval_masks'].data.cpu().numpy()
        eval_ = ret['evals'].data.cpu().numpy()
        imputation = ret['imputations'].data.cpu().numpy()
        evals += eval_[np.where(eval_masks == 1)].tolist()
        imputations += imputation[np.where(eval_masks == 1)].tolist()
        # collect test label & prediction
        pred = pred[np.where(is_train == 0)]
        label = label[np.where(is_train == 0)]
        labels += label.tolist()
        preds += pred.tolist()
    labels = np.asarray(labels).astype('int32')
    preds = np.asarray(preds)
    print('AUC {}'.format(metrics.roc_auc_score(labels, preds)))
    evals = np.asarray(evals)
    imputations = np.asarray(imputations)
    print('MAE', np.abs(evals - imputations).mean())
    print('MRE', np.abs(evals - imputations).sum() / np.abs(evals).sum())
    save_impute = np.concatenate(save_impute, axis=0)
    save_label = np.concatenate(save_label, axis=0)
    np.save('./result/{}_data'.format(args.model), save_impute)
    np.save('./result/{}_label'.format(args.model), save_label)
def run():
    model = getattr(models, args.model).Model(args.hid_size, args.impute_weight, args.label_weight)
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Total params is {}'.format(total_params))
    if torch.cuda.is_available():
        model = model.cuda()
    train(model)

if __name__ == '__main__':
    run()
Here is the traceback I have:
runfile('E:/GAIN Time Series Data Imputation/BRITS-master/main.py', wdir='E:/GAIN Time Series Data Imputation/BRITS-master')
Traceback (most recent call last):
File "E:\GAIN Time Series Data Imputation\BRITS-master\main.py", line 120, in <module>
run()
File "E:\GAIN Time Series Data Imputation\BRITS-master\main.py", line 109, in run
model = getattr(models, args.model).Model(args.hid_size, args.impute_weight, args.label_weight)
TypeError: getattr(): attribute name must be string
The necessary models (rits, brits, etc.) are under the models folder, but the function just does not load them. I am looking for a resolution.
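For context, getattr() raises exactly this TypeError when its second argument is None, which is what args.model is if the script is started without --model (for example via Spyder's runfile with no command-line arguments). A minimal sketch of making the argument mandatory; the 'brits' value is only an example:

import argparse

# Reduced parser for illustration; the real script defines several more arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True)  # required, so args.model can never be None

# Simulates: python main.py --model brits
args = parser.parse_args(['--model', 'brits'])
print(args.model)  # 'brits' -> getattr(models, args.model) now receives a string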

Python KNN Regression

I am trying to predict a car's MPG using a KNN algorithm. I first cleaned my data and made test and training datasets, and then I wrote a normalized and a non-normalized KNN function. Now I am trying to pass my testing data through the KNN algorithm and create a list of all the predictions. I then want to use mean squared error to analyze my predictions. Currently, I have not been able to set up the function that passes my testing data through. Any guidance would be greatly appreciated!
import pandas as pd
import numpy as np
import math
from google.colab import drive
drive.mount('/content/drive')
pd.set_option('display.max_columns', 100)
vehicles = pd.read_csv('/content/drive/MyDrive/CS_167/vehicles (2).csv')
subset_cars = vehicles[vehicles["fuelType"] == 'Regular']
final_sub = subset_cars[["comb08", "year", "cylinders", "displ"]]
column_nulls = final_sub.isna().any()
Cylinder_no_null = final_sub.cylinders.dropna()
displ_no_null = final_sub.displ.dropna()
pure_data = final_sub.dropna()
# pure_data.head()
shuffled_data = pure_data.sample(frac=1, random_state=41)
test_data = shuffled_data.iloc[0:500]
train_data = shuffled_data.iloc[500:]
train_data_euc = train_data.copy()
test_data_euc = test_data.copy()
def Regression_KNN(MPG, train_data_euc, k):
    train_data_euc['euc_dis'] = np.sqrt(
        (MPG['year']-train_data_euc['year'])**2+
        (MPG['cylinders']-train_data_euc['cylinders'])**2+
        (MPG['displ']-train_data_euc['displ'])**2)
    sorted_train_data = train_data_euc.sort_values(['euc_dis'])
    prediction = sorted_train_data.iloc[0:k]['comb08'].mean()
    return prediction
MPG ={}
MPG['year'] = 2020
MPG['cylinders'] = 4
MPG['displ'] = 5.2
print(f"The average MPG for this car is: %d" %Regression_KNN(MPG, train_data_euc, 5))
z_train_copy = train_data_euc.copy()
z_train_year_std = z_train_copy['year'].std()
z_train_year_mean = z_train_copy['year'].mean()
z_train_cylinders_std = z_train_copy['cylinders'].std()
z_train_cylinders_mean = z_train_copy['cylinders'].mean()
z_train_displ_std = z_train_copy['displ'].std()
z_train_displ_mean = z_train_copy['displ'].mean()
z_train_euc_std = z_train_copy['euc_dis'].std()
z_train_euc_mean = z_train_copy['euc_dis'].mean()
z_train_copy['year'] = (z_train_copy['year'] - z_train_year_mean)/z_train_year_std
z_train_copy['cylinders'] = (z_train_copy['cylinders'] - z_train_cylinders_mean)/z_train_cylinders_std
z_train_copy['displ'] = (z_train_copy['displ'] - z_train_displ_mean)/z_train_displ_std
z_train_copy['euc_dis'] = (z_train_copy['euc_dis'] - z_train_euc_mean)/z_train_euc_std
def Z_TRAIN_KNN(MPG, z_train_copy, k):
    z_train_copy['euc_dis'] = np.sqrt(
        (MPG['year']-z_train_copy['year'])**2+
        (MPG['cylinders']-z_train_copy['cylinders'])**2+
        (MPG['displ']-z_train_copy['displ'])**2)
    z_train_sorted_data = z_train_copy.sort_values(['euc_dis'])
    z_train_prediction = z_train_sorted_data.iloc[0:k]['comb08'].mean()
    return z_train_prediction
MPG ={}
MPG['year'] = 2020
MPG['cylinders'] = 4
MPG['displ'] = 5.2
print(f"The average MPG for this car is: %d" %Z_TRAIN_KNN(MPG, z_train_copy, 5))
def regression_all_kNN(test_data_euc, z_train_data, k):
    #apply the classify_kNN function to each item in the test data with the train
    #data and k passed as the other two arguments. The result will be a series of
    #the individual results.
    for i in test_data:
        z_train_data['euc_dis'] = np.sqrt(
            (test_data['year']- z_train_data['year'])**2+
            (test_data['cylinders']- z_train_data['cylinders'])**2+
            (test_data['displ']- z_train_data['displ'])**2)
        sorted_train_data = z_train_data.sort_values(['euc_dis'])
        prediction = test_data.apply(regression_all_kNN, args=(z_train_data, k))
    return prediction
predictions5NN = regression_all_kNN(test_data, train_data, 5)
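One possible direction, offered as a sketch rather than the poster's code: since Regression_KNN already predicts one car at a time, the test set can be run through it row by row with DataFrame.apply and then scored with mean squared error. This hedged rewrite of regression_all_kNN reuses the Regression_KNN function and the test_data/train_data DataFrames defined above and assumes comb08 is the target column:

from sklearn.metrics import mean_squared_error

def regression_all_kNN(test_rows, train_rows, k):
    # axis=1 hands each test row (a Series with 'year', 'cylinders', 'displ') to Regression_KNN
    return test_rows.apply(Regression_KNN, args=(train_rows, k), axis=1)

predictions5NN = regression_all_kNN(test_data, train_data, 5)
mse = mean_squared_error(test_data['comb08'], predictions5NN)
print("5-NN test MSE:", mse)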

Machine Learning Web Application error in Flask

I am a newbie in machine learning model deployment and have been trying to deploy a simple machine learning model for car price prediction with Flask and Heroku. I built the model using an sklearn pipeline and column transformer, and the code runs perfectly in the Jupyter Notebook.
Deploying it to Heroku through GitHub shows that the build succeeded. However, on launching the application, it shows an application error.
It seems something is wrong with the app.py code for Flask. Any help or insight would be really helpful. Thank you.
Jupyter Notebook Code:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('car data.csv')
data['current_year'] = 2020
data['car_age'] = data['current_year'] - data['Year']
num_features = [col for col in data.columns if data[col].dtype != 'O' and col != 'Selling_Price']
cat_features = [col for col in data.columns if data[col].dtype == 'O']
X= data.drop(['Selling_Price'], axis=1)
y = data['Selling_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
num_transformer = Pipeline(steps = [('scaler', StandardScaler())])
cat_transformer = Pipeline(steps =[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers = [('numerical_transformer', num_transformer, num_features),
                                                 ('categorical_transformer', cat_transformer, cat_features)])
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
grid_param = {'n_estimators' :[100, 200, 500, 800, 1000]}
grid = GridSearchCV(estimator = rf, param_grid= grid_param)
model = Pipeline(steps= [('preprocessor',preprocessor), ('grid_regressor', grid)])
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error
MAE = mean_absolute_error(y_test, y_predict)
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
import pickle
file = open('regression_model.pkl', 'wb')
pickle.dump(model, file)
Code for the app.py file for Flask:
from flask import Flask, render_template, request
import jsonify
import pickle
import numpy as np
import sklearn
model = pickle.load(open('regression_model.pkl', 'rb'))
app = Flask(__name__)
#app.route('/', methods=['GET'])
def Home():
    return render_template('index.html')
#app.route("/predict", methods= ['POST']
def predict():
    if request.method == 'POST':
        Year = int(request.form['Year'])
        Selling_Price = float(request.form['Selling_Price'])
        Present_Price = float(request.form['Present_Price'])
        Kms_Driven = int(request.form['Kms_Driven'])
        Owner = int(request.form['Owner'])
        car_age = 2020 - Year
        Fuel_Type_Petrol = request.form['Fuel_Type_Petrol']
        if(Fuel_Type_Petrol=='Petrol'):
            Fuel_Type_Petrol=1
            Fuel_Type_Diesel=0
        elif (Fuel_Type_Petrol =='Diesel'):
            Fuel_Type_Petrol=0
            Fuel_Type_Diesel=1
        else:
            Fuel_Type_Petrol=0
            Fuel_Type_Diesel=0
        Seller_Type_Individual = request.form['Seller_Type_Individual']
        if(Seller_Type_Individual =='Individual'):
            Seller_Type_Individual=1
        else:
            Seller_Type_Individual=0
        Transmission_Mannual=request.form['Transmission_Mannual']
        if(Transmission_Mannual=='Mannual'):
            Transmission_Mannual=1
        else:
            Transmission_Mannual=0
        data = [Selling_Price, Present_Price, Kms_Driven, Owner, Fuel_Type_Petrol, Fuel_Type_Diesel, Seller_Type_Individual, Transmission_Mannual, car_age]
        prediction = model.predict([data])
        output = round(prediction[0], 2)
        if output <0:
            return render_template('index.html', prediction_texts="Sorry you cannot sell this car")
        else:
            return render_template('index.html', prediction_texts= "You can sell the car at {}".format(output))
    else:
        render_template('index.html')
if __name__ == '__main__':
    app.run(debug=True)
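One observation, offered as a likely cause rather than a confirmed diagnosis: in the posted app.py the route lines start with # instead of @, so Python treats them as comments and no URL is ever registered (the second one is also missing its closing parenthesis), and jsonify normally comes from flask rather than a top-level import jsonify. A minimal sketch of how the routes are usually declared, assuming an index.html template exists alongside the app:

from flask import Flask, render_template, request
import pickle

app = Flask(__name__)
model = pickle.load(open('regression_model.pkl', 'rb'))

@app.route('/', methods=['GET'])            # decorator needs the @ prefix, not #
def home():
    return render_template('index.html')

@app.route('/predict', methods=['POST'])    # note the closing parenthesis
def predict():
    # ... read request.form and build the model input as in the post ...
    return render_template('index.html', prediction_texts="...")

if __name__ == '__main__':
    app.run(debug=True)

Separately, because the pickled Pipeline contains a ColumnTransformer that was fitted on a DataFrame with named columns, the prediction input probably needs to be a one-row pandas DataFrame with those same column names rather than a plain Python list.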

ValueError: could not convert string to float: sklearn

Recently I was working with a dataset in Python and got an unexpected error: ValueError: could not convert string to float. The dataset also contains text data, which I converted to integers with LabelEncoder. But when I get to the training part where I fit the model, I get this error, which makes no sense to me.
code:
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import pickle
import numpy as np
data = pd.read_csv("house_train.csv")
data = data.fillna(value=0)
dataX_train = data.drop(["SalePrice"], axis = 1)
dataX_test = data.SalePrice
le = preprocessing.LabelEncoder()
dataX_train.MSZoning = le.fit_transform(list(data["MSZoning"]))
dataX_train.Street = le.fit_transform(list(data["Street"]))
dataX_train.Alley = le.fit_transform(list(data["Alley"]))
dataX_train.LotShape = le.fit_transform(list(data["LotShape"]))
dataX_train.LandContour = le.fit_transform(list(data["LandContour"]))
dataX_train.Utilities = le.fit_transform(list(data["Utilities"]))
dataX_train.LotConfig = le.fit_transform(list(data["LotConfig"]))
dataX_train.LandSlope = le.fit_transform(list(data["LandSlope"]))
dataX_train.Neighborhood = le.fit_transform(list(data["Neighborhood"]))
dataX_train.Condition1 = le.fit_transform(list(data["Condition1"]))
dataX_train.Condition2 = le.fit_transform(list(data["Condition2"]))
dataX_train.BldgType = le.fit_transform(list(data["BldgType"]))
dataX_train.HouseStyle = le.fit_transform(list(data["HouseStyle"]))
dataX_train.RoofStyle = le.fit_transform(list(data["RoofStyle"]))
dataX_train.RoofMatl = le.fit_transform(list(data["RoofMatl"]))
dataX_train.Exterior1st = le.fit_transform(list(data["Exterior1st"]))
dataX_train.Exterior2nd = le.fit_transform(list(data["Exterior2nd"]))
dataX_train.MasVnrType = le.fit_transform(list(data["MasVnrType"]))
dataX_train.ExterQual = le.fit_transform(list(data["ExterQual"]))
dataX_train.ExterCond = le.fit_transform(list(data["ExterCond"]))
dataX_train.Foundation = le.fit_transform(list(data["Foundation"]))
dataX_train.BsmtQual = le.fit_transform(list(data["BsmtQual"]))
dataX_train.BsmtExposure = le.fit_transform(list(data["BsmtExposure"]))
dataX_train.BsmtFinType1 = le.fit_transform(list(data["BsmtFinType1"]))
dataX_train.BsmtFinType2 = le.fit_transform(list(data["BsmtFinType2"]))
dataX_train.Heating = le.fit_transform(list(data["Heating"]))
dataX_train.HeatingQC = le.fit_transform(list(data["HeatingQC"]))
dataX_train.CentralAir = le.fit_transform(list(data["CentralAir"]))
dataX_train.Electrical = le.fit_transform(list(data["Electrical"]))
dataX_train.KitchenQual = le.fit_transform(list(data["KitchenQual"]))
dataX_train.Functional = le.fit_transform(list(data["Functional"]))
dataX_train.FireplaceQu = le.fit_transform(list(data["FireplaceQu"]))
dataX_train.GarageType = le.fit_transform(list(data["GarageType"]))
dataX_train.GarageFinish = le.fit_transform(list(data["GarageFinish"]))
dataX_train.GarageQual = le.fit_transform(list(data["GarageQual"]))
dataX_train.GarageCond = le.fit_transform(list(data["GarageCond"]))
dataX_train.PavedDrive = le.fit_transform(list(data["PavedDrive"]))
dataX_train.PoolQC = le.fit_transform(list(data["PoolQC"]))
dataX_train.Fence = le.fit_transform(list(data["Fence"]))
dataX_train.MiscFeature = le.fit_transform(list(data["MiscFeature"]))
dataX_train.SaleType = le.fit_transform(list(data["SaleType"]))
dataX_train.SaleCondition = le.fit_transform(list(data["SaleCondition"]))
best = 0
x_train, x_test, y_train, y_test = model_selection.train_test_split(dataX_train, dataX_test,
                                                                    test_size = 0.2)
clf = linear_model.LinearRegression()
clf.fit(x_train, y_train)
acc = clf.score(x_test, y_test)
if acc > best:
    best = acc
    with open("housingmodel.pickle", "wb") as f:
        pickle.dump(clf, f)
print(acc)
First of all, check whether you encoded all of your features in dataX_train; I think you missed something there.
Try dataX_train.dtypes and check if any columns are still non-numeric, then use to_numeric on those columns. For example:
dataX_train['NonNumericCol'] = dataX_train['NonNumericCol'].apply(pd.to_numeric)
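Building on that answer, a small sketch for finding whatever is still non-numeric (the usual culprit, since LinearRegression.fit fails on the first string it meets); NonNumericCol above and the loop below are illustrative, not columns from the actual dataset:

# Columns that are still strings/objects after the LabelEncoder pass
leftover = dataX_train.select_dtypes(include='object').columns
print(leftover)

# Encode whatever remains, reusing the LabelEncoder `le` from the question
# (or convert with pd.to_numeric where the values are really numbers)
for col in leftover:
    dataX_train[col] = le.fit_transform(dataX_train[col].astype(str))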

AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'

I was experimenting with NLP and working on sarcasm detection, and in the meantime I had put together this code.
sarcasmextractor.py
# coding: utf-8
# Importing the library
# In[2]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji
# Define a class to load the SentimentWordnet and write methods to calculate the scores
# In[4]:
class load_senti_word_net(object):
    """
    constructor to load the file and read the file as CSV
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss
    synsetTerms can have multiple similar words like abducting#1 abducent#1 and will read each one and calculate the scores
    """
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t',quotechar='"')
            for line in file_content:
                if line[0].startswith('#') :
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-","").replace("_","")
                    key = "%s/%s"%(pos,term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore),float(NegScore)))
                    except:
                        sent_scores[key].append((0,0))
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value,axis=0)
        self.sent_scores = sent_scores

    """
    For a word,
    nltk.pos_tag(["Suraj"])
    [('Suraj', 'NN')]
    """
    def score_word(self, word):
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)

    def score(self,word, pos):
        """
        Identify the type of POS, get the score from the senti_scores and return the score
        """
        if pos[0:2] == 'NN':
            pos_type = 'n'
        elif pos[0:2] == 'JJ':
            pos_type = 'a'
        elif pos[0:2] =='VB':
            pos_type='v'
        elif pos[0:2] =='RB':
            pos_type = 'r'
        else:
            pos_type = 0
        if pos_type != 0 :
            loc = pos_type+'/'+word
            score = self.sent_scores[loc]
            if len(score)>1:
                return score
            else:
                return np.array([0.0,0.0])
        else:
            return np.array([0.0,0.0])

    """
    Repeat the same for a sentence
    nltk.pos_tag(word_tokenize("My name is Suraj"))
    [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]
    """
    def score_sentencce(self, sentence):
        pos = nltk.pos_tag(sentence)
        print (pos)
        mean_score = np.array([0.0, 0.0])
        for i in range(len(pos)):
            mean_score += self.score(pos[i][0], pos[i][1])
        return mean_score

    def pos_vector(self, sentence):
        pos_tag = nltk.pos_tag(sentence)
        vector = np.zeros(4)
        for i in range(0, len(pos_tag)):
            pos = pos_tag[i][1]
            if pos[0:2]=='NN':
                vector[0] += 1
            elif pos[0:2] =='JJ':
                vector[1] += 1
            elif pos[0:2] =='VB':
                vector[2] += 1
            elif pos[0:2] == 'RB':
                vector[3] += 1
        return vector
# Now let's extract the features
#
# ###Stemming and Lemmatization
# In[5]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()
# In[7]:
def gram_features(features,sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    #print (grams)
    for t in grams:
        features['contains(%s)'%t]=1.0
# In[8]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]
    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    #print(mean_sentiment[0], mean_sentiment[1])
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")
    first_half = token[0:int(len(token)/2)]
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0]-mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")
    second_half = token[int(len(token)/2):]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0]-mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")
# In[9]:
features = {}
sentiment_extract(features,"a long narrow opening")
# In[11]:
def pos_features(features,sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [ porter.stem(each.lower()) for each in token]
    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_'+str(j+1)] = pos_vector[j]
    print ("done")
# In[12]:
features = {}
pos_features(features,"a long narrow opening")
# In[13]:
def capitalization(features,sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print (count)
# In[14]:
features = {}
capitalization(features,"A LoNg NArrow opening")
# In[15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')
# In[16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))
# In[17]:
def topic_feature(features,sentence,topic_modeler):
    topics = topic_modeler.transform(sentence)
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
# In[18]:
topic_feature(features,"A LoNg NArrow opening",topic_mod)
# In[19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features,sentence)
    pos_features(features,sentence)
    sentiment_extract(features, sentence)
    capitalization(features,sentence)
    topic_feature(features, sentence,topic_modeler)
    return features
# In[20]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()
# In[17]:
import re
for i in range(0,df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+','',temp)
    featureset.append((get_features(temp,topic_mod), df["label"][i]))
# In[20]:
c = []
for i in range(0,len(featureset)):
    c.append(pd.DataFrame(featureset[i][0],index=[i]))
result = pd.concat(c)
# In[22]:
result.insert(loc=0,column="label",value='0')
# In[23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]
# In[25]:
result.to_csv('feature_dataset.csv')
# In[3]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()
# In[4]:
get_ipython().magic('matplotlib inline')
import matplotlib as matplot
import seaborn
result = df
# In[5]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values
# In[6]:
Y = result['label']
# In[7]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
# In[29]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
"RandomForest":ek.RandomForestClassifier(n_estimators=50),
"Adaboost":ek.AdaBoostClassifier(n_estimators=50),
"GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
"GNB":GaussianNB(),
"Logistic Regression":LinearRegression()
}
# In[8]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)
# In[9]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())
X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())
# In[38]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train,y_train.astype(int))
    score = clf.score(X_test,y_test.astype(int))
    print ("%s : %s " %(algo, score))
    results_algo[algo] = score
# In[39]:
winner = max(results_algo, key=results_algo.get)
# In[40]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))
# In[41]:
from sklearn import metrics
print (metrics.classification_report(y_test, res))
# In[34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"
# In[101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."
# In[82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"
# In[102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))
# In[104]:
test_feature
# In[105]:
c = []
c.append(pd.DataFrame(test_feature[0],index=[i]))
test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'],axis=1).values
# In[106]:
res= clf.predict(test_result)
But it is giving me the following error:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
topic_feature(features,"A LoNg NArrow opening",topic_mod)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
topics = topic_modeler.transform(sentence)
File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
return self.lda[corpus_sentence]
File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'
Code for topic.py:
from gensim import corpora, models, similarities
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import replace_emoji
class topic(object):
    def __init__(self, nbtopic = 100, alpha=1,model=None,dicttp=None):
        self.nbtopic = nbtopic
        self.alpha = alpha
        self.porter = nltk.PorterStemmer()
        self.stop = stopwords.words('english')+['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        if model!=None and dicttp!=None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)

    def fit(self,documents):
        documents_mod = documents
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]
        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus,id2word=self.dictionary, num_topics=self.nbtopic,alpha=self.alpha)
        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')

    def get_topic(self,topic_number):
        return self.lda.print_topic(topic_number)

    def transform(self,sentence):
        sentence_mod = sentence
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
The overall code can be found here: overall code.
The minimum_phi_value is a property of LdaModel that is set when an instance is created, and for some reason it hasn't been serialized (which is pretty strange, probably a bug).
To work around this particular issue you can add
self.lda.minimum_phi_value = 0.01
... after self.lda is loaded, or avoid saving/restoring the model if possible (i.e. always train it).
But I encourage you to examine the fields of self.lda before and after serialization to check that they are identical.
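A small sketch of that workaround placed inside topic.__init__ right after loading; the 0.01 value comes from the answer above, and the per_word_topics line is only added because the traceback's __getitem__ call reads that attribute as well (adjust or drop it if your gensim version differs):

if model is not None and dicttp is not None:
    self.lda = models.ldamodel.LdaModel.load(model)
    self.dictionary = corpora.Dictionary.load(dicttp)
    # Restore attributes that were not serialized with the old model file
    if not hasattr(self.lda, 'minimum_phi_value'):
        self.lda.minimum_phi_value = 0.01
    if not hasattr(self.lda, 'per_word_topics'):
        self.lda.per_word_topics = False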
