Up to which part do I need to dump my K-means clustering model, so that only the input has to be given to recommend_songs and the output is displayed?
Below is my ML program.
I used K-means clustering to cluster both genres and songs for music recommendation.
import os
import sys
import json
import difflib
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials

warnings.filterwarnings("ignore")
data = pd.read_csv('./data.csv')
genre_data = pd.read_csv("./data_w_genres.csv")
year_data = pd.read_csv("./data_by_year.csv")
artist_data = pd.read_csv("./data_by_artist.csv")
# Note: this bare KMeans(...) call constructs an estimator and immediately discards it,
# so it has no effect; the actual clustering is done by the pipelines below.
# sklearn.cluster.KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
#                        verbose=0, random_state=None, copy_x=True, algorithm='auto')
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('kmeans', KMeans(n_clusters=20, verbose=False))],
                                 verbose=False)
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']
os.environ['SPOTIPY_CLIENT_ID'] = 'd267a64373c94780b083db4b58cb5c76'
os.environ['SPOTIPY_CLIENT_SECRET'] = '8244aa724749468a8314c7305ea9db44'
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=os.environ["SPOTIPY_CLIENT_ID"],
                                                           client_secret=os.environ["SPOTIPY_CLIENT_SECRET"]))

def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q='track: {} year: {}'.format(name, year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit','instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']
def get_song_data(song, spotify_data):
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name'])
                                 & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    except IndexError:
        return find_song(song['name'], song['year'])

def get_mean_vector(song_list, spotify_data):
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)

    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

def flatten_dict_list(dict_list):
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict
def recommend_songs(song_list, data, n_songs=30):
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])

    rec_songs = data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    dictionary = rec_songs[metadata_cols].to_dict(orient='records')
    json_ob = json.dumps(dictionary, indent=2)  # json.dumps takes `indent`, not `index`
    with open("output.json", "w") as outfile:
        outfile.write(json_ob)
recommend_songs([{'name': 'Come As You Are', 'year': 1991},
                 {'name': 'Smells Like Teen Spirit', 'year': 1991},
                 {'name': 'Lithium', 'year': 1992},
                 {'name': 'All Apologies', 'year': 1993},
                 {'name': 'Stay Away', 'year': 1993}], data)
I tried joblib, but I don't know which part must be pickled so that only the input has to be given.
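For what it's worth, here is a minimal sketch of one possible split (the file names are my own choice, and it assumes the training script above has already been run). The only fitted objects that recommend_songs() actually depends on are song_cluster_pipeline (through its StandardScaler step) and the data DataFrame; the genre/t-SNE/PCA parts are only used for visualisation.

import joblib
import pandas as pd

# --- at the end of the training script above ---
joblib.dump(song_cluster_pipeline, 'song_cluster_pipeline.pkl')  # fitted scaler + KMeans
data.to_pickle('songs_with_clusters.pkl')                        # songs incl. cluster_label column

# --- in a separate inference script (which also defines or imports find_song,
#     get_song_data, get_mean_vector, flatten_dict_list and recommend_songs) ---
song_cluster_pipeline = joblib.load('song_cluster_pipeline.pkl')
data = pd.read_pickle('songs_with_clusters.pkl')

# Now only the user input is needed; output.json is written by recommend_songs().
recommend_songs([{'name': 'Come As You Are', 'year': 1991}], data)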
Related
In my program,
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance

def MLR_pImportance(sc_mlr_tube_par_X_train, sc_mlr_tube_par_X_test, sc_mlr_tube_eff_Y_train, sc_mlr_tube_eff_Y_test):
    mlr = linear_model.LinearRegression()
    mlr.fit(sc_mlr_tube_par_X_train, sc_mlr_tube_eff_Y_train)

    perm = PermutationImportance(mlr, random_state=1).fit(sc_mlr_tube_par_X_test, sc_mlr_tube_eff_Y_test)
    print(perm.feature_importances_)
    print(perm.feature_importances_std_)
    eli5.show_weights(perm)

if __name__ == '__main__':
    (cnn_tube_par_X_train, cnn_tube_par_X_test, cnn_tube_Y_train, cnn_tube_Y_test) = read_file()

    sc_X = StandardScaler()
    sc_Y = StandardScaler()
    sc_cnn_tube_par_X_train = sc_X.fit_transform(cnn_tube_par_X_train.iloc[:, 1:6].values)
    sc_cnn_tube_par_X_test = sc_X.transform(cnn_tube_par_X_test.iloc[:, 1:6].values)
    sc_cnn_tube_eff_Y_train = sc_Y.fit_transform(cnn_tube_Y_train.iloc[:, -1:].values)
    sc_cnn_tube_eff_Y_test = sc_Y.transform(cnn_tube_Y_test.iloc[:, -1:].values)

    MLR_pImportance(sc_cnn_tube_par_X_train, sc_cnn_tube_par_X_test, sc_cnn_tube_eff_Y_train, sc_cnn_tube_eff_Y_test)
The results show that:
[0.63895352 0.1270582 0.06904505 0.32131836 0.02549574]
[0.02766096 0.01535046 0.01789114 0.02761288 0.01048179]
These are the results of
print(perm.feature_importances_)
print(perm.feature_importances_std_)
but the statement:
eli5.show_weights(perm)
shows nothing.
Could you tell me the reason, and how to solve it?
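A likely reason, as a sketch assuming the perm object from the code above and eli5's standard formatters: eli5.show_weights() returns an IPython.display.HTML object, which is only rendered when it is the last expression of a Jupyter/IPython notebook cell. In a plain Python script the return value is silently discarded, so nothing appears. In a script you can print a text version of the same explanation, or write the HTML to a file:

import eli5

# In a notebook: only works as the last expression of a cell (the HTML is auto-displayed).
eli5.show_weights(perm)

# In a script: format the same explanation as text and print it.
print(eli5.format_as_text(eli5.explain_weights(perm)))

# Or dump the rendered HTML to a file and open it in a browser.
with open('permutation_importance.html', 'w') as f:
    f.write(eli5.show_weights(perm).data)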
I am a newbie in machine learning model deployment and have been trying to deploy a simple machine learning model for car price prediction with Flask and Heroku. I built the model using an sklearn pipeline and transformer, and the code runs perfectly in the Jupyter Notebook.
Deploying it to Heroku through GitHub shows that the build succeeded; however, on launching the application, it shows an application error.
It seems something is wrong with the app.py code for Flask. Any help or insight would be really helpful. Thank you.
Jupyter Notebook Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('car data.csv')
data['current_year'] = 2020
data['car_age'] = data['current_year'] - data['Year']
num_features = [col for col in data.columns if data[col].dtype != 'O' and col != 'Selling_Price']
cat_features = [col for col in data.columns if data[col].dtype == 'O']
X= data.drop(['Selling_Price'], axis=1)
y = data['Selling_Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
num_transformer = Pipeline(steps = [('scaler', StandardScaler())])
cat_transformer = Pipeline(steps =[('OneHot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('numerical_transformer', num_transformer, num_features),
                                               ('categorical_transformer', cat_transformer, cat_features)])
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
grid_param = {'n_estimators' :[100, 200, 500, 800, 1000]}
grid = GridSearchCV(estimator = rf, param_grid= grid_param)
model = Pipeline(steps= [('preprocessor',preprocessor), ('grid_regressor', grid)])
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error
MAE = mean_absolute_error(y_test, y_predict)
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
import pickle
# Use a context manager so the pickle file is flushed and closed properly.
with open('regression_model.pkl', 'wb') as file:
    pickle.dump(model, file)
Code for the app.py file for Flask:
from flask import Flask, render_template, request, jsonify
import pickle
import numpy as np
import sklearn

model = pickle.load(open('regression_model.pkl', 'rb'))
app = Flask(__name__)

@app.route('/', methods=['GET'])
def Home():
    return render_template('index.html')

@app.route("/predict", methods=['POST'])
def predict():
    if request.method == 'POST':
        Year = int(request.form['Year'])
        Selling_Price = float(request.form['Selling_Price'])
        Present_Price = float(request.form['Present_Price'])
        Kms_Driven = int(request.form['Kms_Driven'])
        Owner = int(request.form['Owner'])
        car_age = 2020 - Year

        Fuel_Type_Petrol = request.form['Fuel_Type_Petrol']
        if Fuel_Type_Petrol == 'Petrol':
            Fuel_Type_Petrol = 1
            Fuel_Type_Diesel = 0
        elif Fuel_Type_Petrol == 'Diesel':
            Fuel_Type_Petrol = 0
            Fuel_Type_Diesel = 1
        else:
            Fuel_Type_Petrol = 0
            Fuel_Type_Diesel = 0

        Seller_Type_Individual = request.form['Seller_Type_Individual']
        if Seller_Type_Individual == 'Individual':
            Seller_Type_Individual = 1
        else:
            Seller_Type_Individual = 0

        Transmission_Mannual = request.form['Transmission_Mannual']
        if Transmission_Mannual == 'Mannual':
            Transmission_Mannual = 1
        else:
            Transmission_Mannual = 0

        data = [Selling_Price, Present_Price, Kms_Driven, Owner, Fuel_Type_Petrol, Fuel_Type_Diesel,
                Seller_Type_Individual, Transmission_Mannual, car_age]
        prediction = model.predict([data])
        output = round(prediction[0], 2)
        if output < 0:
            return render_template('index.html', prediction_texts="Sorry you cannot sell this car")
        else:
            return render_template('index.html', prediction_texts="You can sell the car at {}".format(output))
    else:
        return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True)
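One thing worth double-checking inside predict(), as a sketch only: the pickled model is a Pipeline whose ColumnTransformer selects columns by name, so model.predict() expects a one-row DataFrame with the same raw columns as the training data, not a hand-encoded list of numbers. The column names below are assumptions based on the usual car data.csv, so match them to X_train.columns; and running heroku logs --tail will show the actual traceback behind the "application error".

import pandas as pd

# Hypothetical reconstruction of the input row inside predict(); the column names are
# assumptions and must match the columns of the X_train used to fit the pipeline.
row = pd.DataFrame([{
    'Car_Name': request.form.get('Car_Name', 'unknown'),    # assumed column name
    'Year': Year,
    'Present_Price': Present_Price,
    'Kms_Driven': Kms_Driven,
    'Fuel_Type': request.form['Fuel_Type_Petrol'],           # raw category, e.g. 'Petrol'
    'Seller_Type': request.form['Seller_Type_Individual'],   # raw category, e.g. 'Individual'
    'Transmission': request.form['Transmission_Mannual'],    # raw category, e.g. 'Manual'
    'Owner': Owner,
    'current_year': 2020,
    'car_age': car_age,
}])
prediction = model.predict(row)   # the ColumnTransformer does the scaling/one-hot encoding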
Going through the second chapter of Hands-On Machine Learning with Scikit-Learn & TensorFlow, I am running into the error stated above. It happens when I try to execute the following line:
linReg.fit(housingPrepared, housing_labels)
Researching online, it looks like it has something to do with the dimensions of my features and my labels not matching up. Printing the shapes of housingPrepared (X) and housing_labels (Y) yields the following result:
(16512, 16) (4128,)
I've spent the last hour going through the chapter line by line to see if I missed anything, but I can't find the problem. I'm wondering if someone here has an intuition about where a potential solution might lie.
Thank you so much in advance. All my code up to the problem line is posted below:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    housingTgz = tarfile.open(tgzPath)
    housingTgz.extractall(path=housingPath)
    housingTgz.close()

def loadHousingData(housingPath=HOUSING_PATH):
    return pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
housing = loadHousingData()
#plt.hist(housing['longitude'],bins=50)
#plt.show()
def splitTrainTesT(data, testRatio):
    shuffled_indices = np.random.permutation(len(data))
    testSetSize = int(len(data) * testRatio)
    testIndices = shuffled_indices[:testSetSize]
    trainIndices = shuffled_indices[testSetSize:]
    return data.iloc[trainIndices], data.iloc[testIndices]

def testSetCheck(identifier, testRatio):
    return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32

def splitTrainTestByID(data, testRatio, idColumn):
    ids = data[idColumn]
    inTestSet = ids.apply(lambda id_: testSetCheck(id_, testRatio))
    return data.loc[~inTestSet], data.loc[inTestSet]
#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")
trainSet, testSet = train_test_split(housing,test_size=0.2,random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
#plt.hist(housing["income_cat"])
#plt.show()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
    stratTrainSet = housing.loc[trainIndex]
    stratTestSet = housing.loc[testIndex]

for set in (stratTrainSet, stratTestSet):
    set.drop("income_cat", axis=1, inplace=True)
housing = stratTrainSet.copy()
#print(housing)
#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))
#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()
""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
housing.dropna(subset=["total_bedrooms"])
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)
X = imputer.transform(housingNum)
housingTr = pd.DataFrame(X, columns=housingNum.columns)
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))
"""Custom Transformers For Rooms Per Household, etc"""
roomsIX, bedroomsIX, populationIX, householdsIX = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, addBedroomsPerRoom=True):
        self.addBedroomsPerRoom = addBedroomsPerRoom

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        roomsPerHousehold = X[:, roomsIX] / X[:, householdsIX]
        populationPerHousehold = X[:, populationIX] / X[:, householdsIX]
        if self.addBedroomsPerRoom:
            bedroomsPerRoom = X[:, bedroomsIX] / X[:, roomsIX]
            return np.c_[X, roomsPerHousehold, populationPerHousehold, bedroomsPerRoom]
        else:
            return np.c_[X, roomsPerHousehold, populationPerHousehold]

attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)

numPipeline = Pipeline([('imputer', Imputer(strategy='median')),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaler', StandardScaler()),
                        ])
housingNumTr = numPipeline.fit_transform(housingNum)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributeNames = attributeNames

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attributeNames].values

numAttribs = list(housingNum)
catAttribs = ["ocean_proximity"]

numPipeline = Pipeline([('selector', DataFrameSelector(numAttribs)),
                        ('imputer', Imputer(strategy='median')),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaler', StandardScaler()),
                        ])

"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
catPipeline = Pipeline([('selector', DataFrameSelector(catAttribs)),
                        ('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
                        ])
fullPipeline = FeatureUnion(transformer_list=[("num_pipeline", numPipeline), ("cat_pipeline", catPipeline),])
housingPrepared = fullPipeline.fit_transform(housing)
linReg = LinearRegression()
print(housingPrepared.shape, housing_labels.shape)
linReg.fit(housingPrepared, housing_labels)
I believe the problem is in these two lines:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
Change it to:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()
and you're good to go.
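A small sanity check along these lines (a sketch using the variable names from the question) catches this kind of train/test mix-up before fit() raises:

# Features and labels must come from the same split and have the same number of rows.
assert housingPrepared.shape[0] == housing_labels.shape[0], \
    "row mismatch: {} features vs {} labels".format(housingPrepared.shape[0], housing_labels.shape[0])
linReg.fit(housingPrepared, housing_labels)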
I was experimenting with NLP, working on sarcasm detection, and in the meantime I put together this code.
sarcasmextractor.py
# coding: utf-8
# Importing the library
# In[2]:
import io
import sys
import os
import numpy as np
import pandas as pd
import nltk
import gensim
import csv, collections
from textblob import TextBlob
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
import pickle
import replace_emoji
# Define a class to load the SentimentWordnet and write methods to calculate the scores
# In[4]:
class load_senti_word_net(object):
    """
    Constructor to load the file and read it as CSV.
    6 columns - pos, ID, PosScore, NegScore, synsetTerms, gloss.
    synsetTerms can have multiple similar words like abducting#1 abducent#1; we read each one and calculate the scores.
    """
    def __init__(self):
        sent_scores = collections.defaultdict(list)
        with io.open("SentiWordNet_3.0.0_20130122.txt") as fname:
            file_content = csv.reader(fname, delimiter='\t', quotechar='"')
            for line in file_content:
                if line[0].startswith('#'):
                    continue
                pos, ID, PosScore, NegScore, synsetTerms, gloss = line
                for terms in synsetTerms.split(" "):
                    term = terms.split("#")[0]
                    term = term.replace("-", "").replace("_", "")
                    key = "%s/%s" % (pos, term.split("#")[0])
                    try:
                        sent_scores[key].append((float(PosScore), float(NegScore)))
                    except:
                        sent_scores[key].append((0, 0))

        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)
        self.sent_scores = sent_scores

    """
    For a word,
    nltk.pos_tag(["Suraj"])
    [('Suraj', 'NN')]
    """
    def score_word(self, word):
        pos = nltk.pos_tag([word])[0][1]
        return self.score(word, pos)

    def score(self, word, pos):
        """
        Identify the type of POS, get the score from the senti_scores and return the score.
        """
        if pos[0:2] == 'NN':
            pos_type = 'n'
        elif pos[0:2] == 'JJ':
            pos_type = 'a'
        elif pos[0:2] == 'VB':
            pos_type = 'v'
        elif pos[0:2] == 'RB':
            pos_type = 'r'
        else:
            pos_type = 0

        if pos_type != 0:
            loc = pos_type + '/' + word
            score = self.sent_scores[loc]
            if len(score) > 1:
                return score
            else:
                return np.array([0.0, 0.0])
        else:
            return np.array([0.0, 0.0])

    """
    Repeat the same for a sentence
    nltk.pos_tag(word_tokenize("My name is Suraj"))
    [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Suraj', 'NNP')]
    """
    def score_sentencce(self, sentence):
        pos = nltk.pos_tag(sentence)
        print(pos)
        mean_score = np.array([0.0, 0.0])
        for i in range(len(pos)):
            mean_score += self.score(pos[i][0], pos[i][1])
        return mean_score

    def pos_vector(self, sentence):
        pos_tag = nltk.pos_tag(sentence)
        vector = np.zeros(4)
        for i in range(0, len(pos_tag)):
            pos = pos_tag[i][1]
            if pos[0:2] == 'NN':
                vector[0] += 1
            elif pos[0:2] == 'JJ':
                vector[1] += 1
            elif pos[0:2] == 'VB':
                vector[2] += 1
            elif pos[0:2] == 'RB':
                vector[3] += 1
        return vector
# Now let's extract the features
#
# ###Stemming and Lemmatization
# In[5]:
porter = nltk.PorterStemmer()
sentiments = load_senti_word_net()
# In[7]:
def gram_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(str(sentence))
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]

    bigrams = nltk.bigrams(token)
    bigrams = [tup[0] + ' ' + tup[1] for tup in bigrams]
    grams = token + bigrams
    #print (grams)
    for t in grams:
        features['contains(%s)' % t] = 1.0
# In[8]:
import string
def sentiment_extract(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(i.lower()) for i in token]

    mean_sentiment = sentiments.score_sentencce(token)
    features["Positive Sentiment"] = mean_sentiment[0]
    features["Negative Sentiment"] = mean_sentiment[1]
    features["sentiment"] = mean_sentiment[0] - mean_sentiment[1]
    #print(mean_sentiment[0], mean_sentiment[1])
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in token]).strip())
        features["Blob Polarity"] = text.sentiment.polarity
        features["Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["Blob Polarity"] = 0
        features["Blob Subjectivity"] = 0
        print("do nothing")

    first_half = token[0:int(len(token)/2)]
    mean_sentiment_half = sentiments.score_sentencce(first_half)
    features["positive Sentiment first half"] = mean_sentiment_half[0]
    features["negative Sentiment first half"] = mean_sentiment_half[1]
    features["first half sentiment"] = mean_sentiment_half[0] - mean_sentiment_half[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in first_half]).strip())
        features["first half Blob Polarity"] = text.sentiment.polarity
        features["first half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["first Blob Polarity"] = 0
        features["first Blob Subjectivity"] = 0
        print("do nothing")

    second_half = token[int(len(token)/2):]
    mean_sentiment_sechalf = sentiments.score_sentencce(second_half)
    features["positive Sentiment second half"] = mean_sentiment_sechalf[0]
    features["negative Sentiment second half"] = mean_sentiment_sechalf[1]
    features["second half sentiment"] = mean_sentiment_sechalf[0] - mean_sentiment_sechalf[1]
    try:
        text = TextBlob(" ".join([""+i if i not in string.punctuation and not i.startswith("'") else i for i in second_half]).strip())
        features["second half Blob Polarity"] = text.sentiment.polarity
        features["second half Blob Subjectivity"] = text.sentiment.subjectivity
        #print (text.sentiment.polarity,text.sentiment.subjectivity )
    except:
        features["second Blob Polarity"] = 0
        features["second Blob Subjectivity"] = 0
        print("do nothing")
# In[9]:
features = {}
sentiment_extract(features,"a long narrow opening")
# In[11]:
def pos_features(features, sentence):
    sentence_rep = replace_emoji.replace_reg(sentence)
    token = nltk.word_tokenize(sentence_rep)
    token = [porter.stem(each.lower()) for each in token]

    pos_vector = sentiments.pos_vector(token)
    for j in range(len(pos_vector)):
        features['POS_' + str(j + 1)] = pos_vector[j]
    print("done")
# In[12]:
features = {}
pos_features(features,"a long narrow opening")
# In[13]:
def capitalization(features, sentence):
    count = 0
    for i in range(len(sentence)):
        count += int(sentence[i].isupper())
    features['Capitalization'] = int(count > 3)
    print(count)
# In[14]:
features = {}
capitalization(features,"A LoNg NArrow opening")
# In[15]:
import topic
topic_mod = topic.topic(nbtopic=200,alpha='symmetric')
# In[16]:
topic_mod = topic.topic(model=os.path.join('topics.tp'),dicttp=os.path.join('topics_dict.tp'))
# In[17]:
def topic_feature(features, sentence, topic_modeler):
    topics = topic_modeler.transform(sentence)
    for j in range(len(topics)):
        features['Topic :'] = topics[j][1]
# In[18]:
topic_feature(features,"A LoNg NArrow opening",topic_mod)
# In[19]:
def get_features(sentence, topic_modeler):
    features = {}
    gram_features(features, sentence)
    pos_features(features, sentence)
    sentiment_extract(features, sentence)
    capitalization(features, sentence)
    topic_feature(features, sentence, topic_modeler)
    return features
# In[20]:
df = pd.DataFrame()
df = pd.read_csv("dataset_csv.csv", header=0, sep='\t')
df.head()
# In[17]:
import re

featureset = []  # collect (feature dict, label) pairs; the list was not initialised in the snippet
for i in range(0, df.size):
    temp = str(df["tweets"][i])
    temp = re.sub(r'[^\x00-\x7F]+', '', temp)
    featureset.append((get_features(temp, topic_mod), df["label"][i]))
# In[20]:
c = []
for i in range(0, len(featureset)):
    c.append(pd.DataFrame(featureset[i][0], index=[i]))
result = pd.concat(c)
# In[22]:
result.insert(loc=0,column="label",value='0')
# In[23]:
for i in range(0, len(featureset)):
    result["label"].loc[i] = featureset[i][1]
# In[25]:
result.to_csv('feature_dataset.csv')
# In[3]:
df = pd.DataFrame()
df = pd.read_csv("feature_dataset.csv", header=0)
df.head()
# In[4]:
get_ipython().magic('matplotlib inline')
import matplotlib as matplot
import seaborn
result = df
# In[5]:
X = result.drop(['label','Unnamed: 0','Topic :'],axis=1).values
# In[6]:
Y = result['label']
# In[7]:
import pickle
import pefile
import sklearn.ensemble as ek
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
# In[29]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
"RandomForest":ek.RandomForestClassifier(n_estimators=50),
"Adaboost":ek.AdaBoostClassifier(n_estimators=50),
"GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
"GNB":GaussianNB(),
"Logistic Regression":LinearRegression()
}
# In[8]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y ,test_size=0.2)
# In[9]:
X_train = pd.DataFrame(X_train)
X_train = X_train.fillna(X_train.mean())
X_test = pd.DataFrame(X_test)
X_test = X_test.fillna(X_test.mean())
# In[38]:
results_algo = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train.astype(int))
    score = clf.score(X_test, y_test.astype(int))
    print("%s : %s " % (algo, score))
    results_algo[algo] = score
# In[39]:
winner = max(results_algo, key=results_algo.get)
# In[40]:
clf = model[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))
# In[41]:
from sklearn import metrics
print (metrics.classification_report(y_test, res))
# In[34]:
test_data = "public meetings are awkard for me as I can insult people but I choose not to and that is something that I find difficult to live with"
# In[101]:
test_data="I purchased this product 4.47 billion years ago and when I opened it today, it was half empty."
# In[82]:
test_data="when people see me eating and ask me are you eating? No no I'm trying to choke myself to death #sarcastic"
# In[102]:
test_feature = []
test_feature.append((get_features(test_data,topic_mod)))
# In[104]:
test_feature
# In[105]:
c = []
c.append(pd.DataFrame(test_feature[0], index=[0]))  # index=[0]: a single test row (the original reused a leftover loop variable)
test_result = pd.concat(c)
test_result = test_result.drop(['Topic :'], axis=1).values
# In[106]:
res= clf.predict(test_result)
But it is giving me the following error:
C:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[('a', 'DT'), ('long', 'JJ'), ('narrow', 'JJ'), ('open', 'JJ')]
[('a', 'DT'), ('long', 'JJ')]
[('narrow', 'JJ'), ('open', 'JJ')]
done
5
Traceback (most recent call last):
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 276, in <module>
    topic_feature(features,"A LoNg NArrow opening",topic_mod)
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\sarcasm-extraction.py", line 268, in topic_feature
    topics = topic_modeler.transform(sentence)
  File "C:\shubhamprojectwork\sarcasm detection\SarcasmDetection-master\SarcasmDetection-master\Code\topic.py", line 42, in transform
    return self.lda[corpus_sentence]
  File "C:\ProgramData\Anaconda3\lib\site-packages\gensim\models\ldamodel.py", line 1160, in __getitem__
    return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
AttributeError: 'LdaModel' object has no attribute 'minimum_phi_value'
Code for topic.py:
from gensim import corpora, models, similarities
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import replace_emoji
class topic(object):
    def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
        self.nbtopic = nbtopic
        self.alpha = alpha
        self.porter = nltk.PorterStemmer()
        self.stop = stopwords.words('english') + ['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        if model is not None and dicttp is not None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)

    def fit(self, documents):
        documents_mod = documents
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]

        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.nbtopic, alpha=self.alpha)

        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')

    def get_topic(self, topic_number):
        return self.lda.print_topic(topic_number)

    def transform(self, sentence):
        sentence_mod = sentence
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
The overall code can be found here: overall code.
The minimum_phi_value is a property of LdaModel that is set when an instance is created, and for some reason it hasn't been serialized (which is pretty strange, probably a bug).
To work around this particular issue you can add
self.lda.minimum_phi_value = 0.01
... after loading self.lda, or avoid saving/restoring the model if possible (i.e. always train it).
But I encourage you to examine the fields of self.lda before and after serialization to check that they are identical.
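For example, a small loading helper along these lines (load_lda is a hypothetical name; 0.01 is assumed to match gensim's default, and per_word_topics is patched as well because the same __getitem__ call reads it):

from gensim import corpora, models

def load_lda(model_path, dict_path):
    """Load a saved LdaModel and Dictionary, patching attributes lost during serialization."""
    lda = models.ldamodel.LdaModel.load(model_path)
    dictionary = corpora.Dictionary.load(dict_path)
    if not hasattr(lda, 'minimum_phi_value'):
        lda.minimum_phi_value = 0.01      # assumed default value
    if not hasattr(lda, 'per_word_topics'):
        lda.per_word_topics = False       # also read by LdaModel.__getitem__
    return lda, dictionary

# In topic.__init__ this would replace the two .load(...) lines:
# self.lda, self.dictionary = load_lda(model, dicttp)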
I was trying to load a CSV in my Python code and use LR, LDA and various other algorithms to test their scores, and then show the results as a table in SQL Server.
I get the error "Invalid Parameter Type. param-index=0 param-type=numpy.ndarray HY105".
Any added insight would be appreciated.
Here is the code:
import matplotlib.pyplot as plt
import numpy as np
import pandas
from pandas.tools.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import sqlalchemy
import pypyodbc
import pyodbc
# Load dataset
url = "AirPassengers.csv"
names = ['Serial Number', 'Time', 'Air Passengers']
dataset = pandas.read_csv(url, names=names)
print(dataset.describe())
print(dataset)
#Creating Validation Sets
array = dataset.values
X = array[1:, 0:4]
Y = array[1: ,2]
validation_size = 0.30
seed = 5
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size,
                                                                                 random_state=seed)
#Test Harness and Evaluation Metrics
seed = 5
scoring = 'accuracy'
#Building Models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                 cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
conn = pyodbc.connect(
    r'DRIVER={ODBC Driver 11 for SQL Server};'
    r'SERVER=Q3GN0570\MSSQLSERVER1;'
    r'DATABASE=AdventureWorksDW2014;'
    r'Trusted_Connection=yes;'
)
cursorexec = conn.cursor()
cursorexec.execute("INSERT INTO pythonTest(LR,LDA,KNN,CART,NB,SVM) VALUES (?,?,?,?,?,?)",
                   results)
cursorexec.commit()
conn.close()
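A sketch of one way to make the INSERT work, assuming the pythonTest table is meant to hold one summary score per model: pyodbc cannot bind numpy arrays or numpy scalars as parameters, and results above is a list of numpy arrays (one array of k-fold scores per model), which is what triggers HY105. Converting each model's scores to a plain Python float first, e.g. the mean, fixes the binding:

# One native Python float per column, in the same order the models were evaluated: LR, LDA, KNN, CART, NB, SVM.
mean_scores = [float(cv.mean()) for cv in results]

cursorexec.execute(
    "INSERT INTO pythonTest (LR, LDA, KNN, CART, NB, SVM) VALUES (?, ?, ?, ?, ?, ?)",
    mean_scores
)
conn.commit()
conn.close()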