I am using this dataset, Wealth based on age, and the documentation states that the accuracy should be around 84%. Unfortunately, the accuracy of my program is only about 25%.
To process the data I did the following:
1. Loaded the .txt data file and converted it to a .csv
2. Removed data with missing values
3. Extracted the class values <=50K and >50K and converted them to 0 and 1 respectively
4. For each attribute, mapped each string value of that attribute to an integer, e.g. att1{'cs':0, 'cs2':1}, att2{'usa':0, 'greece':1}, and so on
5. Called Naive Bayes on the new integer dataset
Python code:
import load_csv as load #my functions to do [1..5] of the list
import numpy as np
my_data = np.genfromtxt('out.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data)) #this function removes the rows with missing data
features_train = np.array(load.remove_field_num(data, len(data[0]) - 1)) #this function extracts the features, i.e. removes the class column at the end of the data
label_train = np.array(load.create_labels(data))
features_train = np.array(load.convert_to_int(features_train))
my_data = np.genfromtxt('test.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data))
features_test = np.array(load.remove_field_num(data, len(data[0]) - 1))
label_test = np.array(load.create_labels(data)) #extracts the labels from the .csv data file
features_test = np.array(load.convert_to_int(features_test)) #converts the strings to ints (each unique string of an attribute is assigned a unique integer value)
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, label_train)
predict = clf.predict(features_test)
score = accuracy_score(predict, label_test) #Low accuracy score
load_csv module:
import numpy as np
attributes = { 'Private':0, 'Self-emp-not-inc':1, 'Self-emp-inc':2, 'Federal-gov':3, 'Local-gov':4, 'State-gov':5, 'Without-pay':6, 'Never-worked':7,
'Bachelors':0, 'Some-college':1, '11th':2, 'HS-grad':3, 'Prof-school':4, 'Assoc-acdm':5, 'Assoc-voc':6, '9th':7, '7th-8th':8, '12th':9, 'Masters':10, '1st-4th':11, '10th':12, 'Doctorate':13, '5th-6th':14, 'Preschool':15,
'Married-civ-spouse':0, 'Divorced':1, 'Never-married':2, 'Separated':3, 'Widowed':4, 'Married-spouse-absent':5, 'Married-AF-spouse':6,
'Tech-support':0, 'Craft-repair':1, 'Other-service':2, 'Sales':3, 'Exec-managerial':4, 'Prof-specialty':5, 'Handlers-cleaners':6, 'Machine-op-inspct':7, 'Adm-clerical':8,
'Farming-fishing':9, 'Transport-moving':10, 'Priv-house-serv':11, 'Protective-serv':12, 'Armed-Forces':13,
'Wife':0, 'Own-child':1, 'Husband':2, 'Not-in-family':4, 'Other-relative':5, 'Unmarried':5,
'White':0, 'Asian-Pac-Islander':1, 'Amer-Indian-Eskimo':2, 'Other':3, 'Black':4,
'Female':0, 'Male':1,
'United-States':0, 'Cambodia':1, 'England':2, 'Puerto-Rico':3, 'Canada':4, 'Germany':5, 'Outlying-US(Guam-USVI-etc)':6, 'India':7, 'Japan':8, 'Greece':9, 'South':10, 'China':11, 'Cuba':12, 'Iran':13, 'Honduras':14, 'Philippines':15, 'Italy':16, 'Poland':17, 'Jamaica':18, 'Vietnam':19, 'Mexico':20, 'Portugal':21, 'Ireland':22, 'France':23, 'Dominican-Republic':24, 'Laos':25, 'Ecuador':26, 'Taiwan':27, 'Haiti':28, 'Columbia':29, 'Hungary':30, 'Guatemala':31, 'Nicaragua':32, 'Scotland':33, 'Thailand':34, 'Yugoslavia':35, 'El-Salvador':36, 'Trinadad&Tobago':37, 'Peru':38, 'Hong':39, 'Holand-Netherlands':40
}
def remove_field_num(a, i): #function to strip values
    names = list(a.dtype.names)
    new_names = names[:i] + names[i + 1:]
    b = a[new_names]
    return b
def remove_missing_values(data):
    temp = []
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] == '?': #If a missing value '?' is encountered do not append the line to temp
                break
            if j == (len(data[i]) - 1) and len(data[i]) == 15:
                temp.append(data[i]) #Append the lines that do not contain '?'
    return temp
def create_labels(data):
    temp = []
    for i in range(len(data)): #Iterate through the data
        j = len(data[i]) - 1 #Extract the labels
        if data[i][j] == '<=50K':
            temp.append(0)
        else:
            temp.append(1)
    return temp
def convert_to_int(data):
    my_lst = []
    for i in range(len(data)):
        lst = []
        for j in range(len(data[i])):
            key = data[i][j]
            if j in (1, 3, 5, 6, 7, 8, 9, 13, 14):
                lst.append(int(attributes[key]))
            else:
                lst.append(int(key))
        my_lst.append(lst)
    temp = np.array(my_lst)
    return temp
I have tried both the decision tree and Naive Bayes, but the accuracy is very low. Any suggestions on what I am missing?
I guess the problem is in the preprocessing. It is better to encode the categorical variables as one-hot vectors (vectors that are all zeros except for a single one in the position corresponding to that category's value) instead of raw integers. Scikit-learn's DictVectorizer can help you with that, and the preprocessing itself is much easier with the pandas library.
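For illustration, here is a minimal sketch of what DictVectorizer does with a couple of hand-made rows (the values below are hypothetical; get_feature_names_out needs a recent scikit-learn, older versions call it get_feature_names):
from sklearn.feature_extraction import DictVectorizer

rows = [{'workclass': 'Private', 'sex': 'Male'},
        {'workclass': 'State-gov', 'sex': 'Female'}]
vec = DictVectorizer(sparse=False)
print(vec.fit_transform(rows))
# [[0. 1. 1. 0.]
#  [1. 0. 0. 1.]]
print(vec.get_feature_names_out())
# ['sex=Female' 'sex=Male' 'workclass=Private' 'workclass=State-gov']
Each (feature, category) pair becomes its own 0/1 column, so no artificial ordering is imposed on the categories.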
The following shows how easily you can achieve that with the help of the pandas library; it works very well alongside scikit-learn. This achieves an accuracy of about 81.6% on a test set that is 20% of the entire data.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
# Read the data into a pandas dataframe
df = pd.read_csv('adult.data.csv')
# Columns names
cols = np.array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'target'])
# numeric columns
numeric_cols = ['age', 'fnlwgt', 'education-num',
'capital-gain', 'capital-loss', 'hours-per-week']
# assign names to the columns in the dataframe
df.columns = cols
# replace the target variable with 0 and 1 for <=50K and >50K
df1 = df.copy()
df1.loc[df1['target'] == ' <=50K', 'target'] = 0
df1.loc[df1['target'] == ' >50K', 'target'] = 1
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(
df1.drop('target', axis=1), df1['target'], test_size=0.2)
# numeric attributes
x_num_train = X_train[numeric_cols].to_numpy()
x_num_test = X_test[numeric_cols].to_numpy()
# scale to [0, 1]
max_train = np.amax(x_num_train, 0)
x_num_train = x_num_train / max_train
x_num_test = x_num_test / max_train  # scale the test set by max_train as well
# labels or target attribute
y_train = y_train.astype(int)
y_test = y_test.astype(int)
# categorical attributes
cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)
x_cat_train = cat_train.to_dict(orient='records')
x_cat_test = cat_test.to_dict(orient='records')
# vectorize (encode as one hot)
vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)
# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))
clf = LogisticRegression().fit(x_train, y_train.values)
pred = clf.predict(x_test)
print(classification_report(y_test.values, pred, digits=4))
print(accuracy_score(y_test.values, pred))
clf = DecisionTreeClassifier().fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test.values, pred, digits=4))
print(accuracy_score(y_test.values, pred))
clf = GaussianNB().fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test.values, pred, digits=4))
print(accuracy_score(y_test.values, pred))
I have a pretty basic question. My X data is df['input'], Y data is df['label']. This is my code:
from sklearn.feature_extraction.text import TfidfVectorizer
Xfeatures = df['input']
y = df['label']
tfidf_vec = TfidfVectorizer(max_features=MF, max_df=MAXDF)
X = tfidf_vec.fit_transform(Xfeatures)
featurenames = tfidf_vec.get_feature_names()
X.todense()
df_vec = pd.DataFrame(X.todense(),columns=tfidf_vec.get_feature_names())
df_vec.T
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y,test_size = 0.33,random_state = 28)
This is the model that I run for text classification:
from sklearn.svm import SVC
lr_model = SVC()
lr_model.fit(x_train,y_train)
y_pred = lr_model.predict(x_test)
# Accuracy
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
I would like to identify those that are misclassified (i.e. the rows of df['input']). I can write the predicted and actual categories to a CSV, but not the text that was misclassified (or the training data in general):
import csv
rows = zip(y_test, y_pred)
with open(r"C:\Users\erdem\Desktop\data.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)
Try to go with
X[y_test != y_pred]
y_test != y_pred will be a boolean array with True on misclassified data (and False on correct predictions): you can use it as an index into your X (or Xfeatures).
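If you also want the original text back, one minimal sketch (reusing the names from the question) is to pass the raw inputs through train_test_split as well, so the test rows stay aligned with the predictions:
from sklearn.model_selection import train_test_split

# split the raw text alongside the TF-IDF matrix so the indices stay aligned
x_train, x_test, y_train, y_test, text_train, text_test = train_test_split(
    X, y, Xfeatures, test_size=0.33, random_state=28)
lr_model.fit(x_train, y_train)
y_pred = lr_model.predict(x_test)
misclassified_text = text_test[y_test != y_pred]  # the raw df['input'] rows that were misclassified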
I have developed several different datasets and I want to write a for loop to train on each of them and, at the end, have the RMSE for each dataset. I tried a for loop, but it does not work: it gives back the same value for each dataset, while I know they should be different. The code that I have written is below:
for i in NEW_middle_index:
    DF = df1.iloc[i-100:i+100, :]
    # Append an empty sublist inside the list
    FINAL_DF.append(DF)
    y = DF.iloc[:, 3]
    X = DF.drop(columns='Target')
    index_train = int(0.7 * len(X))
    X_train = X[:index_train]
    y_train = y[:index_train]
    X_test = X[index_train:]
    y_test = y[index_train:]
    scaler_x = MinMaxScaler().fit(X_train)
    X_train = scaler_x.transform(X_train)
    X_test = scaler_x.transform(X_test)

xgb_r = xg.XGBRegressor(objective='reg:linear',
                        n_estimators=20, seed=123)

for i in range(len(NEW_middle_index)):
    # print(i)
    # Fitting the model
    xgb_r.fit(X_train, y_train)
    # Predict the model
    pred = xgb_r.predict(X_test)
    # RMSE Computation
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    # print(rmse)
    RMSE.append(rmse)
Not sure if you indented it correctly, but you are overwriting X_train and X_test, so when you fit your model it is always on the same dataset; hence you get the same results.
One option is to fit the model as soon as you create each train/test dataframe (a sketch of that is shown after the RMSE output below). Alternatively, if you want to keep the train/test sets around, you could store them in a list of dictionaries, without changing too much of your code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import xgboost as xg
df1 = pd.DataFrame(np.random.normal(0,1,(600,3)))
df1['Target'] = np.random.uniform(0,1,600)
NEW_middle_index = [100,300,500]
NEWDF = []
for i in NEW_middle_index:
    y = df1.iloc[i-100:i+100, 3]
    X = df1.iloc[i-100:i+100, :].drop(columns='Target')
    index_train = int(0.7 * len(X))
    scaler_x = MinMaxScaler().fit(X)
    X_train = scaler_x.transform(X[:index_train])
    y_train = y[:index_train]
    X_test = scaler_x.transform(X[index_train:])
    y_test = y[index_train:]
    NEWDF.append({'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test})
Then we fit and calculate RMSE:
RMSE = []
xgb_r = xg.XGBRegressor(objective ='reg:linear',n_estimators = 20, seed = 123)
for i in range(len(NEW_middle_index)):
    xgb_r.fit(NEWDF[i]['X_train'], NEWDF[i]['y_train'])
    pred = xgb_r.predict(NEWDF[i]['X_test'])
    rmse = np.sqrt(mean_squared_error(NEWDF[i]['y_test'], pred))
    RMSE.append(rmse)
RMSE
[0.3524827559800294, 0.3098101362502435, 0.3843173269966071]
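For completeness, the first option mentioned above (fitting inside the same loop that builds each split) would look roughly like this; it is a minimal sketch that reuses the question's variables (df1, NEW_middle_index) and the imports above, not a drop-in replacement:
RMSE = []
xgb_r = xg.XGBRegressor(objective='reg:linear', n_estimators=20, seed=123)
for i in NEW_middle_index:
    DF = df1.iloc[i-100:i+100, :]
    y = DF.iloc[:, 3]
    X = DF.drop(columns='Target')
    index_train = int(0.7 * len(X))
    scaler_x = MinMaxScaler().fit(X[:index_train])   # fit the scaler on the train part only
    X_train = scaler_x.transform(X[:index_train])
    X_test = scaler_x.transform(X[index_train:])
    y_train, y_test = y[:index_train], y[index_train:]
    xgb_r.fit(X_train, y_train)                      # fit on this dataset's own split
    pred = xgb_r.predict(X_test)
    RMSE.append(np.sqrt(mean_squared_error(y_test, pred)))
print(RMSE)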
I made an ordinal regression model (first time performing regression, so be merciful) and now I need to evaluate it. What would be the best way? (I use the mord API for the ordinal regression.)
These are the tasks I am trying to complete:
3) Build a regression model that will predict the rating score of each
product based on attributes which correspond to some very common words
used in the reviews (selection of how many words is left to you as a
decision). So, for each product you will have a long(ish) vector of
attributes based on how many times each word appears in reviews of
this product. Your target variable is the rating. You will be judged
on the process of building the model (regularization, subset
selection, validation set, etc.) and not so much on the accuracy of
the results.
4) Having the vectors from Question 3, perform
dimensionality reduction (either PCA or NMF). Can you conclude how
many components you can keep? Experiment with this parameter and
justify your final conclusion.
This is my code for it:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import textblob
import nltk
from pandas import ExcelWriter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from textblob import Word
from collections import Counter
import seaborn as sns
import mord as m
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline
df = # import dataframe from link
#Clean up Rating (while hand-cleaning I saw values outside of the [0,5] range, which need to be corrected; this could have been spotted by plotting a histogram, but since I noticed it while going through the data, plotting felt like an unnecessary step)
df.loc[df.Rating > 5, 'Rating'] = np.NaN
df.loc[df.Rating < 1, 'Rating'] = np.NaN
# Convert weights to same measure (pounds). Most of the weights I inspected seem wrong...
for i in range(0, df.weight.size-1):
    cell = df.weight[i]
    while (cell == 0 and i < df.weight.size-1):
        i += 1
        cell = df.weight[i]
    if not(isinstance(cell, float)) and not(isinstance(cell, int)):
        number = ''.join([x for x in cell if (x.isdigit() or x=='.')])
        num = float(number)
        if bool(re.search('ounces', cell)):
            df.loc[i, 'weight'] = num * 0.0625 # Ounces to pounds conversion
        else:
            df.loc[i, 'weight'] = num # Keep only the number (without the measure type)
df.loc[:, "Review"] = df["Title"] + str(' - ') + df["Text"]
df.drop('Title', axis=1, inplace=True)
df.drop('Text', axis=1, inplace=True)
df.columns = ['Brand', 'Name', 'NumsHelpful', 'Rating', 'Weight(Pounds)', 'Review']
df['Weight(Pounds)'] = pd.to_numeric(df['Weight(Pounds)'], errors='coerce')
df['Brand'] = df['Brand'].astype(str)
df['Review'] = df['Review'].astype(str)
df['Name'] = df['Name'].astype(str)
d = {'Brand':'first',
'NumsHelpful':'mean',
'Rating':'mean',
'Weight(Pounds)':'first',
'Review':'/'.join,
}
df = df.groupby('Name').agg(d).reset_index()
df.Rating = df.Rating.round()
df.NumsHelpful = df.NumsHelpful.round()
df['Review2'] = df['Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['Review2'] = df['Review2'].str.replace('[^\w\s]','')
stop = stopwords.words('english')
df['Review2'] = df['Review2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
freq = pd.Series(' '.join(df['Review2']).split()).value_counts()[:20]
common = ['wine', 'mix', 'taste', 'drink', 'one', 'price', 'product', 'flavour', 'would', 'bitters', 'bottle', 'buy','really', 'make']
df['Review2'] = df['Review2'].apply(lambda x: " ".join(x for x in x.split() if x not in common))
freq = pd.Series(' '.join(df['Review2']).split()).value_counts()[-10:]
freq = list(freq.index)
df['Review2'] = df['Review2'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['words'] = df.Review2.str.strip().str.split('[\W_]+')
df['Review2'] = df['words'].apply(lambda x: " ".join([Word(word).lemmatize('v') for word in x]))
df['Review2'].str.split(expand=True).stack().value_counts()
# Create word matrix
bow = df.Review2.str.split().apply(pd.Series.value_counts)
rating = df['Rating']
df_rating = pd.DataFrame([rating])
df_rating = df_rating.transpose()
bow = bow.join(df_rating)
# Remove some columns and rows
bow = bow.loc[(bow['Rating'].notna()), ~(bow.sum(0) < 80)]
# Divide into train - validation - test
bow.fillna(0, inplace=True)
rating = bow['Rating']
bow = bow.drop('Rating', 1)
x_train, x_test, y_train, y_test = train_test_split(bow, rating, test_size=0.4, random_state=0)
# Run regression
regr = m.OrdinalRidge()
regr.fit(x_train, y_train)
scores = cross_val_score(regr, bow, rating, cv=5, scoring='accuracy')
# scores -> array([0.75438596, 0.73684211, 0.66071429, 0.53571429, 0.60714286])
# avg_score -> Accuracy: 0.66 (+/- 0.16)
# Do PCA (dimensionality reduction)
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(x_train)
# Apply transform to both the training set and the test set.
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Make an instance of the Model
pca = PCA(.95)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
regr.fit(x_train, y_train)
scores = cross_val_score(regr, bow, rating, cv=10, scoring='accuracy')
What are your thoughts on the above code?
Any insight is greatly appreciated!
EDIT:
This is a link to the dataset
This is a link to a google.doc containing the source code (Python)
# creates the k-fold analysis
import numpy as np
import pickle
from csv import reader
from os import listdir
from os.path import isfile, join
from scipy.stats import itemfreq
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from astropy.extern.ply.cpp import xrange
seed = 0 # number of cough classes = 2; (DISEASE/NORMAL) or (COPD/CHF)
np.random.seed(seed) # generates random numbers
X_train = [] # creates training set using .csv file
Y_train = [] #creates training set using patients
X_test = [] # creates testing set using .csv file
Y_test = [] # creates testing set using patients
Z = [] # splits data
label = [] #labels split data
eps=1e-7
set_probs = [] #predicts probability
i = 0  # used to go through all patients
correct = 0
DISEASE = 1
NORMAL = 1
for i in xrange(1, 10):  # goes through all 9 patients
    Z.append(DISEASE)
    DISEASE = DISEASE + 1
    label.append(1)  # labels data as 1, if = DISEASE
for i in xrange(1, 10):
    Z.append(NORMAL)
    NORMAL = NORMAL + 1
    label.append(2)  # labels data as 2, if = NORMAL
add = 0
add1 = 0
add2 = 0
print(len(Z))
kf = KFold(n_splits = 10, shuffle = True)
for train, test in kf.split(Z):
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    set_probs = []
    # Z_train - creates training set from split data
    # Z_test - creates testing set from split data
    # label_train - labels Z_train data
    # label_test - labels Z_test data
    # This is where I am getting the error
    Z_train, Z_test, label_train, label_test = Z[train], Z[test], label[train], label[test]
    # training set
    for z in xrange(0, len(Z_train)):
        if label_train[z] == 1:  # if predicted 1 = DISEASE
            mypath = '~/Users/awindmon/Documents/DISEASE_Example/'
        if label_train[z] == 2:  # if predicted 2 = NORMAL
            mypath = '~/Users/awindmon/Documents/NORMAL_Example/'
    # testing set
    for z in xrange(0, len(Z_test)):
        if label_test[z] == 1:
            mypath = '~/Users/awindmon/Documents/DISEASE_Example/'
        if label_test[z] == 2:
            mypath = '~/Users/awindmon/Documents/NORMAL_Example/'
    clf = SVC(kernel='linear', random_state=0, gamma=1, C=1, probability=True)
    clf.fit(X_train, Y_train)
    filename = 'LinearSVM_Model.sav'
    pickle.dump(clf, open(filename, 'wb'))
    count = 0
    probability_list = clf.predict_proba(X_test)
    p0 = 0
    p1 = 0
    p2 = 0
    p3 = 0
    p4 = 0
    p5 = 0
    p6 = 0
    for l in range(0, len(probability_list)):
        if (l != 0) and (l % 3 == 0):
            set_probs.append([p0, p1, p2, p3, p4, p5, p6])
            p0 = 0
            p1 = 0
            p2 = 0
            p3 = 0
            p4 = 0
            p5 = 0
            p6 = 0
        p0 = p0 + probability_list[l][0]
        p1 = p1 + probability_list[l][1]
        p2 = p2 + probability_list[l][2]
        p3 = p3 + probability_list[l][3]
        p4 = p4 + probability_list[l][4]
        p5 = p5 + probability_list[l][5]
        p6 = p6 + probability_list[l][6]
        if (l == len(probability_list) - 1):
            set_probs.append([p0, p1, p2, p3, p4, p5, p6])
            p0 = 0
            p1 = 0
            p2 = 0
            p3 = 0
            p4 = 0
            p5 = 0
            p6 = 0
    print(set_probs, Y_test)
    add1 = add1 + clf.score(X_test, Y_test)
print(add1 / 10)
I am new to Python and I have developed this code to do k-fold cross validation for a machine learning problem. On the line marked in the code above, I am attempting to divide and label my training and testing data, but I keep getting this error: TypeError: only integer scalar arrays can be converted to a scalar index.
The values train and test are numpy arrays of indices. You probably want to do something like this:
from sklearn.model_selection import KFold
import numpy as np
kf = KFold(n_splits = 10, shuffle = True)
Z = [i+1 for i in range(10)]
for train, test in kf.split(Z):
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    set_probs = []
    Z_train = np.array([Z[i] for i in train])
    label_train = np.array([label[i] for i in train])
    Z_test = Z[test[0]]
    label_test = label[test[0]]
The split gives you the indexes you are picking, so you can use those indexes to get the values or the labels in a similar manner.
This goes beyond the scope of this question, but how you are using Z_train doesn't make sense to me, because you are only using its length rather than its values. Perhaps you meant for z in Z_train:
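Another minimal sketch, assuming Z and label are the plain Python lists built in the question: convert them to numpy arrays once, and the KFold index arrays can then be used for fancy indexing directly:
import numpy as np

Z = np.array(Z)          # assumes Z and label are the lists from the question
label = np.array(label)
for train, test in kf.split(Z):
    Z_train, Z_test = Z[train], Z[test]
    label_train, label_test = label[train], label[test]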
Below is what I have done so far.
#importing the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
filepath = r"C:\Users...Kaggle data\house prediction iowa\house_predtrain (3).csv"
train = pd.read_csv(filepath)
print(train.shape)
filepath2 = r"C:\Users...Kaggle data\house prediction iowa\house_predtest (1).csv"
test = pd.read_csv (filepath2)
print(test.shape)
#first we replace all the NaNs with 0 in both the train and test data
train = train.fillna(0)
test = test.fillna(0) #error one
train.dtypes.value_counts()
#isolating all the object/categorical features and converting them to numeric features
encode_cols = train.dtypes[train.dtypes == np.object]
encode_cols2 = test.dtypes[test.dtypes == np.object]
#print(encode_cols)
encode_cols = encode_cols.index.tolist()
encode_cols2 = encode_cols2.index.tolist()
print(encode_cols2)
# Do the one hot encoding
train_dummies = pd.get_dummies(train, columns=encode_cols)
test_dummies = pd.get_dummies(test, columns=encode_cols2)
#align your test and train data (error2)
train, test = train_dummies.align(test_dummies, join = 'left', axis = 1)
print(train.shape)
print(test.shape)
#Now working with Floats features
numericals_floats = train.dtypes == np.float
numericals = train.columns[numericals_floats]
print(numericals)
#we check for skewness in the float data
skew_limit = 0.35
skew_vals = train[numericals].skew()
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0: 'Skewness'}))
skew_cols
#Visualising them above data before and after log transforming
%matplotlib inline
field = 'GarageYrBlt'
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10,5))
train[field].hist(ax=ax_before)
train[field].apply(np.log1p).hist(ax=ax_after)
ax_before.set (title = 'Before np.log1p', ylabel = 'frequency', xlabel = 'Value')
ax_after.set (title = 'After np.log1p', ylabel = 'frequency', xlabel = 'Value')
fig.suptitle('Field: "{}"'.format (field));
#note how applying log transformation on GarageYrBuilt does not do much
print(skew_cols.index.tolist()) #returns a list of the values
for i in skew_cols.index.tolist():
    if i == "SalePrice": #we do not want to transform the feature to be predicted
        continue
    train[i] = train[i].apply(np.log1p)
    test[i] = test[i].apply(np.log1p)
feature_cols = [x for x in train.columns if x != ('SalePrice')]
X_train = train[feature_cols]
y_train = train['SalePrice']
X_test = test[feature_cols]
y_test = train['SalePrice']
print(X_test.shape)
print(y_train.shape)
print(X_train.shape)
#now to the most fun part. Feature engineering is over!!!
#i am going to use linear regression, L1 regularization, L2 regularization and ElasticNet(blend of L1 and L2)
#first up, Linear Regression
alphas = [0.00005, 0.0005, 0.005, 0.05, 0.5, 0.1, 0.3, 1, 3, 5, 10, 25, 50, 100] #I chose these
l1_ratios = np.linspace(0.1, 0.9, 9)
#LinearRegression
linearRegression = LinearRegression().fit(X_train, y_train)
prediction1 = linearRegression.predict(X_test)
LR_score = linearRegression.score(X_train, y_train)
print(LR_score)
#ridge
ridgeCV = RidgeCV(alphas=alphas).fit(X_train, y_train)
prediction2 = ridgeCV.predict(X_test)
R_score = ridgeCV.score(X_train, y_train)
print(R_score)
#lasso
lassoCV = LassoCV(alphas=alphas, max_iter=1e2).fit(X_train, y_train)
prediction3 = lassoCV.predict(X_test)
L_score = lassoCV.score(X_train, y_train)
print(L_score)
#elasticNetCV
elasticnetCV = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, max_iter=1e2).fit(X_train, y_train)
prediction4 = elasticnetCV.predict(X_test)
EN_score = elasticnetCV.score(X_train, y_train)
print(EN_score)
from sklearn.ensemble import RandomForestRegressor
randfr = RandomForestRegressor()
randfr = randfr.fit(X_train, y_train)
prediction5 = randfr.predict(X_test)
print(prediction5.shape)
RF_score = randfr.score(X_train, y_train)
print(RF_score)
#putting it all together
rmse_vals = [LR_score, R_score, L_score, EN_score, RF_score]
labels = ['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'RandomForest']
rmse_df = pd.Series(rmse_vals, index=labels).to_frame()
rmse_df.rename(columns={0: 'SCORES'}, inplace=1)
rmse_df
KaggleHouse_submission_1 = pd.DataFrame({'Id': test.Id, 'SalePrice': prediction5})
KaggleHouse_submission_1 = KaggleHouse_submission_1
print(KaggleHouse_submission_1.shape)
In the Kaggle house prediction there is a train dataset and a test dataset; here is the link to the actual data: link. The output dataframe should be 1459 x 2, but mine is 1460 x 2 for some reason. I am not sure why this is happening. Any feedback is highly appreciated.
In the following line:
test = train.fillna(0)
you are assigning (overwriting) the test variable with the train data, which is why your output has the train set's 1460 rows instead of 1459.
Scikit-learn is also very sensitive to the ordering of columns, so if your train data set and test data set become misaligned you may have a problem similar to the one above. You therefore need to ensure that the test data is encoded the same way as the train data by using the following align command:
train, test = train_dummies.align(test_dummies, join='left', axis = 1)
See the changes in my code above.
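For illustration, here is a small sketch of what align does with mismatched dummy columns (the frames below are made up for the example):
import pandas as pd

a = pd.DataFrame({'x_A': [1, 0], 'x_B': [0, 1]})  # hypothetical train dummies
b = pd.DataFrame({'x_A': [1], 'x_C': [0]})        # hypothetical test dummies
a2, b2 = a.align(b, join='left', axis=1)
print(b2.columns.tolist())  # ['x_A', 'x_B'] -- test now has exactly the train columns, in the same order
print(b2)                   # x_B is NaN because the test set never saw that category, so fill it with 0 afterwards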