I have typed in the following lines of code:
# import relevant statistical packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import sklearn.linear_model as skl
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
# import data
url = "/<...>/Smarket.csv" # relative url within my computer
Smarket = pd.read_csv(url, index_col = 'SlNo')
X3 = Smarket[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
Y3 = Smarket['Direction']
X_train, X_test, y_train, y_test = train_test_split(X3, Y3, test_size=0.2016)
data_1 = pd.concat([pd.DataFrame(y_train), X_train], axis = 1)
model_1 = sm.formula.glm(formula = 'y_train~X_train', data = data_1, family= sm.families.Binomial()).fit()
X_new = model_1.predict(X_test)
Now it is on the last line of code that I receive the following error:
PatsyError: Number of rows mismatch between data argument and X_train (252 versus 998)
y_train~X_train
^^^^^^^
I am just unable to understand why I am getting this error. I gather it might be because of a mismatch in the number of rows between X_test and X_train. How do I need to change my code to get the predicted values?
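For what it's worth, patsy resolves the names in 'y_train~X_train' from the calling environment and re-evaluates them at predict time, which is where the 252-versus-998 row mismatch comes from. A minimal sketch of one way around it, assuming Direction holds 'Up'/'Down' labels as in the ISLR Smarket data: put the response into the training frame and reference the actual column names in the formula, so predict can evaluate them against X_test.

# name real columns in the formula instead of whole train-set variables
data_1 = pd.concat([y_train, X_train], axis=1)
data_1['Direction'] = (data_1['Direction'] == 'Up').astype(int)  # assumes 'Up'/'Down' labels
formula = 'Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume'
model_1 = sm.formula.glm(formula=formula, data=data_1,
                         family=sm.families.Binomial()).fit()
X_new = model_1.predict(X_test)  # X_test supplies the Lag*/Volume columns the formula needs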
Following is my code. The error seems to be in the qsvc.fit() line, but I can't understand why. One of the error lines says "TypeError: Invalid parameter values, expected Sequence[Sequence[float]]." I'm pretty sure I have passed arrays as parameters to the fit function, but do they need to be of float type, given that labels are generally strings? Sorry, this is my first time trying this, so these questions may seem naive.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from qiskit import Aer
from qiskit.circuit.library import ZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit.algorithms.state_fidelities import ComputeUncompute
from qiskit.primitives import Sampler
from qiskit.utils import QuantumInstance
from qiskit_machine_learning.algorithms import PegasosQSVC
data=pd.read_csv('train.csv')
X = data.loc[1:1000,["marital","balance","loan"]].values
Y = data.loc[1:1000, data.columns[-1]].values # same 1000 rows as X
x_train, x_test, y_train, y_test = train_test_split(X, Y)
data_feature_map = ZFeatureMap(feature_dimension=3, reps=1 )
sampler = Sampler()
fidelity = ComputeUncompute(sampler=sampler)
data_kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=data_feature_map)
pegasos_qsvc = PegasosQSVC(quantum_kernel=data_kernel, C=1000, num_steps=100)
pegasos_qsvc.fit(x_train, y_train)
qsvc_score = pegasos_qsvc.score(x_test, y_test)
print(f"QSVC classification test score: {qsvc_score}")
The values passed to fit need to be numeric. You can use integer codes such as 0, 1 and 2 to represent the string categories in "marital", "balance" and "loan". sklearn has a LabelEncoder to help with such a conversion.
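A minimal sketch of that conversion, assuming "marital" and "loan" hold string categories and the last column is the label:

from sklearn.preprocessing import LabelEncoder

X_df = data.loc[1:1000, ["marital", "balance", "loan"]].copy()
for col in ["marital", "loan"]:                         # assumed string-valued columns
    X_df[col] = LabelEncoder().fit_transform(X_df[col])
X = X_df.values.astype(float)                           # Sequence[Sequence[float]], as the error expects
y = LabelEncoder().fit_transform(data.iloc[:1000, -1])  # integer class labels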
I made this program for a school project. It works fine, but my data should be in the form of dates, and every time I insert dates as variables it just prompts me with an error saying (can't float string "2022-05-16").
Thanks in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
dataset = pd.read_csv('/content/Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
dataset
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)"""
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
To get the datetime column as a datetime-dtype rather than a string, you could use the parse_dates argument in pandas.read_csv:
dataset = pd.read_csv('/content/Position_Salaries.csv', parse_dates=...)
Or you could convert the datetime column to a datetime data type later using pandas.to_datetime:
dataset[date_col] = pd.to_datetime(dataset[date_col])
Afterwards, you might want to extract date components using the .dt accessor methods.
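For example, with a hypothetical date column named 'Date':

dataset = pd.read_csv('/content/Position_Salaries.csv', parse_dates=['Date'])  # 'Date' is hypothetical
dataset['Year'] = dataset['Date'].dt.year                             # numeric component for a model
dataset['DateOrdinal'] = dataset['Date'].map(pd.Timestamp.toordinal)  # whole date as one number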
I wanted to know if SMOTE is required to be used only after splitting the test and train datasets. I used SMOTE after train_test_split for churn prediction, but haven't got any significant improvement pre- or post-SMOTE. Not sure where the issue is; I wanted to know whether I used SMOTE properly. Below is my entire code using SMOTE.
import pandas as pd
import numpy as np
from datetime import timedelta,datetime,date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from numpy import percentile
tel_data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
tel_data.info()
tel_data.isnull().sum()
num = {"No":0,"Yes":1}
tel_data = tel_data.replace({"Churn":num})
# also TotalCharges seems to be object dtype; converting to numeric (errors='coerce' turns blanks into NaN)
tel_data['TotalCharges'] = pd.to_numeric(tel_data['TotalCharges'], errors='coerce')
tel_data.head(2)
tel_data['Churn'].value_counts()
plt.figure(figsize=(6,5))
sns.countplot(tel_data['Churn'])
plt.show()
# using pd.to_numeric to convert the TotalCharges column to numeric will help us see the null values
tel_data.TotalCharges = pd.to_numeric(tel_data.TotalCharges, errors="coerce")
tel_data.isnull().sum()
# deleting the rows with null values
tel_data = tel_data.dropna(axis=0)
# encoding all categorical variables using one hot encoding
tel_data = pd.get_dummies(tel_data,drop_first=True,columns=['gender','Partner','Dependents',
'PhoneService','MultipleLines','InternetService',
'OnlineSecurity','OnlineBackup','DeviceProtection',
'TechSupport','StreamingTV','StreamingMovies',
'Contract','PaperlessBilling','PaymentMethod'])
# splitting the dataset (removing 'customerID' since it doesn't serve any purpose)
X = tel_data.drop(['customerID','Churn'],axis=1)
y = tel_data['Churn']
# performing feature selection using chi2 test
from sklearn.feature_selection import chi2
chi_scores = chi2(X,y)
print('chi_values:',chi_scores[0],'\n')
print('p_values:',chi_scores[1])
p_values = pd.Series(chi_scores[1],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
plt.figure(figsize=(12,8))
p_values.plot.bar()
plt.show()
tel_data.drop(['PhoneService_Yes','gender_Male','MultipleLines_No phone service','MultipleLines_Yes','customerID'],axis=1,inplace=True)
tel_data.head(2)
# splitting the dataset again ('customerID' has already been dropped above)
X = tel_data.drop(['Churn'],axis=1)
y = tel_data['Churn']
# import sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
# splitting into train and test data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)
model_xgb_1 = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod = model_xgb_1.fit(X_train,y_train)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod.score(X_train, y_train)))
y_xgb_pred = xgbmod.predict(X_test)
print(classification_report(y_test, y_xgb_pred))
from imblearn.over_sampling import SMOTE
smote_preprocess = SMOTE(random_state=42)
X_train_resampled,y_train_resampled = smote_preprocess.fit_resample(X_train,y_train)
model_xgb_smote = xgb.XGBClassifier(n_estimators=100,
learning_rate=0.3,
max_depth=5,
random_state=42 )
xgbmod_smote = model_xgb_smote.fit(X_train_resampled,y_train_resampled)
# checking accuracy of training data
print('Accuracy of XGB classifier on training set: {:.2f}'
.format(xgbmod_smote.score(X_train_resampled,y_train_resampled)))
y_xgb_pred_smote = xgbmod_smote.predict(X_test)
print(classification_report(y_test, y_xgb_pred_smote))
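For reference, resampling only the training split (as above) is the correct placement for SMOTE. A minimal sketch of the same idea wrapped in an imblearn Pipeline, so the resampling also stays inside the training folds during cross-validation (variable names as above):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# SMOTE runs only on the training portion of each fold; scoring uses untouched data
pipe = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, learning_rate=0.3,
                              max_depth=5, random_state=42)),
])
scores = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=5)
print('CV f1 with SMOTE in-pipeline:', scores.mean())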
I'm currently trying the following concept:
I applied np.log1p() to the independent variables and dependent variable (price)
Assuming X = independent variables and Y = dependent variable, I train_test_split X & Y
Then I trained the LinearRegression(), Ridge(), Lasso(), and ElasticNet() models
Given that the labels I used to train the model were also log1p(Y), I'm assuming the model predictions are also log values?
If the predictions are log values, how come np.expm1 doesn't return a value that is on a similar scale?
Linear Regression Code for reference
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy.stats import skew
from scipy import stats
from scipy.stats import norm
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
df_num = pd.DataFrame(np.random.randint(0,100,size=(10000, 4)), columns=list('ABCD'))
df_cat = pd.DataFrame(np.random.randint(0,2,size=(10000, 2)), columns=['cat1', 'cat2'])
price = pd.DataFrame(np.random.randint(0,100,size=(10000, 1)), columns=['price'])
y = price
skewness = df_num.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 0.5]
skewed_features = skewness.index
df_num[skewed_features] = np.log1p(df_num[skewed_features])
y = np.log1p(y)
train = pd.concat([df_num, df_cat], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.3, random_state = 0)
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
def predict_price(A, B, C, D, cat1):
    cat1_index = np.where(train.columns == cat1)[0][0]
    x = np.zeros(len(train.columns))
    x[0] = np.log1p(A)
    x[1] = np.log1p(B)
    x[2] = np.log1p(C)
    x[3] = np.log1p(D)
    if cat1_index >= 0:
        x[cat1_index] = 1
    return np.expm1(lr_clf.predict([x])[0])
predict_price(20, 30, 15, 55, 'cat2')
EDIT1: I tried to recreate an example from scratch, but I can't seem to replicate the issue I'm running into. The issue with my real data is that:
predictions work totally fine if I DON'T log-normalize inputs when training and DON'T log-normalize inputs when predicting.
HOWEVER, when I DO log-normalize when training, log-normalize the inputs when predicting, and np.expm1 the prediction, the value is totally off.
Please let me know if there is anything I can explain more clearly.
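For reference, a self-contained sketch of the round trip on synthetic data (nothing here comes from the real dataset): when the identical log1p transform is applied to the inputs at predict time, np.expm1 does land back on the original scale.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.uniform(1, 100, size=(1000, 2))
# target constructed to be linear in log1p(X), so the fit is near-exact
log_y = 1.5 * np.log1p(X[:, 0]) + 0.5 * np.log1p(X[:, 1]) + rng.normal(0, 0.01, 1000)
y = np.expm1(log_y)

model = LinearRegression().fit(np.log1p(X), np.log1p(y))

x_new = np.array([[20.0, 30.0]])
pred = np.expm1(model.predict(np.log1p(x_new))[0])  # back-transform the log-space prediction
true = np.expm1(1.5 * np.log1p(20.0) + 0.5 * np.log1p(30.0))
print(pred, true)  # the two agree closely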
I want to create some random data and try to improve my model with PolynomialFeatures; however, I'm running into a little trouble doing so.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import random
import pandas as pd
import numpy as np
import statsmodels.api as sm
#create some artificial data
x=np.linspace(-1,1,1000)
x=pd.array(random.choices(x,k=1000))
y=x**2+np.random.randn(1000)
#divide sample
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)
# define data frame for further use with PolynomialFeatures
df=pd.DataFrame([x_train,x_test])
df=df.transpose()
data = df
# perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=2)
data = trans.fit_transform(data)
model = sm.OLS(y_train,data).fit()
And then I get the error: ValueError: unrecognized data structures: <class 'pandas.core.arrays.numpy_.PandasArray'> / <class 'numpy.ndarray'>
Do you have any ideas about what should be done to make my regression work properly?
Use the to_numpy() method to convert the pandas array to a NumPy array:
model = sm.OLS(y_train.to_numpy(),data).fit()
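Note also that pd.DataFrame([x_train, x_test]).transpose() lines the train and test halves up as two side-by-side columns, so the OLS design mixes them. A sketch of fitting on the training half alone (same variable names as above):

# fit the transform on the training half only, reuse it for the test half
x_train_np = np.asarray(x_train, dtype=float).reshape(-1, 1)
trans = PolynomialFeatures(degree=2)
data_train = trans.fit_transform(x_train_np)                  # columns [1, x, x**2]
model = sm.OLS(np.asarray(y_train, dtype=float), data_train).fit()

data_test = trans.transform(np.asarray(x_test, dtype=float).reshape(-1, 1))
y_pred = model.predict(data_test)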