select subset for regression

select subset for regression - python

I have the following code that I want to use. Column 0 is the year (1950-2020) and the remaining columns are the months. I only want to use the data from 1979-2020 in my linear regression model.
Can you help me? I am quite a beginner with Python. Below is my code:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Estimator for the regression; it is only created here, not fitted.
clf = LinearRegression()
# Load the table: a 'Year' column followed by one column per month.
data1 = pd.read_csv (r'C:\Users\User-PC\sample.csv')
# Build the working frame from data1, the frame read above
# (the name `data` is never defined in this script).
x1 = pd.DataFrame(data1,columns=['Year','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
#data2 = pd.read_csv (r'C:\Users\User-PC\sample2.csv', parse_dates=[0], index_col=0)
#x2 = pd.DataFrame(data2,columns=['Year','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
# Plot the January values against the year.
plt.plot(x1['Year'], x1['Jan'], color='green')
plt.title('Model 1')
plt.xlabel('Year')
plt.ylabel('index')
plt.show()

You can filter your dataframe by year before applying linear regression:
# Keep only the rows whose 'Year' lies in 1979-2020, both endpoints included.
# `df` stands for the DataFrame that holds the 'Year' column.
new_df = df[df['Year'].between(1979, 2020, inclusive="both")]

Related

KMeans Clustering of CSV Data Set

I am trying to create a KMeans clustering model based on a csv data set that I have compiled. The data set is organized as such:
population longitude latitude
Atlanta, GA
Austin, TX
...
I tried just plotting the data, which isn't working: it produces a scatter plot where you can't see the axes or the data points, and I can't really tell if the KMeans algorithm is working.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import csv
# Load the city table and show its first rows.
data = pd.read_csv("data.csv")
print (data.head())
# NOTE(review): pyplot.scatter accepts x and y but has no z keyword, and
# astype(bytes) turns the columns into byte strings rather than numbers --
# probably why the plot is unreadable; confirm.
plt.scatter(x=data['Population'].astype(bytes), y=data['Longitude'].astype(bytes), z=data["Latitude"].astype(bytes))
plt.xlim(0,1000000)
plt.ylim(0,5000)
# NOTE(review): pyplot has no zlim/zlabel; a third axis needs a 3-D Axes.
plt.zlim(0,5000)
plt.xlabel('Population')
plt.ylabel('Longitude')
plt.zlabel('Latitude')
plt.title('KMeans Clustering for Population vs. Latitude and Longitude', fontsize = 10)
plt.show()
# Features for clustering: every row, columns at positions 1 and 2.
x = data.iloc[:,1:3] #selecting features
#Clustering
# Three clusters are requested.
kmeans = KMeans(3)
kmeans.fit(x)
#Clustering Results
# fit_predict fits the model again and returns one cluster label per row,
# so the separate fit() call above is redundant.
indentified_clusters = kmeans.fit_predict(x)
indentified_clusters
array([1,1,0.0,2])
# Attach each row's cluster label to a copy of the original table.
data_with_clusters = data.copy()
# Use the labels returned by kmeans.fit_predict; that variable is spelled
# `indentified_clusters` where it is assigned.
data_with_clusters['Clusters'] = indentified_clusters
# The third positional argument of scatter is the marker size, so Latitude
# sets the point size here; the colour encodes the cluster.
plt.scatter(data_with_clusters['Population'],data_with_clusters['Longitude'],data_with_clusters['Latitude'],c=data_with_clusters['Clusters'],cmap='rainbow')

Try the following :
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Load the city table; "xxx" is a placeholder for the path to the CSV file.
data= pd.read_csv("xxx")
print(data.head())
# Raw positions, framed by the full longitude/latitude range.
plt.scatter(data['Longitude'],data['Latitude'])
plt.xlim(-180,180)
plt.ylim(-90,90)
plt.show()
# Features for clustering: every row, columns at positions 1 and 2.
# NOTE(review): presumably these should be Longitude and Latitude, the
# columns plotted below -- confirm against the CSV's column order.
x = data.iloc[:,1:3]
x
kmeans = KMeans(3)
# fit_predict fits the model and returns one cluster label per row; the
# labels have to be stored here because they are used for colouring below.
identified_clusters = kmeans.fit_predict(x)
data_with_clusters = data.copy()
data_with_clusters['Clusters'] = identified_clusters
# Same scatter as above, now coloured by cluster.
plt.scatter(data_with_clusters['Longitude'],data_with_clusters['Latitude'],c=data_with_clusters['Clusters'],cmap='rainbow')
plt.show()

How to add anomaly points on the boxplot

I used the ellipticenvelope method to find the anomalies in the iris dataset as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# Load the iris data into a DataFrame with one column per feature.
iris = load_iris()
cols = iris.feature_names
X = pd.DataFrame(iris.data, columns=cols)
X.head()
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope
scaler = StandardScaler()
# NOTE(review): the scaled result is not assigned, so everything below works
# on the unscaled X -- confirm whether that is intended, given
# assume_centered=True.
scaler.fit_transform(X)
# Outlier detector; contamination=0.01 is the proportion of points to flag.
cov = EllipticEnvelope(store_precision=True,
assume_centered=True,
support_fraction=None,
contamination=0.01,
random_state=0)
cov.fit(X)
# Store the prediction as a new column; anomalies carry the value -1.
X['Anomaly'] = cov.predict(X)
Now you can find the anomalies in the last column with the value -1.
# Rows that were flagged as anomalies (value -1 in the 'Anomaly' column).
X[X['Anomaly'] == -1]
Now I want to do a root cause analysis to find the source of the anomaly, so I want to plot the anomalies on the boxplot, for example as red dots. Is that possible? If so, how can I add them?
# Box plot of the feature columns listed in cols (the added 'Anomaly'
# column is not part of cols).
X.boxplot(column=cols, grid=False, rot=45)
# TODO: overlay the rows flagged as anomalies on the box plot (e.g. as red dots)
plt.show()

Data mining for machine learning

I am starting out in data analysis and have run into a problem with an exercise from Kaggle (file 'ENB.csv'). I import my data, compute the correlations, and create a new column in my dataframe that sums my two target variables:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import validation_curve
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
# Load the dataset and give the columns readable names.
df = pd.read_csv('ENB.csv')
df.columns= ["relative_compactness","surface_area","wall_area","roof_area","overall_height","orientaion",
"glazing_area","glazing_area_dist","heating_load","cooling_load"]
df.head()
# Pearson correlation matrix, computed once and reused for the heatmap.
corr =df.corr(method = 'pearson')
plt.figure(figsize = (20,10))
sns.heatmap(corr, annot=True, cmap='Greens');
# Total charge per building = heating load + cooling load. No placeholder
# column is needed first: this assignment creates the column.
df['total_charges'] = df['heating_load'] + df['cooling_load']
I have to create a new variable 'charges_classes' that splits the buildings into 4 distinct classes, labelled 0, 1, 2, 3, according to the 3 quantiles of the newly created variable. I have searched but cannot find a solution. Can someone help me? Here is what I did:
# Attempt from the question: get_dummies builds indicator columns from the
# values of total_charges.
# NOTE(review): this yields one column per distinct value, not the four
# quantile classes asked for -- confirm.
charge_classes = pd.get_dummies(df['total_charges'])
charge_classes

You could use qcut:
# Cut total_charges into 4 quantile-based bins; labels=False returns the
# integer bin index (0-3) instead of interval labels.
df['charge_classes'] = pd.qcut(df['total_charges'], 4, labels=False)

Too many lines and curves on the polynomial graph

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
# Load the data; horsepower is the single predictor and mpg the target.
df = pd.read_csv("C:\\Users\\MONSTER\\Desktop\\dosyalar\\datasets\\Auto.csv")
x = df["horsepower"].to_numpy()
y = df["mpg"].to_numpy()
# scikit-learn expects a 2-D feature array: one column, one row per sample.
x = x.reshape(-1,1)
# Expand x into polynomial terms up to degree 5. The transformer is fitted
# once, on the raw x; it must not be re-fitted on the expanded features.
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(x)
# Ordinary linear regression on the polynomial features.
lr = LinearRegression()
lr.fit(X_poly, y)
y_pred = lr.predict(X_poly)
plt.scatter(x,y,color="blue",marker=".")
# The points are joined in the row order of x, which is not sorted here.
plt.plot(x,y_pred,color="red")
I have tried to draw a polynomial regression curve, but I couldn't manage it. Someone told me to sort the values before plotting via "numpy.argsort", but nothing changed. How can I fix it?

Probably scatter is better for you:
# Draw the predictions as separate points, so the order of x does not matter.
plt.scatter(x,y_pred,color="red")
Or with argsort as mentioned:
# Indices that sort the samples by x, so the line is drawn left to right.
orders = np.argsort(x.ravel())
# Plot the fitted values (y_pred), not the raw targets, in that order.
plt.plot(x[orders], y_pred[orders], color='red')

AttributeError: 'OLSResults' object has no attribute 'norm_resid'

When I run this I have the following error :
AttributeError: 'OLSResults' object has no attribute 'norm_resid'
I have the latest version of OLS, so the attribute norm_resid should be there.
Any ideas ?
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
# Load the data; Advertising is the regressor and Sales the response.
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# Fit Sales on Advertising by ordinary least squares (formula interface).
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())
# Observations as red dots, fitted values as a blue line.
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Sales', 'Advertising'])
plt.ylim(0, 70)
plt.xlim(5, 18)
# This is the call that raises the AttributeError reported in the question.
plt.hist(model.norm_resid())
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
# NOTE(review): these labels overwrite the two set just above.
plt.xlabel('Temperature')
plt.ylabel('Gas')
plt.title('Before Insulation')

I had the same issue, but the following worked:
# Histogram of the Pearson residuals; resid_pearson is an attribute, so it
# is used without call parentheses.
plt.hist(model.resid_pearson)
Thus your solution should look like:
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn import datasets, linear_model
from statsmodels.formula.api import ols
"""
Data Management
"""
# Load the data; Advertising is the regressor and Sales the response.
data = pd.read_csv("TestExer1-sales-round1.csv")
X_train = data["Advertising"]
Y_train = data["Sales"]
# Fit Sales on Advertising by ordinary least squares (formula interface).
model = ols("Y_train ~ X_train", data).fit()
print(model.summary())

# Figure 1: observations (red dots) and the fitted line (blue).
plt.figure()
plt.plot(X_train,Y_train , 'ro')
plt.plot(X_train, model.fittedvalues, 'b')
plt.legend(['Observed sales', 'Fitted line'])
plt.ylim(0, 70)
plt.xlim(5, 18)
plt.xlabel('Advertising')
plt.ylabel('Sales')

# Figure 2: histogram of the Pearson residuals on its own axes, so the
# axis limits and labels of the regression plot do not hide or relabel it.
plt.figure()
plt.hist(model.resid_pearson)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
This works when using statsmodels version 0.8.0 or greater.
Note: the Pearson residuals only divide each residual value by the standard error of the residuals, while normalisation also divides each residual by the sum of all residuals. For more, see here
From the docs.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

select subset for regression - python

You can filter your dataframe by year before applying linear regression: new_df = df[df['Year'].between(1979, 2020, inclusive="both")]

Related

KMeans Clustering of CSV Data Set

How to add anomaly points on the boxplot

Data mining for machine learning

Too many lines and curves on the polynomial graph

AttributeError: 'OLSResults' object has no attribute 'norm_resid'

Categories

Resources