Getting an error while making a scatter plot with pyplot - python

I'm trying to make a scatter plot using pyplot, but I'm getting black bars and the points fall along a line rather than being scattered. I'm running k-means on the Wholesale Customers dataset (https://archive.ics.uci.edu/ml/datasets/wholesale+customers#) and want to show scatter plots of every pair of attributes, coloured by cluster.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
data = np.genfromtxt('wholesale_customers.csv', delimiter=',')
ds = pd.read_csv('wholesale_customers.csv', sep=',', header=None)
ds = ds.drop([0])   # drop the header row
ds = ds.drop(0, 1)  # drop the Channel column
ds = ds.drop(1, 1)  # drop the Region column
dsT = ds.T
columns = ['fresh', 'milk', 'grocery', 'frozen', 'detergents', 'delicatessen']

# per-attribute min, max and mean
for i in range(2, 8):
    mean = 0
    min = 100000
    max = 0
    temp = ds[i].values
    for x in temp:
        x = int(x)
        if x < min:
            min = x
        if x > max:
            max = x
        mean += x
    mean /= 440
    print(columns[i - 2], " min: ", min, ", max: ", max, ", mean: ", mean)

temp = ds.values
tempT = dsT.values
kmeans = KMeans(n_clusters=3, random_state=0)
label = kmeans.fit_predict(temp)

# scatter plot for every pair of attributes
for i in range(0, 6):
    for j in range(i + 1, 6):
        plt.scatter(tempT[i], tempT[j])
        plt.xlabel(columns[i])
        plt.ylabel(columns[j])
        plt.show()
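One possible fix, shown as a minimal sketch rather than the asker's code (it assumes the CSV keeps the UCI header row with Channel and Region columns): because the file was read with header=None, every column ends up with string dtype, which is likely why the points line up on crowded categorical axes; converting the frame to numbers first and passing the cluster labels to c= gives properly scattered, cluster-coloured points.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

ds = pd.read_csv('wholesale_customers.csv')        # let pandas parse the header row
ds = ds.drop(columns=['Channel', 'Region'])        # keep the six spend attributes
columns = list(ds.columns)

values = ds.to_numpy(dtype=float)                  # numeric array instead of strings
label = KMeans(n_clusters=3, random_state=0).fit_predict(values)

for i in range(6):
    for j in range(i + 1, 6):
        plt.scatter(values[:, i], values[:, j], c=label, s=10)  # colour by cluster
        plt.xlabel(columns[i])
        plt.ylabel(columns[j])
        plt.show()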

Related

How can I set the color of the output points from the k-means algorithm?

I have the code below for clustering with the K-means algorithm in Python. It works, but the colours in the output picture are hard to see, so I want to change the colours of the result plot, as you can see in the picture below.
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# Importing the dataset
dataset = pd.read_csv('disayat_luar1.csv')
x = dataset.drop(["kondisi", "Humidity", "Temperature", "Methane", "Carbon Monoxide"], axis = 1)
plt.scatter(x['no'],x['Alcohol'] )
plt.xlim(1,510)
plt.ylim(1,700)
plt.show()
x = x.iloc[:, 0:2]
x_array = np.array(x)
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x_array)
kmeans = KMeans(n_clusters = 2, random_state=123)
kmeans.fit(x_scaled)
x["Cluster"] = kmeans.labels_
output = plt.scatter(x_scaled[:,0], x_scaled[:,1], s = 100, c = x.Cluster, marker = "o", alpha = 1 )
centers = kmeans.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker="o");
plt.title("Clustering K-Means")
plt.colorbar(output)
plt.show()
Below is the result picture: https://i.stack.imgur.com/Y0uED.png
Many thanks.
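One way to make the clusters easier to distinguish, as a minimal sketch that reuses x_scaled and the fitted kmeans from the code above, is to pass an explicit colormap via cmap and draw the centroids with a contrasting marker:
output = plt.scatter(x_scaled[:, 0], x_scaled[:, 1], s=100,
                     c=kmeans.labels_, cmap='viridis', alpha=0.8)  # one colour per cluster
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X')  # centroids
plt.title("Clustering K-Means")
plt.colorbar(output, label='cluster')
plt.show()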

How to remove baseline drift after denoising the signal?

I want to plot a cardiac signal from the Forrest Gump dataset on OpenNeuro. I opened the .tsv file and plotted the signal, then removed the noise with a median filter. But in my opinion the signal still has baseline drift, and I can't figure out how to remove it from the figure; the baseline should be flat along the x-axis.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
import math
x = []
y = []
tsv_file = 'tsvfile'
with open(tsv_file, 'r') as tsvfile:
    lines = csv.reader(tsvfile, delimiter=" ")
    for index, row in enumerate(lines):
        x.append(index)
        y.append(row[2])
window_size = 200
i = 0
moving_averages = []
yy = np.array(y).astype(float)  # np.float is deprecated; use the builtin float
print(len(yy))
while i < len(yy) - window_size + 1:
    window_average = np.sum(yy[i:i+window_size]) / window_size
    moving_averages.append(window_average)
    i += 1
yd = moving_averages
xd = np.arange(len(yd))
print(len(yd))
plt.plot(xd[0:2000], yd[0:2000], color='g', linestyle='dashed', marker='.', label="Weather Data")
plt.show()
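One common way to remove the drift, sketched here under the assumption that yy is the denoised signal from the code above, is to estimate the baseline with a moving average whose window is much longer than one cardiac cycle and subtract it from the signal (scipy.signal.detrend is an alternative for a purely linear drift):
baseline_window = 1000                            # assumed window length, much longer than one beat
kernel = np.ones(baseline_window) / baseline_window
baseline = np.convolve(yy, kernel, mode='same')   # slowly varying baseline estimate
detrended = yy - baseline                         # drift removed, signal centred around zero
plt.plot(detrended[0:2000], color='g', marker='.', label="detrended signal")
plt.legend()
plt.show()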

Traces on Polynomial Regression

Hello, I'm having trouble trying to predict Weekly Sales from the fuel price using polynomial regression. I saw someone else ask the same question and tried the only answer, but I still can't get a good graph. Here's what I've done:
from contextlib import redirect_stderr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
df = pd.read_csv(r'Walmart.csv')
df = df.sort_values(by=['Weekly_Sales'])
y = df.loc[:, "Fuel_Price"].sample(n = 50, random_state= 6)
x = df.loc[:, "Weekly_Sales"].sample(n = 50, random_state= 6)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x.values.reshape(-1,1))
poly.fit(X_poly,y)
linreg = LinearRegression()
linreg.fit(X_poly,y)
y_pred = linreg.predict(X_poly)
plt.scatter(x, y, color='red')
plt.plot(x,y_pred, color = 'blue')
plt.show()
Result: (graph image)
Your main problem is that the x values are not in order after randomly sampling them from df. Replace the x and y sampling lines with
...
xy = df.sample(n=50, random_state=6).sort_values(by=['Weekly_Sales'])
y = xy["Fuel_Price"]
x = xy["Weekly_Sales"]
...
and it should work, e.g. for some made-up data:
Alternatively, you can plot the blue line as a scatter, and then it does not matter if the x values are out of order:
...
plt.plot(x,y_pred ,'.', color = 'blue')
...
and it would look like this:
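For reference, here is the corrected flow end to end, as a sketch (the Walmart.csv file and its column names are taken from the question):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv(r'Walmart.csv')

# sample once, then sort by the x variable so the fitted curve plots left to right
xy = df.sample(n=50, random_state=6).sort_values(by=['Weekly_Sales'])
x = xy["Weekly_Sales"]
y = xy["Fuel_Price"]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x.values.reshape(-1, 1))

linreg = LinearRegression().fit(X_poly, y)
y_pred = linreg.predict(X_poly)

plt.scatter(x, y, color='red')
plt.plot(x, y_pred, color='blue')
plt.show()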

linear regression loop for residuals scatterplot

I'm running a linear regression simulation, fitting one model for each value of the "label" variable. I can print the metric for each model, but I can't get a separate scatterplot for each model; all the plots end up in a single scatterplot. I would like a metric and a separate scatterplot for each model.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns  # needed for sns.scatterplot below

# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
Just create a new figure before the sns plot with plt.figure(), and call plt.show() after the sns plot so that each print statement (the model metric) appears before its plot.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  # needed for plt.figure() / plt.show()

# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    plt.figure()  # <----------- here: new figure for each model
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.show()  # <------------ here: show the plot right after its metric
I would recommend installing the matplotlib library and then:
import matplotlib.pyplot as plt
i = 0  # counter used to give each saved figure a unique file name
.
.
.
# inside your for loop
plot = sns.scatterplot(data=df_subset, x='pred', y="target")
plt.savefig('plot_' + str(i))
plt.clf()
i += 1  # advance the counter so the next model does not overwrite this file

Classification after the best cluster count is found - Sklearn

I am using k-means to classify data.
I found my best number of clusters k with the elbow method and used the silhouette score to validate the decision.
Now, how can I classify my data and plot a distribution chart?
Could you please help me with this?
This is my code.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
%matplotlib inline
df_diabetes = pd.read_csv('diabetes.csv')

# Drop the "Classe" (class) column
df_diabetes_noclass = df_diabetes.drop('Classe', axis=1)
df_diabetes_noclass.head()

nomes = df_diabetes_noclass.columns
valores = df_diabetes_noclass.values
escala_min_max = preprocessing.MinMaxScaler()
valores_normalizados = escala_min_max.fit_transform(valores)
df_diabetes_normalizado = pd.DataFrame(valores_normalizados)
df_diabetes_normalizado.columns = nomes
df_diabetes_normalizado.head(5)

# Elbow method
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(valores_normalizados)  # fit on the normalized values
    df_diabetes_normalizado["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure(figsize=(14, 9))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

# Silhouette scores
X = df_diabetes_normalizado
y = df_diabetes_normalizado
for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("For n_clusters={}, the silhouette coefficient is {}".format(n_cluster, sil_coeff))
I now need to classify my data and create a plot like the image below.
If you want to predict which cluster your new data belongs to, you need to use the predict method:
kmeans.predict(newData)
Here is the documentation link for the predict method:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict
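A short sketch of how that could look with the variables from the question; best_k is an assumed value picked from the elbow and silhouette results above, and new_rows is a hypothetical array of unseen data:
# assumes valores_normalizados, escala_min_max and the imports from the question
best_k = 3
kmeans = KMeans(n_clusters=best_k, random_state=0).fit(valores_normalizados)

# assign every row to its cluster and plot the cluster size distribution
df_diabetes_normalizado["cluster"] = kmeans.labels_
sns.countplot(x="cluster", data=df_diabetes_normalizado)
plt.show()

# classify new, unseen data: scale it the same way, then predict
# new_data = escala_min_max.transform(new_rows)   # new_rows is hypothetical
# print(kmeans.predict(new_data))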
