Getting an error while making a scatter plot with pyplot - python

I'm trying to make a scatter plot using pyplot, but I'm getting black bars and the points fall along a line rather than being scattered. I'm running k-means on the Wholesale Customers dataset (https://archive.ics.uci.edu/ml/datasets/wholesale+customers#) and want to show scatter plots of every pair of attributes, coloured by cluster.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
data = np.genfromtxt('wholesale_customers.csv', delimiter=',')
ds = pd.read_csv('wholesale_customers.csv', sep=',', header=None)
ds = ds.drop([0])   # drop the header row
ds = ds.drop(0, 1)  # drop the Channel column
ds = ds.drop(1, 1)  # drop the Region column
dsT = ds.T
columns = ['fresh', 'milk', 'grocery', 'frozen', 'detergents', 'delicatessen']

# per-attribute min, max and mean
for i in range(2, 8):
    mean = 0
    min = 100000
    max = 0
    temp = ds[i].values
    for x in temp:
        x = int(x)
        if x < min:
            min = x
        if x > max:
            max = x
        mean += x
    mean /= 440
    print(columns[i - 2], " min: ", min, ", max: ", max, ", mean: ", mean)

temp = ds.values
tempT = dsT.values
kmeans = KMeans(n_clusters=3, random_state=0)
label = kmeans.fit_predict(temp)

# scatter plot for every pair of attributes
for i in range(0, 6):
    for j in range(i + 1, 6):
        plt.scatter(tempT[i], tempT[j])
        plt.xlabel(columns[i])
        plt.ylabel(columns[j])
        plt.show()
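One possible fix, shown as a minimal sketch rather than the asker's code (it assumes the CSV keeps the UCI header row with Channel and Region columns): because the file was read with header=None, every column ends up with string dtype, which is likely why the points line up on crowded categorical axes; converting the frame to numbers first and passing the cluster labels to c= gives properly scattered, cluster-coloured points.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

ds = pd.read_csv('wholesale_customers.csv')        # let pandas parse the header row
ds = ds.drop(columns=['Channel', 'Region'])        # keep the six spend attributes
columns = list(ds.columns)

values = ds.to_numpy(dtype=float)                  # numeric array instead of strings
label = KMeans(n_clusters=3, random_state=0).fit_predict(values)

for i in range(6):
    for j in range(i + 1, 6):
        plt.scatter(values[:, i], values[:, j], c=label, s=10)  # colour by cluster
        plt.xlabel(columns[i])
        plt.ylabel(columns[j])
        plt.show()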

Related

How can I set the color of the output points from the k-means algorithm?

I have the code below for clustering with the K-means algorithm in Python. It works, but the colours in the output picture are hard to see, so I want to change the colours of the result plot, as you can see in the picture below.
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
# Importing the dataset
dataset = pd.read_csv('disayat_luar1.csv')
x = dataset.drop(["kondisi", "Humidity", "Temperature", "Methane", "Carbon Monoxide"], axis = 1)
plt.scatter(x['no'],x['Alcohol'] )
plt.xlim(1,510)
plt.ylim(1,700)
plt.show()
x = x.iloc[:, 0:2]
x_array = np.array(x)
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x_array)
kmeans = KMeans(n_clusters = 2, random_state=123)
kmeans.fit(x_scaled)
x["Cluster"] = kmeans.labels_
output = plt.scatter(x_scaled[:,0], x_scaled[:,1], s = 100, c = x.Cluster, marker = "o", alpha = 1 )
centers = kmeans.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], c='red', s=200, alpha=1 , marker="o");
plt.title("Clustering K-Means")
plt.colorbar(output)
plt.show()
Below is the result picture: https://i.stack.imgur.com/Y0uED.png
Many thanks.
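One way to make the clusters easier to distinguish, as a minimal sketch that reuses x_scaled and the fitted kmeans from the code above, is to pass an explicit colormap via cmap and draw the centroids with a contrasting marker:
output = plt.scatter(x_scaled[:, 0], x_scaled[:, 1], s=100,
                     c=kmeans.labels_, cmap='viridis', alpha=0.8)  # one colour per cluster
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='X')  # centroids
plt.title("Clustering K-Means")
plt.colorbar(output, label='cluster')
plt.show()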

How to remove baseline drift after denoising the signal?

I want to plot a cardiac signal from the Forrest Gump dataset on OpenNeuro. I opened the .tsv file and plotted the signal, then removed the noise with a median filter. But in my opinion the signal still has baseline drift, and I can't figure out how to remove it from the figure; the baseline should be flat along the x-axis.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
import math
x = []
y = []
tsv_file = 'tsvfile'
with open(tsv_file, 'r') as tsvfile:
    lines = csv.reader(tsvfile, delimiter=" ")
    for index, row in enumerate(lines):
        x.append(index)
        y.append(row[2])
window_size = 200
i = 0
moving_averages = []
yy = np.array(y).astype(float)  # np.float is deprecated; use the builtin float
print(len(yy))
while i < len(yy) - window_size + 1:
    window_average = np.sum(yy[i:i+window_size]) / window_size
    moving_averages.append(window_average)
    i += 1
yd = moving_averages
xd = np.arange(len(yd))
print(len(yd))
plt.plot(xd[0:2000], yd[0:2000], color='g', linestyle='dashed', marker='.', label="Weather Data")
plt.show()
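One common way to remove the drift, sketched here under the assumption that yy is the denoised signal from the code above, is to estimate the baseline with a moving average whose window is much longer than one cardiac cycle and subtract it from the signal (scipy.signal.detrend is an alternative for a purely linear drift):
baseline_window = 1000                            # assumed window length, much longer than one beat
kernel = np.ones(baseline_window) / baseline_window
baseline = np.convolve(yy, kernel, mode='same')   # slowly varying baseline estimate
detrended = yy - baseline                         # drift removed, signal centred around zero
plt.plot(detrended[0:2000], color='g', marker='.', label="detrended signal")
plt.legend()
plt.show()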

Traces on Polynomial Regression

Hello, I'm having trouble trying to predict Weekly Sales from the fuel price using polynomial regression. I saw someone else ask the same question and tried the only answer, but I still can't get a good graph. Here's what I've done:
from contextlib import redirect_stderr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
df = pd.read_csv(r'Walmart.csv')
df = df.sort_values(by=['Weekly_Sales'])
y = df.loc[:, "Fuel_Price"].sample(n = 50, random_state= 6)
x = df.loc[:, "Weekly_Sales"].sample(n = 50, random_state= 6)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x.values.reshape(-1,1))
poly.fit(X_poly,y)
linreg = LinearRegression()
linreg.fit(X_poly,y)
y_pred = linreg.predict(X_poly)
plt.scatter(x, y, color='red')
plt.plot(x,y_pred, color = 'blue')
plt.show()
Result: (graph image)
Your main problem is that the x values are not in order after randomly sampling them from df. Replace the x and y sampling lines with
...
xy = df.sample(n=50, random_state=6).sort_values(by=['Weekly_Sales'])
y = xy["Fuel_Price"]
x = xy["Weekly_Sales"]
...
and it should work, e.g. for some made-up data:
Alternatively, you can plot the blue line as a scatter, and then it does not matter if the x values are out of order:
...
plt.plot(x,y_pred ,'.', color = 'blue')
...
and it would look like this:
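For reference, here is the corrected flow end to end, as a sketch (the Walmart.csv file and its column names are taken from the question):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv(r'Walmart.csv')

# sample once, then sort by the x variable so the fitted curve plots left to right
xy = df.sample(n=50, random_state=6).sort_values(by=['Weekly_Sales'])
x = xy["Weekly_Sales"]
y = xy["Fuel_Price"]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x.values.reshape(-1, 1))

linreg = LinearRegression().fit(X_poly, y)
y_pred = linreg.predict(X_poly)

plt.scatter(x, y, color='red')
plt.plot(x, y_pred, color='blue')
plt.show()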

linear regression loop for residuals scatterplot

I'm running a linear regression simulation, fitting one model for each value of the "label" variable. I can print the metric for each model, but I can't get a separate scatterplot for each model; all the plots end up in a single scatterplot. I would like a metric and a separate scatterplot for each model.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns  # needed for sns.scatterplot below

# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
Just create a new figure before the sns plot with plt.figure(), and call plt.show() after the sns plot so that each print statement (the model metric) appears before its plot.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import binom
from scipy.stats import norm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt  # needed for plt.figure() / plt.show()

# generate random numbers from N(0,1)
x = norm.rvs(size=10000, loc=0, scale=1)
y = norm.rvs(size=10000, loc=0, scale=1)
z = binom.rvs(n=10, p=0.8, size=10000)
df = pd.DataFrame(data={'v1': x.flatten(), 'target': y.flatten(), 'label': z.flatten()})
classes = df.label.unique().tolist()
results = []
for name in classes:
    df_subset = df.loc[df['label'] == name]
    reg = LinearRegression()
    reg.fit(df_subset['v1'].values.reshape(-1, 1), df_subset["target"].values.reshape(-1, 1))
    predictions = reg.predict(df_subset['v1'].values.reshape(-1, 1))
    res = np.mean((predictions - df_subset["target"].values.reshape(-1, 1)) ** 2)
    results.append(res)
    msg = "Metric model %s: %f " % (name, res)
    print(msg)
    plt.figure()  # <----------- here: new figure for each model
    df_subset['pred'] = predictions
    sns.scatterplot(data=df_subset, x='pred', y="target")
    plt.show()  # <------------ here: show the plot right after its metric
I would recommend installing the matplotlib library and then:
import matplotlib.pyplot as plt
i = 0  # counter used to give each saved figure a unique file name
.
.
.
# inside your for loop
plot = sns.scatterplot(data=df_subset, x='pred', y="target")
plt.savefig('plot_' + str(i))
plt.clf()
i += 1  # advance the counter so the next model does not overwrite this file

Classification after the best cluster count is found - Sklearn

I am using k-means to classify data.
I found my best number of clusters k with the elbow method and used the silhouette score to validate the decision.
Now, how can I classify my data and plot a distribution chart?
Could you please help me with this?
This is my code.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
%matplotlib inline
df_diabetes = pd.read_csv('diabetes.csv')

# Drop the "Classe" (class) column
df_diabetes_noclass = df_diabetes.drop('Classe', axis=1)
df_diabetes_noclass.head()

nomes = df_diabetes_noclass.columns
valores = df_diabetes_noclass.values
escala_min_max = preprocessing.MinMaxScaler()
valores_normalizados = escala_min_max.fit_transform(valores)
df_diabetes_normalizado = pd.DataFrame(valores_normalizados)
df_diabetes_normalizado.columns = nomes
df_diabetes_normalizado.head(5)

# Elbow method
sse = {}
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(valores_normalizados)  # fit on the normalized values
    df_diabetes_normalizado["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure(figsize=(14, 9))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

# Silhouette scores
X = df_diabetes_normalizado
y = df_diabetes_normalizado
for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("For n_clusters={}, the silhouette coefficient is {}".format(n_cluster, sil_coeff))
I now need to classify my data and create a plot like the image below.
If you want to predict which cluster your new data belongs to, you need to use the predict method:
kmeans.predict(newData)
Here is the documentation link for the predict method:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict
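A short sketch of how that could look with the variables from the question; best_k is an assumed value picked from the elbow and silhouette results above, and new_rows is a hypothetical array of unseen data:
# assumes valores_normalizados, escala_min_max and the imports from the question
best_k = 3
kmeans = KMeans(n_clusters=best_k, random_state=0).fit(valores_normalizados)

# assign every row to its cluster and plot the cluster size distribution
df_diabetes_normalizado["cluster"] = kmeans.labels_
sns.countplot(x="cluster", data=df_diabetes_normalizado)
plt.show()

# classify new, unseen data: scale it the same way, then predict
# new_data = escala_min_max.transform(new_rows)   # new_rows is hypothetical
# print(kmeans.predict(new_data))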
