I have used this code to create clusters and I want to plot the scatter plot of the clusters. The vectorAssembles_01 produces data with ID and features. Both should be used to plot the scatter plot.When I am running the code in google Collab I am getting an error message stating RecursionError: maximum recursion depth exceeded in comparison. please correct if I am wrong.
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import numpy as np
import matplotlib.pyplot as plt
FEATURES_COL = ['Height(CM)', 'Weight(KG)',
'Crossing', 'Finishing', 'HeadingAccuracy',
'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
'FKAccuracy', 'LongPassing', 'BallControl',
'Acceleration', 'SprintSpeed', 'Agility',
'Reactions', 'Balance', 'ShotPower', 'Jumping',
'Stamina', 'Strength', 'LongShots', 'Aggression',
'Interceptions', 'Positioning', 'Vision', 'Penalties',
'Composure', 'Marking', 'StandingTackle', 'SlidingTackle']
vecAssembler_01 = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeansn = vecAssembler_01.transform(df).select('ID','features')
df_kmeansn.show()
#df_kmeansn.plot("ID","fearures",kind="Scatter")
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = df_kmeansn.ID
y = df_kmeansn.features
ax.scatter(x, y, alpha=0.8, edgecolors='none')
The output of the df_kmeansn is as shown below.
I'm not sure you can just plot Spark Dataframe directly, perhaps you should call "to_pandas" first
# ...
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
df_pandas = df_kmeansn.to_pandas()
x = df_pandas.ID
y = df_pandas.features
ax.scatter(x, y, alpha=0.8, edgecolors='none')
Related
Each row in the dataset has three datapoints. How can I plot a line for each one, as indicated?
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
dataset = pd.read_csv('data.csv')
X = dataset.iloc[:, 0:2].values
y = dataset.iloc[:, -1].values
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], y, marker='.', color="red")
ax.set_xlabel("Cone")
ax.set_ylabel("Time")
ax.set_zlabel("Temp")
plt.show()
This is the data. SO wont let me save the post now that I have added the data because it says my question is mostly code, so I am writing this longwinded thing so that hopefully it lets me post. You can just ignore this paragraph. It is only here to balance out the code with prose so that Stack overflow will let me post.
cone,ramp,temp
4,15,1141
4,60,1162
4,150,1183
5,15,1159
5,60,1186
5,150,1207
6,15,1185
6,60,1222
6,150,1243
7,15,1201
7,60,1239
7,150,1257
8,15,1211
8,60,1249
8,150,1271
9,15,1224
9,60,1260
9,150,1280
10,15,1251
10,60,1285
10,150,1305
11,15,1272
11,60,1294
11,150,1315
12,15,1285
12,60,1306
12,150,1326
13,15,1310
13,60,1331
13,150,1348
14,15,1351
14,60,1365
14,150,1384
One way is to loop over the unique values of the cone column:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
for u in dataset["cone"].unique():
extracted_df = dataset[dataset["cone"] == u]
values = extracted_df.values
ax.plot(values[:, 0], values[:, 1], values[:, 2], color="red")
ax.set_xlabel("Cone")
ax.set_ylabel("Time")
ax.set_zlabel("Temp")
plt.show()
Having the geographic points with values, I would like to encode the values with colormap and customize the legend position and colormap range.
Using geopandas, I have written the following function:
def plot_continuous(df, column_values, title):
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('off')
df.plot(ax=ax, column=column_values, cmap='OrRd', legend=True);
ax.title.set_text(title)
The colorbar by default is vertical, but I would like to make it horizontal.
In order to have a horizontal colorbar, I have written the following function:
def plot_continuous(df, column_values, title, legend_title=None):
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
x = np.array(df.geometry.apply(lambda x: x.x))
y = np.array(df.geometry.apply(lambda x: x.y))
vals = np.array(df[column_values])
sc = ax.scatter(x, y, c=vals, cmap='OrRd')
cbar = plt.colorbar(sc, orientation="horizontal")
if legend_title is not None:
cbar.ax.set_xlabel(legend_title)
ax.title.set_text(title)
The image width and height in the latter case, however, is not proportional so the output looks distorted.
Does anyone know how to customize the geographic plot and keep the width-height ratio undistorted?
This gets far simpler if you use geopandas customisation of plot()
This is documented: https://geopandas.org/en/stable/docs/user_guide/mapping.html
Below I show MWE using your function and then using geopandas. Later has scaled data correctly.
MWE of your code
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
def plot_continuous(df, column_values, title, legend_title=None):
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
x = np.array(df.geometry.apply(lambda x: x.x))
y = np.array(df.geometry.apply(lambda x: x.y))
vals = np.array(df[column_values])
sc = ax.scatter(x, y, c=vals, cmap='OrRd')
cbar = plt.colorbar(sc, orientation="horizontal")
if legend_title is not None:
cbar.ax.set_xlabel(legend_title)
ax.title.set_text(title)
cities = gpd.read_file(gpd.datasets.get_path("naturalearth_cities"))
cities["color"] = np.random.randint(1,10, len(cities))
plot_continuous(cities, "color", "my title", "Color")
use geopandas
ax = cities.plot(
column="color",
cmap="OrRd",
legend=True,
legend_kwds={"label": "Color", "orientation": "horizontal"},
)
ax.set_title("my title")
i want to plot some data like in the link below.
What can i do when i have more than "TV & Radio" in the OLS formula and i only want to plot these two with "Sales"? Because if i do like in the code below (link), it shows me an error that the others are not defined (except TV & Radio)..
thanks for help!
https://stackoverflow.com/a/26434204/14208684
Here is the code of the link:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from matplotlib import cm
csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
model = sm.ols(formula='Sales ~ TV + Radio', data = csv)
fit = model.fit()
fit.summary()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_surf = np.arange(0, 350, 20) # generate a mesh
y_surf = np.arange(0, 60, 4)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
exog = pd.core.frame.DataFrame({'TV': x_surf.ravel(), 'Radio': y_surf.ravel()})
out = fit.predict(exog = exog)
ax.plot_surface(x_surf, y_surf,
out.reshape(x_surf.shape),
rstride=1,
cstride=1,
color='None',
alpha = 0.4)
ax.scatter(csv['TV'], csv['Radio'], csv['Sales'],
c='blue',
marker='o',
alpha=1)
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')
plt.show()
How can I make a Loading plot with Matplotlib of a PLS-DA plot, like the loading plot like that of PCA?
This answer explains how it can be done with PCA:
Plot PCA loadings and loading in biplot in sklearn (like R's autoplot)
However there are some significant differences between the two methods which makes the implementation different as well. (Some of the relevant differences are explained here https://learnche.org/pid/latent-variable-modelling/projection-to-latent-structures/interpreting-pls-scores-and-loadings )
To make the PLS-DA plot I use the following code:
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
import numpy as np
import pandas as pd
targets = [0, 1]
x_vals = StandardScaler().fit_transform(df.values)
y = [g == targets[0] for g in sample_description]
y = np.array(y, dtype=int)
plsr = PLSRegression(n_components=2, scale=False)
plsr.fit(x_vals, y)
colormap = {
targets[0]: '#ff0000', # Red
targets[1]: '#0000ff', # Blue
}
colorlist = [colormap[c] for c in sample_description]
scores = pd.DataFrame(plsr.x_scores_)
scores.index = x.index
x_loadings = plsr.x_loadings_
y_loadings = plsr.y_loadings_
fig1, ax = get_default_fig_ax('Scores on LV 1', 'Scores on LV 2', title)
ax = scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=colorlist, ax=ax)
I took your code and enhanced it. The biplot is obtained via simply overlaying the score and the loading plot.
Other, more rigerous plots could be made with truely shared axis according to https://blogs.sas.com/content/iml/2019/11/06/what-are-biplots.html#:~:text=A%20biplot%20is%20an%20overlay,them%20on%20a%20single%20plot.
The code below generates this image for a dataset with ~200 features (therefore there are ~200 red arrows shown):
from sklearn.cross_decomposition import PLSRegression
pls2 = PLSRegression(n_components=2)
pls2.fit(X_train, Y_train)
x_loadings = pls2.x_loadings_
y_loadings = pls2.y_loadings_
fig, ax = plt.subplots(constrained_layout=True)
scores = pd.DataFrame(pls2.x_scores_)
scores.plot(x=0, y=1, kind='scatter', s=50, alpha=0.7,
c=Y_train.values[:,0], ax = ax)
newax = fig.add_axes(ax.get_position(), frameon=False)
feature_n=x_loadings.shape[0]
print(x_loadings.shape)
for feature_i in range(feature_n):
comp_1_idx=0
comp_2_idx=1
newax.arrow(0, 0, x_loadings[feature_i,comp_1_idx], x_loadings[feature_i,comp_2_idx],color = 'r',alpha = 0.5)
newax.get_xaxis().set_visible(False)
newax.get_yaxis().set_visible(False)
plt.show()
I would like to do an histogram with mixture 1D gaussian as the picture.
Thanks Meng for the picture.
My histogram is this:
I have a file with a lot of data (4,000,000 of numbers) in a column:
1.727182
1.645300
1.619943
1.709263
1.614427
1.522313
And I'm using the follow script with modifications than Meng and Justice Lord have done :
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
x = open("prueba.dat").read().splitlines()
f = np.ravel(x).astype(np.float)
f=f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
plt.plot(f,weights[0]*stats.norm.pdf(f,means[0],np.sqrt(covars[0])), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
And when I run the script, I have the follow plot:
So, I don't have idea how put the start and end of all gaussians that must be there. I'm new in python and I'm confuse with the way to use the modules. Please, Can you help me and guide me how can I do this plot?
Thanks a lot
Although this is a reasonably old thread, I would like to provide my take on it. I believe my answer can be more comprehensible to some. Moreover, I include a test to check whether or not the desired number of components makes statistical sense via the BIC criterion.
# import libraries (some are for cosmetics)
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
import astropy
from scipy.stats import norm
from sklearn.mixture import GaussianMixture as GMM
import matplotlib as mpl
mpl.rcParams['axes.linewidth'] = 1.5
mpl.rcParams.update({'font.size': 15, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# create the data as in #Meng's answer
x = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000)))
x = x.reshape(-1, 1)
# first of all, let's confirm the optimal number of components
bics = []
min_bic = 0
counter=1
for i in range (10): # test the AIC/BIC metric between 1 and 10 components
gmm = GMM(n_components = counter, max_iter=1000, random_state=0, covariance_type = 'full')
labels = gmm.fit(x).predict(x)
bic = gmm.bic(x)
bics.append(bic)
if bic < min_bic or min_bic == 0:
min_bic = bic
opt_bic = counter
counter = counter + 1
# plot the evolution of BIC/AIC with the number of components
fig = plt.figure(figsize=(10, 4))
ax = fig.add_subplot(1,2,1)
# Plot 1
plt.plot(np.arange(1,11), bics, 'o-', lw=3, c='black', label='BIC')
plt.legend(frameon=False, fontsize=15)
plt.xlabel('Number of components', fontsize=20)
plt.ylabel('Information criterion', fontsize=20)
plt.xticks(np.arange(0,11, 2))
plt.title('Opt. components = '+str(opt_bic), fontsize=20)
# Since the optimal value is n=2 according to both BIC and AIC, let's write down:
n_optimal = opt_bic
# create GMM model object
gmm = GMM(n_components = n_optimal, max_iter=1000, random_state=10, covariance_type = 'full')
# find useful parameters
mean = gmm.fit(x).means_
covs = gmm.fit(x).covariances_
weights = gmm.fit(x).weights_
# create necessary things to plot
x_axis = np.arange(-20, 30, 0.1)
y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian
ax = fig.add_subplot(1,2,2)
# Plot 2
plt.hist(x, density=True, color='black', bins=np.arange(-100, 100, 1))
plt.plot(x_axis, y_axis0, lw=3, c='C0')
plt.plot(x_axis, y_axis1, lw=3, c='C1')
plt.plot(x_axis, y_axis0+y_axis1, lw=3, c='C2', ls='dashed')
plt.xlim(-10, 20)
#plt.ylim(0.0, 2.0)
plt.xlabel(r"X", fontsize=20)
plt.ylabel(r"Density", fontsize=20)
plt.subplots_adjust(wspace=0.3)
plt.show()
plt.close('all')
It's all about reshape.
First, you need to reshape f.
For pdf, reshape before using stats.norm.pdf. Similarly, sort and reshape before plotting.
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# x = open("prueba.dat").read().splitlines()
# create the data
x = np.concatenate((np.random.normal(5, 5, 1000),np.random.normal(10, 2, 1000)))
f = np.ravel(x).astype(np.float)
f=f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
f_axis = f.copy().ravel()
f_axis.sort()
plt.plot(f_axis,weights[0]*stats.norm.pdf(f_axis,means[0],np.sqrt(covars[0])).ravel(), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()