plot data as linear model in 3D with matplotlib - python

i want to plot some data like in the link below.
What can i do when i have more than "TV & Radio" in the OLS formula and i only want to plot these two with "Sales"? Because if i do like in the code below (link), it shows me an error that the others are not defined (except TV & Radio)..
thanks for help!
https://stackoverflow.com/a/26434204/14208684
Here is the code of the link:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from matplotlib import cm
csv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
model = sm.ols(formula='Sales ~ TV + Radio', data = csv)
fit = model.fit()
fit.summary()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_surf = np.arange(0, 350, 20) # generate a mesh
y_surf = np.arange(0, 60, 4)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
exog = pd.core.frame.DataFrame({'TV': x_surf.ravel(), 'Radio': y_surf.ravel()})
out = fit.predict(exog = exog)
ax.plot_surface(x_surf, y_surf,
out.reshape(x_surf.shape),
rstride=1,
cstride=1,
color='None',
alpha = 0.4)
ax.scatter(csv['TV'], csv['Radio'], csv['Sales'],
c='blue',
marker='o',
alpha=1)
ax.set_xlabel('TV')
ax.set_ylabel('Radio')
ax.set_zlabel('Sales')
plt.show()

Related

Scale y-axis for really small numbers

I'm trying to scale the y-axis so my errorbars can be seen.
Any help would be appreciated! :)
Here is my current code.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# if using a Jupyter notebook, include:
%matplotlib inline
x = ntermsList
y = allPmuCycleCountAverages
xerr = 0
yerr = allPmuCycleCountStandardDeviations
fig, ax = plt.subplots()
ax.errorbar(x, y, xerr=xerr, yerr=yerr,fmt='-o')
ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_title('Line plot with error bars')
ax.set_xticks(ntermsList)
ax.set_xticklabels(ntermsList)
ax.set_yticks(allPmuCycleCountAverages)
ax.yaxis.grid(True)
plt.show()
I've tried these solutions, but no joy:
plt.ylim(-1, 1)
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True
plt.yticks(np.arange(min(y), max(y)+0.5, 0.01))
I was expecting the y-axis scale to zoom close enough to the points so my errorbars could be seen
Try autoscalling based in y ticks. Here I'm adding some logic that just rescales the y-axis based on the data that is in the visible x-region. As I don't have your data I took random data.
import numpy as np
import random
ntermsList = np.random.randint(low=0, high=10, size=(555,))
allPmuCycleCountAverages = np.random.randint(low=0, high=10, size=(555,))
allPmuCycleCountStandardDeviations = np.random.randint(low=0, high=10, size=(555,))
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# if using a Jupyter notebook, include:
%matplotlib inline
x = ntermsList
y = allPmuCycleCountAverages
xerr = 0
yerr = allPmuCycleCountStandardDeviations
fig, ax = plt.subplots()
ax.errorbar(x, y, xerr=xerr, yerr=yerr,fmt='-o')
ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_title('Line plot with error bars')
ax.set_xticks(ntermsList)
ax.set_xticklabels(ntermsList)
ax.set_yticks(allPmuCycleCountAverages)
#plt.setp(ax.get_yticklabels(), rotation=90, horizontalalignment='right')
ax.yaxis.grid(True)
margin =0.1
def get_bottom_top(line):
xd = line.get_xdata()
yd = line.get_ydata()
lo,hi = ax.get_xlim()
y_displayed = yd[((xd>lo) & (xd<hi))]
h = np.max(y_displayed) - np.min(y_displayed)
bot = np.min(y_displayed)-margin*h
top = np.max(y_displayed)+margin*h
return bot,top
lines = ax.get_lines()
bot,top = np.inf, -np.inf
for line in lines:
new_bot, new_top = get_bottom_top(line)
if new_bot < bot: bot = new_bot
if new_top > top: top = new_top
ax.set_ylim(bot,top)
plt.show()
Before Rescalling
After rescalling

Formatting a plot in Seaborn

I made a PMF plot using seaborn:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
n= 1000 #number of trials
p= 0.5 #probability
trial_2 = np.random.binomial(n,p,1000)
sns.displot(trial_2, stat = 'probability')
trial_2_mean= np.mean(trial_2)
plt.axvline(trial_2_mean,color='red')
plt.xlabel("Number of Successes")
red_patch = mpatches.Patch(color='red', label='Mean')
plt.legend(handles=[red_patch])
I want to add text to the plot like below (the n=60 and p=0.1):
Also how do I plot in a format similar to the one in the picture (straight lines)
You can do following:
from scipy.stats import binom
n = 50
p = 0.1
x = [x for x in range(15)]
trial_2 = binom.pmf(x, n, p)
sns.scatterplot(x, trial_2,label=('$n=50, p=0.1$'))
plt.vlines(x, 0, trial_2, colors='red', lw=3, alpha=0.4)
plt.xticks(x)
plt.ylabel('Probability')
plt.xlabel('Number of Successes')
plt.show()
Produces:

How to plot scatter plot using python?

I have used this code to create clusters and I want to plot the scatter plot of the clusters. The vectorAssembles_01 produces data with ID and features. Both should be used to plot the scatter plot.When I am running the code in google Collab I am getting an error message stating RecursionError: maximum recursion depth exceeded in comparison. please correct if I am wrong.
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import numpy as np
import matplotlib.pyplot as plt
FEATURES_COL = ['Height(CM)', 'Weight(KG)',
'Crossing', 'Finishing', 'HeadingAccuracy',
'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
'FKAccuracy', 'LongPassing', 'BallControl',
'Acceleration', 'SprintSpeed', 'Agility',
'Reactions', 'Balance', 'ShotPower', 'Jumping',
'Stamina', 'Strength', 'LongShots', 'Aggression',
'Interceptions', 'Positioning', 'Vision', 'Penalties',
'Composure', 'Marking', 'StandingTackle', 'SlidingTackle']
vecAssembler_01 = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeansn = vecAssembler_01.transform(df).select('ID','features')
df_kmeansn.show()
#df_kmeansn.plot("ID","fearures",kind="Scatter")
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = df_kmeansn.ID
y = df_kmeansn.features
ax.scatter(x, y, alpha=0.8, edgecolors='none')
The output of the df_kmeansn is as shown below.
I'm not sure you can just plot Spark Dataframe directly, perhaps you should call "to_pandas" first
# ...
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
df_pandas = df_kmeansn.to_pandas()
x = df_pandas.ID
y = df_pandas.features
ax.scatter(x, y, alpha=0.8, edgecolors='none')

How can I do a histogram with 1D gaussian mixture with sklearn?

I would like to do an histogram with mixture 1D gaussian as the picture.
Thanks Meng for the picture.
My histogram is this:
I have a file with a lot of data (4,000,000 of numbers) in a column:
1.727182
1.645300
1.619943
1.709263
1.614427
1.522313
And I'm using the follow script with modifications than Meng and Justice Lord have done :
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
x = open("prueba.dat").read().splitlines()
f = np.ravel(x).astype(np.float)
f=f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
plt.plot(f,weights[0]*stats.norm.pdf(f,means[0],np.sqrt(covars[0])), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()
And when I run the script, I have the follow plot:
So, I don't have idea how put the start and end of all gaussians that must be there. I'm new in python and I'm confuse with the way to use the modules. Please, Can you help me and guide me how can I do this plot?
Thanks a lot
Although this is a reasonably old thread, I would like to provide my take on it. I believe my answer can be more comprehensible to some. Moreover, I include a test to check whether or not the desired number of components makes statistical sense via the BIC criterion.
# import libraries (some are for cosmetics)
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
import astropy
from scipy.stats import norm
from sklearn.mixture import GaussianMixture as GMM
import matplotlib as mpl
mpl.rcParams['axes.linewidth'] = 1.5
mpl.rcParams.update({'font.size': 15, 'font.family': 'STIXGeneral', 'mathtext.fontset': 'stix'})
# create the data as in #Meng's answer
x = np.concatenate((np.random.normal(5, 5, 1000), np.random.normal(10, 2, 1000)))
x = x.reshape(-1, 1)
# first of all, let's confirm the optimal number of components
bics = []
min_bic = 0
counter=1
for i in range (10): # test the AIC/BIC metric between 1 and 10 components
gmm = GMM(n_components = counter, max_iter=1000, random_state=0, covariance_type = 'full')
labels = gmm.fit(x).predict(x)
bic = gmm.bic(x)
bics.append(bic)
if bic < min_bic or min_bic == 0:
min_bic = bic
opt_bic = counter
counter = counter + 1
# plot the evolution of BIC/AIC with the number of components
fig = plt.figure(figsize=(10, 4))
ax = fig.add_subplot(1,2,1)
# Plot 1
plt.plot(np.arange(1,11), bics, 'o-', lw=3, c='black', label='BIC')
plt.legend(frameon=False, fontsize=15)
plt.xlabel('Number of components', fontsize=20)
plt.ylabel('Information criterion', fontsize=20)
plt.xticks(np.arange(0,11, 2))
plt.title('Opt. components = '+str(opt_bic), fontsize=20)
# Since the optimal value is n=2 according to both BIC and AIC, let's write down:
n_optimal = opt_bic
# create GMM model object
gmm = GMM(n_components = n_optimal, max_iter=1000, random_state=10, covariance_type = 'full')
# find useful parameters
mean = gmm.fit(x).means_
covs = gmm.fit(x).covariances_
weights = gmm.fit(x).weights_
# create necessary things to plot
x_axis = np.arange(-20, 30, 0.1)
y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian
ax = fig.add_subplot(1,2,2)
# Plot 2
plt.hist(x, density=True, color='black', bins=np.arange(-100, 100, 1))
plt.plot(x_axis, y_axis0, lw=3, c='C0')
plt.plot(x_axis, y_axis1, lw=3, c='C1')
plt.plot(x_axis, y_axis0+y_axis1, lw=3, c='C2', ls='dashed')
plt.xlim(-10, 20)
#plt.ylim(0.0, 2.0)
plt.xlabel(r"X", fontsize=20)
plt.ylabel(r"Density", fontsize=20)
plt.subplots_adjust(wspace=0.3)
plt.show()
plt.close('all')
It's all about reshape.
First, you need to reshape f.
For pdf, reshape before using stats.norm.pdf. Similarly, sort and reshape before plotting.
from matplotlib import rc
from sklearn import mixture
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import matplotlib.ticker as tkr
import scipy.stats as stats
# x = open("prueba.dat").read().splitlines()
# create the data
x = np.concatenate((np.random.normal(5, 5, 1000),np.random.normal(10, 2, 1000)))
f = np.ravel(x).astype(np.float)
f=f.reshape(-1,1)
g = mixture.GaussianMixture(n_components=3,covariance_type='full')
g.fit(f)
weights = g.weights_
means = g.means_
covars = g.covariances_
plt.hist(f, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
f_axis = f.copy().ravel()
f_axis.sort()
plt.plot(f_axis,weights[0]*stats.norm.pdf(f_axis,means[0],np.sqrt(covars[0])).ravel(), c='red')
plt.rcParams['agg.path.chunksize'] = 10000
plt.grid()
plt.show()

Less noisy graph and extra humps in python

Here is the data file:
https://jsfiddle.net/83ygso6u/
Sorry for posting it in jsfiddle... didn't know where else to host it.
Anyway the second column should be ignored.
Here is the code and graph:
import pylab as plb
import math
from pylab import *
import matplotlib.pyplot as plt
data = plb.loadtxt('title_of_datafile.txt')
x = data[:,0]*1000
y= data[:,2]
plt.figure()
plt.title('Some_Title',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('dBu',fontsize=30)
plt.plot(x,y,'k-', label='Data')
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.show()
So you can see this signal is quite noisy, but it does have two distinct peaks at around 4500 Hz and 5500 Hz.
I have been searching around the net and havn't really come across anything that will help me.
How can I extract these peaks and/or clean up the signal in python?
Well I managed to find a solution. Here is the script with the resulting plot.
Script:
import pylab as plb
import math
from pylab import *
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from scipy import signal
import peakutils
from peakutils.plot import plot as pplot
data = plb.loadtxt('data_file_name')
x = data[:,0]*1000
y= data[:,2]
y1 = sp.signal.medfilt(y,431) # remove noise to the signal
indexes = peakutils.indexes(y1, thres=0.00005, min_dist=1400) #determine peaks
x_new = x[indexes]
plt.figure()
plt.subplot(1,2,1)
plt.title('some_title_1',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('Signal (dBu)',fontsize=30)
plt.plot(x,y,'r-', label='Raw Data')
plt.plot(x,y1,'b-', label='Cleaned up Signal')
plt.plot(x_new[3:6],y1[indexes][3:6],'k^',markersize=10, label='Peaks')
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.legend(loc=1,prop={'size':30})
plt.subplot(1,2,2)
for i,j in zip(x_new[3:6], y1[indexes][3:6]):
plt.annotate(str(i)+ " Hz",xy=(i,j+0.5),fontsize=15)
plt.title('some_title_2',fontsize=35, y=1.05)
plt.xlabel('Frequency (Hz)',fontsize=30)
plt.ylabel('Signal (dBu)',fontsize=30)
plt.plot(x,y,'r-', label='Data')
plt.plot(x,y1,'b-')
plt.plot(x_new[3:6],y1[indexes][3:6],'k^',markersize=10)
plt.xticks(fontsize = 25, y=-0.008)
plt.yticks(fontsize = 25, x=-0.008)
plt.xlim([3000, 6000])
plt.ylim([-90, -75])
plt.subplots_adjust(hspace = 0.6)
plt.show()

Categories