I want to make a legend for a scipy dendrogram. I looked for a way to get the hexadecimal colors used by the dendrogram function, but I didn't find anything. How can I do this?
This is the code:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
.......
# Agglomerative clustering with scikit-learn (4 clusters)
agg_clustering = AgglomerativeClustering(n_clusters=4).fit(dfx_scaled)
# Add the "clusters" column holding each row's cluster label
dfx['clusters'] = agg_clustering.labels_
# Plot the dendrogram from a Ward linkage of the scaled data
linkage_matrix = linkage(dfx_scaled, 'ward')
plt.figure(figsize=(10, 7))
# Call dendrogram() exactly once: the original made two identical calls, so
# the same tree was drawn twice on the figure. The return value is kept in
# `d` so it can be inspected afterwards.
d = dendrogram(linkage_matrix, no_labels=True)
# Dashed horizontal reference line at height 140
plt.axhline(y=140, color='black', linestyle='--')
plt.show()
Related
I am trying to create a KMeans clustering model based on a csv data set that I have compiled. The data set is organized as such:
population longitude latitude
Atlanta, GA
Austin, TX
...
I tried just plotting the data, but that isn't working: it produces a scatter plot where you can't see the axes or the data points, and I can't really tell if the KMeans algorithm is working.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import csv
data = pd.read_csv("data.csv")
print(data.head())

# A 3-D scatter needs an Axes created with projection="3d": pyplot's
# plt.scatter() takes no z argument and pyplot has no zlim()/zlabel().
# The columns are plotted as numbers (the original cast them to bytes), and
# the hard-coded 0..5000 limits are dropped so matplotlib scales the axes to
# the data -- latitude/longitude values never get near 5000 and can be negative.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(data['Population'], data['Longitude'], data['Latitude'])
ax.set_xlabel('Population')
ax.set_ylabel('Longitude')
ax.set_zlabel('Latitude')
ax.set_title('KMeans Clustering for Population vs. Latitude and Longitude', fontsize=10)
plt.show()

x = data.iloc[:, 1:3]  # selecting features (columns at positions 1 and 2)

# Clustering
kmeans = KMeans(3)

# Clustering results: fit_predict() fits the model and returns one label per
# row, so a separate fit() call beforehand is not needed.
identified_clusters = kmeans.fit_predict(x)
# (the question pasted sample notebook output here: array([1,1,0.0,2]))

# Keep the original frame untouched and attach the labels to a copy.
data_with_clusters = data.copy()
data_with_clusters['Clusters'] = identified_clusters

# Same 3-D scatter, now coloured by cluster label.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(data_with_clusters['Population'],
           data_with_clusters['Longitude'],
           data_with_clusters['Latitude'],
           c=data_with_clusters['Clusters'], cmap='rainbow')
ax.set_xlabel('Population')
ax.set_ylabel('Longitude')
ax.set_zlabel('Latitude')
plt.show()
Try the following :
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Load the data set ("xxx" is a placeholder path)
data = pd.read_csv("xxx")
print(data.head())
# Raw longitude/latitude scatter, with the axes fixed to the valid
# longitude (-180..180) and latitude (-90..90) ranges.
plt.scatter(data['Longitude'], data['Latitude'])
plt.xlim(-180, 180)
plt.ylim(-90, 90)
plt.show()
# iloc[rows, columns]: all rows, columns at positions 1 and 2
x = data.iloc[:, 1:3]
kmeans = KMeans(3)
# fit_predict() fits the model and returns one cluster label per row. The
# original snippet used the labels without ever computing them (NameError).
identified_clusters = kmeans.fit_predict(x)
# Attach the labels to a copy so the original frame stays unchanged.
data_with_clusters = data.copy()
data_with_clusters['Clusters'] = identified_clusters
# Same scatter, coloured by cluster label.
plt.scatter(data_with_clusters['Longitude'], data_with_clusters['Latitude'],
            c=data_with_clusters['Clusters'], cmap='rainbow')
plt.show()
Can someone help me create a scatter plot? I have written the following code; however, it does not produce the scatter plot that I expected, as all the data are concentrated at only 3 values of the x-variable.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew
from warnings import filterwarnings
filterwarnings('ignore')
df_transactions = pd.read_csv('transactions.csv')
# Total revenue per day: a Series indexed by days_after_open. Selecting the
# column before summing avoids aggregating every other column as well.
daily_revenue = df_transactions.groupby("days_after_open")["revenue"].sum()
# Assigning that Series directly to the frame aligns it on the *row index*
# rather than on the day, which leaves most rows NaN. Map each row's day to
# its total instead so the column is filled correctly.
df_transactions["daily_revenue"] = df_transactions["days_after_open"].map(daily_revenue)
# Plot one point per day: the day on x, that day's total revenue on y.
x = daily_revenue.index
y = daily_revenue.to_numpy()
plt.scatter(x, y, alpha=0.2)
plt.xlabel("Days After Open (days)")
plt.ylabel("Daily Reveue ($)")
plt.savefig("plot")
dataframe image
Please define 'daily_revenue' as follows before moving on to the scatter plot.
# Read the "daily_revenue" column of df_transactions to use as the y values.
y = df_transactions["daily_revenue"]
I have been trying to add a legend to my code below.
I expected it to work once I added the "label" argument, but it just won't show; I'm not sure what I did wrong.
Packages Used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import kruskal
from sklearn.datasets import load_iris
Df1 = pd.read_csv(r"C:\Users\pc admin\Desktop\SUTD Programming\Data Wrangling\Personal Assigment\IBM Data.csv", header=0)
plt.figure(figsize=(20,8))
plt.style.use('seaborn-colorblind')
plt.grid(True, alpha=0.5)
# One density curve of JobSatisfaction per Attrition value. The `label`
# keyword names each curve; the ** markers that surrounded it were markdown
# bold residue, not Python, and made both calls a syntax error.
sns.kdeplot(Df1.loc[Df1['Attrition'] == 'No', 'JobSatisfaction'], label="Previous-Employee")
sns.kdeplot(Df1.loc[Df1['Attrition'] == 'Yes', 'JobSatisfaction'], label="Current-Employees")
plt.xlabel('JobSatisfaction')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Distance From Home Distribution in Percent by Attrition Status')
# Request the legend explicitly so the curve labels are actually displayed.
plt.legend()
You simply need to call the .legend() method of your Axes object. The plotting functions of seaborn return the reference to the Axes directly which is handy. See the documentation of sns.kdeplot
# Keep the Axes that kdeplot returns (the real arguments are elided here).
ax = sns.kdeplot(...)
# Draw the legend in the upper-right corner of that Axes.
ax.legend(loc="upper right")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import kruskal
from sklearn.datasets import load_iris
import matplotlib.patches as mpatches
# Load the data set. The paste residue inside `read_csv` is removed and the
# path, which had been broken across two lines, is rejoined.
Df1 = pd.read_csv(
    r"C:\Users\pc admin\Desktop\SUTD Programming\Data Wrangling\Personal Assigment\IBM Data.csv",
    header=0)
plt.figure(figsize=(20, 8))
plt.style.use('seaborn-colorblind')
plt.grid(True, alpha=0.5)
# One density curve of JobSatisfaction per Attrition value (the ** markdown
# markers around the label keyword are removed).
sns.kdeplot(Df1.loc[Df1['Attrition'] == 'No', 'JobSatisfaction'],
            label="Previous-Employee")
sns.kdeplot(Df1.loc[Df1['Attrition'] == 'Yes', 'JobSatisfaction'],
            label="Current-Employees")
plt.xlabel('JobSatisfaction')
plt.xlim(left=0)
plt.ylabel('Density')
plt.title('Distance From Home Distribution in Percent by Attrition Status')
# Build the legend entries by hand from coloured patches.
# NOTE(review): these patch colours come from the Reds/Blues colormaps and are
# chosen independently of the colours the curves are drawn in -- confirm they
# match, or the legend will not correspond to the plot.
handles = [mpatches.Patch(facecolor=plt.cm.Reds(100),
                          label="Active Employee"),
           mpatches.Patch(facecolor=plt.cm.Blues(100),
                          label="Ex employee")]
plt.legend(handles=handles)
# Choose whatever colours you want.
# Original author's note (unverified): "legends was deprecated in python 3
# if you have object of sns.kdeplot".
Code is below
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
# Ten random rows with three feature columns named A, B and C.
df = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC"))
# Fit a three-cluster KMeans model on the frame, then store each row's label.
km = KMeans(n_clusters=3)
km.fit(df)
df['cluster_id'] = km.labels_
# Cluster label -> colour name lookup (not used by anything below).
test = dict(enumerate(("Blue", "Red", "Green")))
#sns.scatterplot()
plt.show()
I am trying to plot the clusters without being restricted to specific x and y columns. I need this to work for any number of columns; I just want to plot the cluster graph.
I have used the Digits dataset from Sklearn and I have tried to reduce the dimension from 64 to 3 using TSNE( t-Distributed Stochastic Neighbor Embedding):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotib inline
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from mpl_toolkits.mplot3d import Axes3D
# Load the digits dataset that ships with scikit-learn.
digits = load_digits()
# Put the feature matrix in a DataFrame and append the labels as "target".
digits_df = pd.DataFrame(digits.data,)
digits_df["target"] = pd.Series(digits.target)
# Reduce the first 64 columns (the columns before "target") to 3 components.
tsne = TSNE(n_components=3)
digits_tsne = tsne.fit_transform(digits_df.iloc[:,:64])
digits_df_tsne = pd.DataFrame(digits_tsne,
columns =["Component1","Component2","Component3"])
# Place the three components and the target column side by side.
finalDf = pd.concat([digits_df_tsne, digits_df["target"]], axis = 1)
# Visualizing in 3D
figure = plt.figure(figsize=(9,9))
axes = figure.add_subplot(111,projection = "3d")
# NOTE(review): finalDf is a DataFrame, and the (slice, int) keys used below
# are the "invalid key" named in the TypeError reported for this snippet.
dots = axes.scatter(xs = finalDf[:,0],ys = finalDf[:,1],zs = finalDf[:,2],
c = digits.target, cmap = plt.cm.get_cmap("nipy_spectral_r",10))
The finalDf:
The error:
TypeError: '(slice(None, None, None), 0)' is an invalid key
What is wrong? Could someone help me?
You're trying numpy slicing on pandas dataframe which is not valid, so first convert the dataframes to numpy arrays.
Here's the updated code: -
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotib inline
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from mpl_toolkits.mplot3d import Axes3D
# Load the digits dataset that ships with scikit-learn.
digits = load_digits()
# Put the feature matrix in a DataFrame and append the labels as "target".
digits_df = pd.DataFrame(digits.data)
digits_df["target"] = pd.Series(digits.target)
# Reduce the first 64 columns (the columns before "target") to 3 components.
tsne = TSNE(n_components=3)
digits_tsne = tsne.fit_transform(digits_df.iloc[:, :64])
digits_df_tsne = pd.DataFrame(digits_tsne,
                              columns=["Component1", "Component2", "Component3"])
# Place the three components and the target column side by side.
finalDf = pd.concat([digits_df_tsne, digits_df["target"]], axis=1)
# Visualizing in 3D
figure = plt.figure(figsize=(9, 9))
axes = figure.add_subplot(111, projection="3d")
# Convert only the three component columns to a NumPy array, once, so that
# positional [:, i] slicing is valid (the original converted the whole frame,
# target column included, three separate times).
coords = finalDf[["Component1", "Component2", "Component3"]].to_numpy()
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases have
# deprecated and removed; the second argument requests 10 colours.
dots = axes.scatter(xs=coords[:, 0], ys=coords[:, 1], zs=coords[:, 2],
                    c=digits.target, cmap=plt.get_cmap("nipy_spectral_r", 10))