How to scatter plot 3 columns - python

Code is below
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
# Build a 10-row frame of random values in three feature columns A, B and C.
df = pd.DataFrame(np.random.rand(10,3), columns=["A", "B","C"])
# Fit k-means with three clusters on all three columns.
km = KMeans(n_clusters=3).fit(df)
# Store the label assigned to each row next to its features.
df['cluster_id'] = km.labels_
# Mapping from cluster label to a colour name; it is not referenced again in this snippet.
test = {0:"Blue", 1:"Red", 2:"Green"}
# Placeholder for the plotting call the question is asking about (left disabled).
#sns.scatterplot()
# No plotting call has been made above, so there is no cluster plot to show yet.
plt.show()
I am trying to plot the clusters without being restricted to a fixed pair of x and y columns. It needs to work for any number of columns; I just want to plot the cluster graph.

Related

KMeans Clustering of CSV Data Set

I am trying to create a KMeans clustering model based on a csv data set that I have compiled. The data set is organized as such:
population longitude latitude
Atlanta, GA
Austin, TX
...
I tried just plotting the data, which isn't working: it produces a scatter plot where you can't see the axes or the data points, and I can't really tell if the KMeans algorithm is working.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import csv
# Load the CSV and preview the first rows.
data = pd.read_csv("data.csv")
print (data.head())
# NOTE(review): astype(bytes) turns each column into byte strings rather than numbers,
# which may be why the axes look unreadable — confirm. pyplot's scatter is 2-D, so the
# z= keyword below is presumably not accepted; a third axis needs a 3-D Axes — verify.
plt.scatter(x=data['Population'].astype(bytes), y=data['Longitude'].astype(bytes), z=data["Latitude"].astype(bytes))
plt.xlim(0,1000000)
plt.ylim(0,5000)
# NOTE(review): zlim/zlabel do not appear to be pyplot functions — TODO confirm.
plt.zlim(0,5000)
plt.xlabel('Population')
plt.ylabel('Longitude')
plt.zlabel('Latitude')
plt.title('KMeans Clustering for Population vs. Latitude and Longitude', fontsize = 10)
plt.show()
# Feature matrix: every row, columns at positions 1 and 2.
x = data.iloc[:,1:3] #selecting features
#Clustering
kmeans = KMeans(3)
kmeans.fit(x)
#Clustering Results
# fit_predict is called on the same data again; whatever it returns is stored here.
indentified_clusters = kmeans.fit_predict(x)
indentified_clusters
# The next line looks like pasted console output rather than code.
array([1,1,0.0,2])
data_with_clusters = data.copy()
# NOTE(review): this reads `identified_clusters`, but the name assigned above is
# spelled `indentified_clusters`.
data_with_clusters['Clusters'] = identified_clusters
# NOTE(review): there is no comma between the third positional argument and `c=`.
plt.scatter(data_with_clusters['Population'],data_with_clusters['Longitude'],data_with_clusters['Latitude']c=data_with_clusters['Clusters'],cmap='rainbow')
Try the following :
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans

# Load the data set ("xxx" is a placeholder for the real CSV path) and preview it.
data = pd.read_csv("xxx")
print(data.head())

# Quick look at the raw coordinates before clustering.
plt.scatter(data['Longitude'], data['Latitude'])
plt.xlim(-180, 180)
plt.ylim(-90, 90)
plt.show()

# Feature matrix selected by position: all rows, columns 1 and 2.
# NOTE(review): assumes those two columns are the ones to cluster on — confirm
# against the CSV's column order.
x = data.iloc[:, 1:3]

# Fit k-means with three clusters and keep the label assigned to each row.
# fit_predict fits the model and returns the labels in one call; without this
# assignment the 'Clusters' column below would reference an undefined name.
kmeans = KMeans(3)
identified_clusters = kmeans.fit_predict(x)

# Attach the labels to a copy so the original frame stays untouched.
data_with_clusters = data.copy()
data_with_clusters['Clusters'] = identified_clusters

# Same scatter as before, now coloured by cluster label.
plt.scatter(
    data_with_clusters['Longitude'],
    data_with_clusters['Latitude'],
    c=data_with_clusters['Clusters'],
    cmap='rainbow',
)
plt.show()

Creating scatter plot

Can someone help me create a scatter plot? I have written the following code; however, it does not produce the scatter plot that I expected, as all the data are concentrated at only 3 values of the x-variable.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew
from warnings import filterwarnings
# Silence all warnings for the rest of the script.
filterwarnings('ignore')
df_transactions = pd.read_csv('transactions.csv')
# Total revenue per distinct days_after_open value; the result is indexed by
# days_after_open, not by the original row labels.
daily_revenue= df_transactions.groupby("days_after_open").sum()['revenue']
# NOTE(review): assigning a Series to a column aligns on index labels. Here the
# Series is indexed by days_after_open while the frame presumably still has its
# default row index, so values may land on unrelated rows (or become NaN) — confirm.
df_transactions["daily_revenue"] = daily_revenue
x = df_transactions["days_after_open"]
y = df_transactions["daily_revenue"]
# Semi-transparent markers so overlapping points remain visible.
plt.scatter(x,y,alpha=0.2)
plt.xlabel("Days After Open (days)")
plt.ylabel("Daily Reveue ($)")
# Write the figure to a file named "plot" (no extension given).
plt.savefig("plot")
dataframe image
Please define 'daily_revenue' as follows before moving on to the scatter plot:
y = df_transactions["daily_revenue"]

seaborn mixing of plots

I'm having trouble creating this plot in spyder:
import seaborn as sns
import pandas as pd
from pandas.api.types import CategoricalDtype
# Load seaborn's "diamonds" example data set and work on a copy of it.
diamonds= sns.load_dataset("diamonds")
df=diamonds.copy()
# Labels for the "cut" column, in the order they should be ranked.
cut_Kategoriler=["Fair","Good","Very Good","Premium","Ideal"]
# Convert "cut" to an ordered categorical that follows that label order.
df.cut=df.cut.astype(CategoricalDtype(categories = cut_Kategoriler,ordered=True))
print(df.head())
# Two plotting calls issued back to back with nothing between them; according
# to the question, their output ends up overlapping.
sns.catplot(x="cut",y="price",data=df)
sns.barplot(x="cut",y="price",hue="color",data=df)
I want to create two plots, but these plots overlap. How can I separate the graphics produced by the last two lines?
You need to import matplotlib.pyplot as plt and then add plt.show() after each of the two plots.
The modified code is added below:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt # Import Matplotlib
from pandas.api.types import CategoricalDtype
# Fetch seaborn's "diamonds" example data and keep the original untouched.
diamonds = sns.load_dataset("diamonds")
df = diamonds.copy()

# Re-encode "cut" as an ordered categorical ("Fair" lowest, "Ideal" highest).
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
ordered_cut = CategoricalDtype(categories=cut_order, ordered=True)
df["cut"] = df["cut"].astype(ordered_cut)
print(df.head())

# First plot: draw it and show it before anything else is drawn.
sns.catplot(data=df, x="cut", y="price")
plt.show()

# Second plot: drawn only after the first one has been shown.
sns.barplot(data=df, x="cut", y="price", hue="color")
plt.show()

How to define the Quartile range for multivariable and plot the box plot

How to plot the Outliers with Box plot for the below data
no,store_id,revenue,profit,state,country
0,101,779183,281257,WD,India
1,101,144829,838451,WD,India
2,101,766465,757565,AL,Japan
The code is below; it goes as far as converting the data with StandardScaler (anyone can choose MinMaxScaler instead). After that, how do I define the quartile range to identify outliers?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Read the store data; index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(r'anomaly.csv',index_col=False);
# One-hot encode the non-numeric columns (presumably state and country) so the frame is all numeric.
df1 = pd.get_dummies(data=df)
# Standardise every column of the encoded frame; the scaled result is kept in df2.
df2 = StandardScaler().fit_transform(df1)
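By convention, the box in a box-and-whisker plot spans the 25th to 75th percentiles of the data (the interquartile range, IQR), with a line at the median.
These quartiles are calculated automatically from the data you provide; by default, points lying more than 1.5 × IQR beyond the box are drawn as outliers.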
For example, for the following data:
no,store_id,revenue,profit,state,country
0,101,779183,281257,WD,India
1,101,144829,838451,WD,India
2,101,766465,757565,AL,Japan
2,101,1000000,757565,AL,Italy
You can display the boxplot as follows for the revenue column:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Read the store data; index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(r'anomaly.csv', index_col=False)

# The one-hot encoding and scaling steps from the question are omitted here:
# their results were never used by the plot below.
revenue = df['revenue']

# Quartile range and the 1.5 * IQR fences that the box plot applies by default:
# anything outside [lower_fence, upper_fence] is drawn as an outlier.
q1 = revenue.quantile(0.25)
q3 = revenue.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
outliers = revenue[(revenue < lower_fence) | (revenue > upper_fence)]
print(outliers)

# Draw outlier markers as green diamonds so they stand out.
green_diamond = dict(markerfacecolor='g', marker='D')
fig1, ax1 = plt.subplots()
ax1.set_title('Box plot')
ax1.boxplot(revenue, flierprops=green_diamond)
plt.show()
The outlier is displayed:

Unable to get Regression line and the variance bounds in Seaborn pairplot

I am unable to get regression line and the variance bounds around it while plotting seaborn.pairplot with kind=reg as shown in the examples at http://seaborn.pydata.org/generated/seaborn.pairplot.html
import pandas pd
import seaborn as sns
import numpy as np
import matplotlib as plt
# Prepare a random DataFrame with two columns: random x values and their lag-1 values
lst1 = list(np.random.rand(10000))
df = pd.DataFrame({'x1':lst1})
df['x2'] = df['x1'].shift(1)
# Keep rows where x2 > 0; the first row's x2 is NaN after the shift, and NaN > 0
# is False, so that row is dropped here.
df = df[df['x2'] > 0]
# Plotting now
pplot = sns.pairplot(df, kind="reg")
# Set the y and x limits to the observed range of x1 (presumably applied to every subplot).
pplot.set(ylim=(min(df['x1']), max(df['x1'])))
pplot.set(xlim=(min(df['x1']), max(df['x1'])))
# NOTE(review): `plt` is whatever this snippet's import bound it to; here that is the
# top-level matplotlib package rather than matplotlib.pyplot — confirm show() exists on it.
plt.show()
The regression line is there, you just don't see it, because it's hidden by the unnaturally high number of points in the plot.
So let's reduce the number of points and you'll see the regression as expected.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Two-column frame: 100 random draws (x1) and the same series lagged by one row (x2).
df = pd.DataFrame({"x1": list(np.random.rand(100))})
df["x2"] = df["x1"].shift(1)

# The first row has no lagged value (NaN); NaN > 0 is False, so this mask drops it.
has_positive_lag = df["x2"] > 0
df = df[has_positive_lag]

# Pairwise plots with a fitted regression line on the off-diagonal panels.
pplot = sns.pairplot(df, kind="reg")
plt.show()

Categories