How to get rid of weird stray lines in scatterplot - python

I'm dealing with the well-known Gapminder data file (here:
https://www.kaggle.com/datasets/tklimonova/gapminder-datacamp-2007?select=gapminder_full.csv)
df.head():
country year population continent life_exp gdp_cap
0 Afghanistan 2007 31889923 Asia 43.828 974.580338
1 Albania 2007 3600523 Europe 76.423 5937.029526
2 Algeria 2007 33333216 Africa 72.301 6223.367465
3 Angola 2007 12420476 Africa 42.731 4797.231267
4 Argentina 2007 40301927 Americas 75.320 12779.379640
I tried a scatter plot but get confused by the many lines appearing on the plot:
# Use the seaborn look for the figure.
plt.style.use('seaborn')
# Columns as NumPy arrays, in DataFrame row order (the rows shown above are
# ordered by country, not by gdp_cap).
x = np.array(df['gdp_cap'])
y = np.array(df['life_exp'])
plt.scatter(x, y, marker = 'o', alpha = 1)
# Least-squares fit of a degree-2 polynomial; coefficients are used highest power first.
coeff = np.polyfit(x, y, 2)
# Line plot of the fitted values at the same x positions, in the same row order.
plt.plot(x, coeff[0]*(x**2) + coeff[1]*x + coeff[2])
plt.show()
What am I doing wrong?

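Your second plot is drawn over the first one. Note that plt.plot joins points in array order, so a curve evaluated on unsorted x values shows up as many criss-crossing segments; evaluating it on np.sort(x) avoids that. To keep the scatter and the fit in separate figures, add another plt.show():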
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# df is the Gapminder DataFrame from the question.
# NOTE(review): Matplotlib deprecated the built-in 'seaborn' style name in 3.6
# and newer releases ship it as 'seaborn-v0_8' - confirm against the
# installed version.
plt.style.use('seaborn')
x = np.array(df['gdp_cap'])
y = np.array(df['life_exp'])

# First figure: the raw data points.
plt.scatter(x, y, marker = 'o', alpha = 1)
plt.show()

# Second figure: the fitted degree-2 polynomial.
coeff = np.polyfit(x, y, 2)
# plt.plot joins consecutive points with line segments, so the curve has to be
# evaluated on sorted x values; in row order it zig-zags across the plot.
x_fit = np.sort(x)
plt.plot(x_fit, np.polyval(coeff, x_fit))
plt.show()
Output:

Related

How to generate 2-yaxis graphs on a panel data per id?

I have a dataset, df that looks like this:
Date
Code
City
State
Quantity x
Quantity y
Population
Cases
Deaths
2019-01
10001
Los Angeles
CA
445
0
0
2019-01
10002
Sacramento
CA
4450
556
0
0
2020-03
12223
Houston
TX
440
4440
35000000
23
11
...
...
...
...
...
...
...
...
...
2021-07
10002
Sacramento
CA
3220
NA
5444000
211
22
My start and end date are the same for all cities. I have over 4000 different cities, and would like to plot a 2-yaxis graph for each city, using something similar to the following code:
import matplotlib.pyplot as plt
# One figure with a primary (left) y-axis.
fig, ax1 = plt.subplots(figsize=(9,9))
color = 'tab:red'
ax1.set_xlabel('Date')
ax1.set_ylabel('Quantity X', color=color)
# Only the y values are passed, so the x positions come from the DataFrame index.
ax1.plot(df['Quantity x'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# Second y-axis on the right-hand side, sharing the x-axis with ax1.
ax2 = ax1.twinx()
color2 = 'tab:blue'
ax2.set_ylabel('Deaths', color=color2)
ax2.plot(df['Deaths'], color=color2)
ax2.tick_params(axis='y', labelcolor=color2)
plt.show()
I would like to create a loop so that the code above runs for every Code that is related to a City, with quantity x and deaths, and it saves each graph made into a folder. How can I create a loop that does that, and stops every different Code?
Observations: Some values in df['Quantity x'] and df['Population'] are left blank.
If I understood you correctly, you are looking for a filtering functionality:
import matplotlib.pyplot as plt
import pandas as pd
def plot_quantity_and_death(df, out_dir="."):
    """Plot 'Quantity x' and 'Deaths' of one city on twin y-axes and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single city code. Must contain the columns
        'Date', 'Code', 'Quantity x' and 'Deaths'.
    out_dir : str, optional
        Folder the image is written to; it is created when missing.
        Defaults to the current working directory, where the file was
        written before this parameter existed.

    The figure is saved as ``Code_<code>.png``, where ``<code>`` is the
    'Code' value of the first row, and is closed afterwards.
    """
    import os

    fig, ax1 = plt.subplots(figsize=(9, 9))
    color = 'tab:red'
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Quantity X', color=color)
    # Plot against the Date column so the x-axis matches its label; without
    # explicit x values the row index of the filtered frame would be used.
    ax1.plot(df['Date'], df['Quantity x'], color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second y-axis on the right-hand side, sharing the x-axis with ax1.
    ax2 = ax1.twinx()
    color2 = 'tab:blue'
    ax2.set_ylabel('Deaths', color=color2)
    ax2.plot(df['Date'], df['Deaths'], color=color2)
    ax2.tick_params(axis='y', labelcolor=color2)

    # Save and close this exact figure so that looping over many cities does
    # not keep every figure open.
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(os.path.join(out_dir, f"Code_{df['Code'].iloc[0]}.png"))
    plt.close(fig)
df = pd.DataFrame()  # this needs to be replaced by your dataset

# One plot per city code. groupby hands over the rows of each code directly
# and leaves out rows whose Code is missing; sort=False keeps the codes in
# order of first appearance.
for _, city_df in df.groupby('Code', sort=False):
    plot_quantity_and_death(city_df)

How can I plot a secondary y-axis with seaborn's barplot?

I'm trying to plot the data (see below), with company_name on the x-axis, status_mission_2_y on one y-axis and percentage on the other y-axis. I have tried using the twinx() function but I can't get it to work.
Please can you help? Thanks in advance!
def twinplot(data):
    """Draw columns 1 and 2 of *data* as bar plots on twin y-axes.

    Column 0 supplies the x categories, column 1 is drawn on the left axis
    and column 2 on a second axis created with ``twinx()``.
    """
    x_ = data.columns[0]
    y_ = data.columns[1]
    y_2 = data.columns[2]
    data1 = data[[x_, y_]]
    data2 = data[[x_, y_2]]
    plt.figure(figsize=(15, 8))
    ax = sns.barplot(x=x_, y=y_, data=data1)
    ax2 = ax.twinx()
    g2 = sns.barplot(x=x_, y=y_2, data=data2, ax=ax2)
    plt.show()


data = ten_company_missions_failed
twinplot(data)
company_name  percentage   status_mission_2_y
EER           1            1
Ghot          1            1
Trv           1            1
Sandia        1            1
Test          1            1
US Navy       0.823529412  17
Zed           0.8          5
Gov           0.75         4
Knight        0.666666667  3
Had           0.666666667  3
Seaborn draws the two bar plots in the same color and at the same x-positions.
The following example code shrinks the bar widths, moving the bars belonging to ax to the left and the bars of ax2 to the right. To differentiate the right bars, semi-transparency (alpha=0.7) and hatching are used.
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import pandas as pd
import seaborn as sns
from io import StringIO
# Sample data from the question. "US Navy" is quoted so that the space inside
# the name is not treated as a column separator.
data_str = '''company_name percentage status_mission_2_y
EER 1 1
Ghot 1 1
Trv 1 1
Sandia 1 1
Test 1 1
"US Navy" 0.823529412 17
Zed 0.8 5
Gov 0.75 4
Knight 0.666666667 3
Had 0.666666667 3'''
# sep=r'\s+' splits on runs of whitespace. It replaces delim_whitespace=True,
# which pandas has deprecated in favour of this spelling.
data = pd.read_csv(StringIO(data_str), sep=r'\s+')
# Column 0 holds the categories; columns 1 and 2 go on the left and right axis.
x_ = data.columns[0]
y_ = data.columns[1]
y_2 = data.columns[2]
data1 = data[[x_, y_]]
data2 = data[[x_, y_2]]

plt.figure(figsize=(15, 8))
ax = sns.barplot(x=x_, y=y_, data=data1)
width_scale = 0.45
# Narrow the left-axis bars. Their left edge is untouched, so each bar ends up
# in the left part of its slot.
for bar in ax.containers[0]:
    bar.set_width(bar.get_width() * width_scale)
# Values are fractions of 1, so 1 is shown as 100%.
ax.yaxis.set_major_formatter(PercentFormatter(1))

ax2 = ax.twinx()
sns.barplot(x=x_, y=y_2, data=data2, alpha=0.7, hatch='xx', ax=ax2)
# Narrow the right-axis bars as well, shifting them so that their right edge
# stays where it was; they then sit in the right part of each slot.
for bar in ax2.containers[0]:
    x = bar.get_x()
    w = bar.get_width()
    bar.set_x(x + w * (1 - width_scale))
    bar.set_width(w * width_scale)
plt.show()
A simpler alternative could be to combine a barplot on ax and a lineplot on ax2.
# Simpler variant: bars for the first value column on the left axis and a
# marked line for the second value column on the right axis.
plt.figure(figsize=(15, 8))
bar_ax = sns.barplot(data=data1, x=x_, y=y_)
bar_ax.yaxis.set_major_formatter(PercentFormatter(1))

line_ax = bar_ax.twinx()
line_style = dict(marker='o', color='crimson', lw=3)
sns.lineplot(data=data2, x=x_, y=y_2, ax=line_ax, **line_style)
plt.show()

Problem with linear regression and summarize

I would like to create a plot of my linear regression model that shows the bike sales for each year summed up into a single point, rather than as two separate points per year as it does now.
This is my code:
from sklearn.linear_model import LinearRegression
from sklearn import datasets, linear_model
## Does the purchase of bicycles increase or decrease?
# value_counts() inside the groupby yields one row per (Year, Product_Category) pair.
plot1 = df.groupby('Year')['Product_Category'].value_counts().rename('count').reset_index()
# Reshape both columns to column vectors of shape (n, 1).
x = plot1['Year'].values.reshape(-1, 1)
y = plot1['count'].values.reshape(-1, 1)
## linear regression ##
regr = linear_model.LinearRegression()
regr.fit(x, y)
# NOTE(review): x_test is not defined in this snippet - confirm where it comes from.
y_pred = regr.predict(x_test)
## plot ##
# NOTE(review): the line is drawn through the raw (x, y) points; y_pred is not used below.
plt.scatter(x, y, color='black')
plt.plot(x, y, color='blue', linewidth=3)
This is my plot:
As far as I can understand from your example, this may be a solution: replace value_counts with count.
Example data:
import pandas as pd
import matplotlib.pyplot as plt
# Small sample: 2019 appears twice, so the per-year aggregation is visible in the result.
df = pd.DataFrame({'Year': [ 2019, 2019, 2020, 2021], 'Product_Category': ['a', 'b', 'c', 'd']})
print(df)
Year Product_Category
0 2019 a
1 2019 b
2 2020 c
3 2021 d
The count will return:
# count() gives a single value per Year: the number of non-null Product_Category entries.
plot1 = df.groupby('Year')['Product_Category'].count().rename('count').reset_index()
print(plot1)
Year count
0 2019 2
1 2020 1
2 2021 1
# Collapse the data to one row per year before plotting.
yearly_counts = df.groupby('Year')['Product_Category'].count()
plot1 = yearly_counts.rename('count').reset_index()

# Column vectors of shape (n, 1) for the year and its count.
x = plot1['Year'].to_numpy().reshape(-1, 1)
y = plot1['count'].to_numpy().reshape(-1, 1)

# Points plus a line through them.
plt.scatter(x, y, color='black')
plt.plot(x, y, color='blue', linewidth=3)

python pandas bar plot another column text

max min mincount maxcount
0 12 10 1 6
1 21 14 1 6
2 34 19 1 6
3 6 20 1 4
4 8 22 1 4
5 41 23 1 4
This is a pandas DataFrame.
I want a plot like the one in this image:
enter image description here
The text labels are very important.
Here is my code:
# NOTE(review): max_count, min_count, max and min are not defined in this
# snippet; presumably they hold the four columns shown above - confirm. Unless
# reassigned elsewhere, max and min refer to the Python builtins.
df = pd.DataFrame({'maxcount': max_count, 'mincount': min_count, 'max': max, 'min': min})
# Bar chart of the two count columns.
ax = df[['maxcount', 'mincount']].plot(kind='bar')
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Create the dataframe from the question.
d = {'max': [12, 21, 34, 6, 8, 41], 'min': [10, 14, 19, 20, 22, 23],
     'mincount': [1, 1, 1, 1, 1, 1], 'maxcount': [6, 6, 6, 4, 4, 4]}
df = pd.DataFrame(d)

# Two views of the data: counts give the bar heights, max_min the label text.
counts = df[['maxcount', 'mincount']]
max_min = df[['max', 'min']]

# Plot the counts as grouped bars.
ax = counts.plot(kind='bar', colormap='Paired', figsize=(12, 4))

# Each bar gets two labels at its top: the max/min value and, in parentheses,
# the count itself. Every entry is (column position, x offset of the value
# label, x offset of the count label); the left bar of a group uses negative
# offsets and the right bar positive ones.
label_layout = ((0, -0.2, -0.1), (1, 0.1, 0.2))
for col, value_dx, count_dx in label_layout:
    pairs = zip(max_min.iloc[:, col], counts.iloc[:, col])
    for pos, (value, count) in enumerate(pairs):
        ax.annotate(f'{value:d}', (pos + value_dx, count),
                    va='bottom', ha='center', fontsize=10)
        ax.annotate(f'({count:d})', (pos + count_dx, count),
                    va='bottom', ha='center', fontsize=10)
This is the output:

Annotating scatter plot in groupby plot

I have a df like so:
ID Prcp NDVI Year
1 1.4 0.1 2000
1 2.3 0.4 2001
1 4.4 0.8 2002
1 0.4 0.1 2003
2 2.1 0.6 2000
2 1.2 0.4 2001
2 3.4 0.7 2002
2 2.8 0.5 2003
and I want to do a scatter plot of Prcp vs. NDVI for each unique ID. I then want to add a data label for Year for each particular point on the plot. I am trying to do it like this:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
df=pd.read_csv(r'H:\my_file.csv')
# Figures saved through pdf.savefig are written to this PDF file.
with PdfPages(r'H:\path_to_out\out.pdf') as pdf:
# One scatter plot per ID: i is the ID value, group holds its rows.
for i, group in df.groupby('ID'):
plot = group.plot(x=['Prcp'], y=['NDVI'], title='NDVI_' + str(i), kind='scatter').get_figure()
# NOTE(review): n, b and c are taken from the full df rather than from group,
# and the loop below reuses the name i - likely why the labels come out
# wrong; confirm.
n=df.Year
b=df.Prcp
c=df.NDVI
for i, txt in enumerate(n):
plt.annotate(txt, (b[i],c[i]), fontsize=2.5)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
ax = plt.gca()
ax.set_xlabel('Precipitation')
ax.set_ylabel('Mean NDVI')
pdf.savefig(plot, bbox_inches='tight', pad_inches=0)
plt.close(plot)
but this doesn't work correctly.
To do this for just 1 plot, with a df like this:
ID Prcp NDVI Year
1 1.4 0.1 2000
1 2.3 0.4 2001
1 4.4 0.8 2002
1 0.4 0.1 2003
I would do it like this:
# Coordinates (a, b) and label text (n) for every point.
a=df.Prcp
b=df.NDVI
n=df.Year
with PdfPages(r'H:\graph.pdf') as pdf:
plt.title('NDVI')
plt.xlabel('Prcp')
plt.ylabel('NDVI')
plt.scatter(df.Prcp,df.NDVI, facecolors='none', s=20, edgecolors='b')
# Label each point with its year. a[i] and b[i] look values up by index label,
# which matches enumerate's counter only for a default 0..n-1 index.
for i, txt in enumerate(n):
plt.annotate(txt, (a[i],b[i]), fontsize=2.5)
axes=plt.gca()
fig=plt.gcf()
pdf.savefig(fig)
plt.show()
EDIT:
I achieved it using this:
def label_point(Prcp, Mean, Year, ax):
    """Write each point's year next to it on *ax*.

    Parameters
    ----------
    Prcp, Mean, Year : pandas.Series
        x coordinates, y coordinates (the NDVI values) and label values.
        They are combined with ``pd.concat`` and therefore aligned on
        their index.
    ax : object with a ``text(x, y, s)`` method
        Receives one ``text`` call per point, e.g. a Matplotlib Axes.
    """
    # The y values arrive as ``Mean``; reading a name ``NDVI`` here would
    # fail because no such parameter exists.
    a = pd.concat({'Prcp': Prcp, 'NDVI': Mean, 'Year': Year}, axis=1)
    # itertuples keeps each column's own dtype, so integer years are printed
    # as "2000"; iterrows would upcast the row to float and print "2000.0".
    for point in a.itertuples(index=False):
        ax.text(point.Prcp, point.NDVI, str(point.Year))
label_point(group.Prcp, group.NDVI, group.Year, ax)
I achieved it like this:
def label_point(Prcp, NDVI, Year, ax):
    """Write each point's year next to it on *ax*.

    Prcp and NDVI are the x and y coordinates, Year supplies the label text;
    all three are pandas Series aligned on their index.
    """
    a = pd.concat({'Prcp': Prcp, 'NDVI': NDVI, 'Year': Year}, axis=1)
    # itertuples keeps the dtype of each column, so integer years are printed
    # as "2000" rather than "2000.0".
    for point in a.itertuples(index=False):
        ax.text(point.Prcp, point.NDVI, str(point.Year))


df = pd.read_csv(r'E:\path.csv')
# Each figure passed to pdf.savefig is written to this PDF file.
with PdfPages(r'E:\pth.pdf') as pdf:
    # NOTE(review): the snippet grouped a name `first` that is never defined
    # here; the frame loaded above is used instead - confirm.
    for i, group in df.groupby('ID'):
        fig, ax = plt.subplots()
        group.plot(x='Prcp', y='NDVI', title='NDVI_' + str(i),
                   kind='scatter', ax=ax)
        label_point(group.Prcp, group.NDVI, group.Year, ax)
        ax.set_xlabel('Precipitation')
        ax.set_ylabel('Mean NDVI')
        pdf.savefig(fig, bbox_inches='tight', pad_inches=0)
        # Close the figure once it is stored so figures do not pile up.
        plt.close(fig)

Categories