I have written a program like so:
# Author: Evan Gertis
# Date : 11/09
# program: Linear Regression
# Resource: https://seaborn.pydata.org/generated/seaborn.scatterplot.html
import seaborn as sns
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Step 1: load the data
grades = pd.read_csv("grades.csv")
logging.info(grades.head())
# Step 2: plot the data
plot = sns.scatterplot(data=grades, x="Hours", y="GPA")
fig = plot.get_figure()
fig.savefig("out.png")
Using the data set
Hours,GPA,Hours,GPA,Hours,GPA
11,2.84,9,2.85,25,1.85
5,3.20,5,3.35,6,3.14
22,2.18,14,2.60,9,2.96
23,2.12,18,2.35,20,2.30
20,2.55,6,3.14,14,2.66
20,2.24,9,3.05,19,2.36
10,2.90,24,2.06,21,2.24
19,2.36,25,2.00,7,3.08
15,2.60,12,2.78,11,2.84
18,2.42,6,2.90,20,2.45
I would like to plot all of the relationships; at the moment I only get one plot:
Expected:
all relationships plotted
Actual:
I wrote a basic program and I was expecting all of the relationships to be plotted.
The origin of the problem is that several columns in your file share the same name, so when pandas reads the file it appends a numeric suffix to the duplicated column names in the loaded DataFrame:
import seaborn as sns
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
grades = pd.read_csv("grades.csv")
print(grades.columns)
>>> Index(['Hours', 'GPA', 'Hours.1', 'GPA.1', 'Hours.2', 'GPA.2'], dtype='object')
Therefore, when you draw the scatter plots you need to use the column names that pandas assigned:
# in case you want all scatter plots in the same figure
plot = sns.scatterplot(data=grades, x="Hours", y="GPA", label='GPA')
sns.scatterplot(data=grades, x='Hours.1', y='GPA.1', ax=plot, label="GPA.1")
sns.scatterplot(data=grades, x='Hours.2', y='GPA.2', ax=plot, label='GPA.2')
fig = plot.get_figure()
fig.savefig("out.png")
There are better options than manually creating a plot for each group of columns
Because the columns in the file have redundant names, pandas automatically renames them.
Imports and DataFrame
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# read the data from the file
df = pd.read_csv('d:/data/gpa.csv')
# display(df)
Hours GPA Hours.1 GPA.1 Hours.2 GPA.2
0 11 2.84 9 2.85 25 1.85
1 5 3.20 5 3.35 6 3.14
2 22 2.18 14 2.60 9 2.96
3 23 2.12 18 2.35 20 2.30
4 20 2.55 6 3.14 14 2.66
5 20 2.24 9 3.05 19 2.36
6 10 2.90 24 2.06 21 2.24
7 19 2.36 25 2.00 7 3.08
8 15 2.60 12 2.78 11 2.84
9 18 2.42 6 2.90 20 2.45
Option 1: Chunk the column names
This option can be used to plot the data in a loop without manually creating each plot
Applying the chunking approach from the answer to "How to iterate over a list in chunks" creates a list of column-name groups:
[Index(['Hours', 'GPA'], dtype='object'), Index(['Hours.1', 'GPA.1'], dtype='object'), Index(['Hours.2', 'GPA.2'], dtype='object')]
# create groups of column names to be plotted together
def chunker(seq, size):
    """Split *seq* into consecutive slices of at most *size* items.

    Args:
        seq: Any sliceable object with a length (list, tuple, str,
            ``pandas.Index``, ...).
        size: Positive number of items per chunk.

    Returns:
        A list of slices of ``seq`` in their original order. The last slice
        is shorter when ``len(seq)`` is not a multiple of ``size``.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    if size < 1:
        # range() already rejects a step of 0; a negative step would silently
        # produce no chunks at all, so fail loudly in both cases.
        raise ValueError("size must be a positive integer")
    return [seq[pos:pos + size] for pos in range(0, len(seq), size)]
# Build the (x, y) column-name pairs once: consecutive columns belong together.
col_list = chunker(df.columns, 2)

# Draw one scatter layer per pair; the y-column name doubles as the legend
# label so the groups can be told apart.
for x, y in col_list:
    sns.scatterplot(data=df, x=x, y=y, label=y)
Option 2: Fix the data
# filter each group of columns, melt the result into a long form, and get the value
h = df.filter(like='Hours').melt().value
g = df.filter(like='GPA').melt().value
# get the gpa column names
gpa_cols = df.columns[1::2]
# use numpy to create a list of labels with the appropriate length
labels = np.repeat(gpa_cols, len(df))
# otherwise use a list comprehension to create the labels
# labels = [v for x in gpa_cols for v in [x]*len(df)]
# create a new dataframe
dfl = pd.DataFrame({'hours': h, 'gpa': g, 'label': labels})
# save dfl if desired
dfl.to_csv('gpa_long.csv', index=False)
# plot
sns.scatterplot(data=dfl, x='hours', y='gpa', hue='label')
Plot Result
Related
Reason why I am loading the df from the .csv is because another file creates the csv and then this file will access it (maybe this is an issue? not sure)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('MAIN_DATAFRAME.csv')
def plot_graph_1(MAIN_DATAFRAME):
df1 = MAIN_DATAFRAME.loc[['Bots']]
df1 = df1.transpose()
df2 = MAIN_DATAFRAME.loc[['Speed']]
df2 = df2.transpose()
df3 = MAIN_DATAFRAME.loc[['Weight']]
df3 = df3.transpose()
df4 = MAIN_DATAFRAME.loc[['Chargers']]
df4 = df4.transpose()
ax = df1.plot(kind='bar')
df2.plot(ax=ax, kind='bar')
df3.plot(ax=ax,kind='bar')
df4.plot(ax=ax, kind='bar')
ax.bar(ax, df1)
plt.show()
plot_graph_1(df)
So I would like to plot this DataFrame; ideally the bar charts will share an axis and have different colors so that they can be distinguished when stacked on each other.
btw here is the dataframe:
Run 1
Run 2
Run 3
Run 4
Run 5
Run 6
Run 7
Run 8
Run 9
Run 10
Bots
5
6
7
8
9
10
11
12
13
14
Speed
1791
2359
2996
3593
4105
4551
4631
4656
4672
4674
Weight
612
733
810
888
978
1059
1079
1085
1090
1092
Chargers
10
10
10
10
10
10
10
10
10
10
I tried changing how I access the dataframe values. I also tried changing brackets from: df2 = MAIN_DATAFRAME.loc[['Speed']] to df2 = MAIN_DATAFRAME.loc['Speed'] and still get a key error.
You can transpose the whole DataFrame and then you can plot it like this:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
# Read data from CSV
df = pd.read_csv(
"3.csv",
index_col=0
)
# Define plotting function
def plot_bars_from_df(df: pd.DataFrame, *, logy: bool = False) -> plt.Axes:
    """Plot a grouped bar chart with one x-axis group per column of ``df``.

    The frame is transposed before plotting, so the original column labels
    become the x-axis groups and each original row becomes one bar series.

    Args:
        df: Frame to plot; the caller's frame is not modified.
        logy: Forwarded to ``DataFrame.plot``. When true the y-axis is
            logarithmic, which keeps rows that differ by orders of magnitude
            readable on a single chart.

    Returns:
        The matplotlib axes the bars were drawn on.
    """
    return df.transpose().plot(kind="bar", logy=logy)
# Call function
plot_bars_from_df(df)
You'll get the following output
However, "Bots" and "Chargers" are a few orders of magnitude smaller than the other columns, so it doesn't make much sense to plot them together.
I want to merge two plots, that is my dataframe:
df_inc.head()
id date real_exe_time mean mean+30% mean-30%
0 Jan 31 33.14 43.0 23.0
1 Jan 30 33.14 43.0 23.0
2 Jan 33 33.14 43.0 23.0
3 Jan 38 33.14 43.0 23.0
4 Jan 36 33.14 43.0 23.0
My first plot:
df_inc.plot.scatter(x = 'date', y = 'real_exe_time')
Then
My second plot:
df_inc.plot(x='date', y=['mean','mean+30%','mean-30%'])
When I try to merge with:
fig=plt.figure()
ax = df_inc.plot(x='date', y=['mean','mean+30%','mean-30%']);
df_inc.plot.scatter(x = 'date', y = 'real_exe_time', ax=ax)
plt.show()
I got the following:
How can I merge them the right way?
You should not repeat your mean values as an extra column. df.plot() for categorical data will be plotted against the index - hence you will see the original scatter plot (also plotted against the index) squeezed into the left corner.
You could create instead an additional aggregation dataframe that you can plot then into the same graph:
import matplotlib.pyplot as plt
import pandas as pd
#test data generation
import numpy as np
n = 30
np.random.seed(123)
df = pd.DataFrame({
    "date": np.random.choice(list("ABCDEF"), n),
    "real_exe_time": np.random.randint(1, 100, n),
})
# Sort by category and renumber the rows 0..n-1. A bare .reindex() returns the
# frame unchanged, so reset_index(drop=True) is used to actually rebuild the index.
df = df.sort_values(by="date").reset_index(drop=True)

# aggregate data for plotting: one row per date with its mean and a +/-30% band
df_agg = df.groupby("date")["real_exe_time"].agg(mean="mean").reset_index()
df_agg["mean+30%"] = df_agg["mean"] * 1.3
df_agg["mean-30%"] = df_agg["mean"] * 0.7

# plot both into the same subplot: the scatter creates the axes, the lines reuse them
ax = df.plot.scatter(x='date', y='real_exe_time')
df_agg.plot(x='date', y=['mean', 'mean+30%', 'mean-30%'], ax=ax)
plt.show()
Sample output:
You could also consider using seaborn that has, for instance, pointplots for categorical data aggregation.
I'm guessing that you haven't transformed the date column to datetime objects, so the first thing you should do is this:
# Transform the abbreviated month names (e.g. "Jan") to datetime objects
df_inc['date'] = pd.to_datetime(df_inc['date'], format='%b')

# Draw the three reference lines first and reuse their axes for the scatter.
# No plt.figure() beforehand: DataFrame.plot opens its own figure when no
# ``ax`` is passed, so a pre-made figure would only show up as an empty window.
ax = df_inc.plot(x='date', y=['mean', 'mean+30%', 'mean-30%'])
df_inc.plot.scatter(x='date', y='real_exe_time', ax=ax)
plt.show()
I want to make a boxplot whose x-axis is the x variable split into ranges, e.g. 0-5, 5-10, 10+. Is there a way to do this efficiently in Matplotlib/Seaborn without having to create new, unevenly sized columns by subsetting? For the example dataset below I want a boxplot with 3 boxes based on the rot_bonds variable: 0-5 (1a4j, 1a6u, 1ahc), 5-10 (1brq, 1bya), 10+ (1bya, 1bbs).
structure rot_bonds no_atoms logP
0 1a4j 3 37 2.46
1 1a6u 4 17 1.58
2 1ahc 0 10 -0.06
3 1bbs 20 51 4.81
4 1brq 5 21 5.51
5 1bya 10 45 -9.75
Thanks in advance.
With seaborn you could use the slicing into ranges as the x axis, and for example 'no_atoms' as the y-values for the boxplot:
from matplotlib import pyplot as plt
from io import StringIO
import pandas as pd
import seaborn as sns
s = ''' structure rot_bonds no_atoms logP
0 1a4j 3 37 2.46
1 1a6u 4 17 1.58
2 1ahc 0 10 -0.06
3 1bbs 20 51 4.81
4 1brq 5 21 5.51
5 1bya 10 45 -9.75'''
# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
# delim_whitespace=True.
df = pd.read_csv(StringIO(s), sep=r'\s+')

# Left-closed bins [0, 5), [5, 10), [10, inf). With the default right-closed
# bins a value of 0 lies outside (0, 5] and that row would silently be dropped
# from the plot, and 5 would land in the first range instead of the second.
rot_bond_range = pd.cut(
    df['rot_bonds'],
    bins=[0, 5, 10, float('inf')],
    right=False,
    labels=['0-5', '5-10', '10+'],
)
sns.boxplot(x=rot_bond_range, y='no_atoms', data=df)
plt.show()
How can I make a distplot with seaborn to only have whole numbers?
My data is an array of numbers between 0 and ~18. I would like to plot the distribution of the numbers.
Impressions
0 210
1 1084
2 2559
3 4378
4 5500
5 5436
6 4525
7 3329
8 2078
9 1166
10 586
11 244
12 105
13 51
14 18
15 5
16 3
dtype: int64
Code I'm using:
# Draw the distribution of Impressions with black bar outlines. The explicit
# integer bins and kde=False are present below but commented out.
sns.distplot(Impressions,
# bins=np.arange(Impressions.min(), Impressions.max() + 1),
# kde=False,
axlabel=False,
hist_kws={'edgecolor':'black', 'rwidth': 1})
# NOTE(review): this assigns a range to the name plt.xticks instead of calling
# plt.xticks(...), so no tick positions are set here and the function is
# replaced for any later use.
plt.xticks = range(current.Impressions.min(), current.Impressions.max() + 1, 1)
Plot looks like this:
What I'm expecting:
The xlabels should be whole numbers
Bars should touch each other
The kde line should simply connect the top of the bars. By the looks of it, the current one assumes to have 0s between (x, x + 1), hence why the downward spike (This isn't required, I can turn off kde)
Am I using the correct tool for the job or distplot shouldn't be used for whole numbers?
Your problem can be solved with the code below:
import seaborn as sns # for data visualization
import numpy as np # for numeric computing
import matplotlib.pyplot as plt # for data visualization
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
# One unit-wide bin per whole number, centred on it (edges at k - 0.5 and
# k + 0.5). Passing ``arr`` itself as the edges would give only len(arr) - 1
# bins and count the two largest values together in the last one.
bin_edges = np.arange(arr.min(), arr.max() + 2) - 0.5
# NOTE: distplot is deprecated in recent seaborn releases;
# sns.histplot(arr, discrete=True) is the replacement for whole-number data.
sns.distplot(arr, bins=bin_edges, kde=False)
# one tick per whole number in the data range
plt.xticks(np.arange(arr.min(), arr.max() + 1))
plt.show()
enter image description here
In this way, you can plot histogram using seaborn sns.distplot() function.
Note: whatever data you pass to bins and plt.xticks() should be in ascending order.
I would like to add a moving average calculation to my exchange time series.
Original data from Quandl
Exchange = Quandl.get("BUNDESBANK/BBEX3_D_SEK_USD_CA_AC_000",
authtoken="xxxxxxx")
# Value
# Date
# 1989-01-02 6.10500
# 1989-01-03 6.07500
# 1989-01-04 6.10750
# 1989-01-05 6.15250
# 1989-01-09 6.25500
# 1989-01-10 6.24250
# 1989-01-11 6.26250
# 1989-01-12 6.23250
# 1989-01-13 6.27750
# 1989-01-16 6.31250
# Calculating Moving Avarage
MovingAverage = pd.rolling_mean(Exchange,5)
# Value
# Date
# 1989-01-02 NaN
# 1989-01-03 NaN
# 1989-01-04 NaN
# 1989-01-05 NaN
# 1989-01-09 6.13900
# 1989-01-10 6.16650
# 1989-01-11 6.20400
# 1989-01-12 6.22900
# 1989-01-13 6.25400
# 1989-01-16 6.26550
I would like to add the calculated Moving Average as a new column to the right after Value using the same index (Date). Preferably I would also like to rename the calculated moving average to MA.
The rolling mean returns a Series; you only have to add it as a new column (MA) of your DataFrame, as described below.
For information, the rolling_mean function has been deprecated in pandas newer versions. I have used the new method in my example, see below a quote from the pandas documentation.
Warning Prior to version 0.18.0, pd.rolling_*, pd.expanding_*, and pd.ewm* were module level functions and are now deprecated. These are replaced by using the Rolling, Expanding and EWM. objects and a corresponding method call.
# Take the rolling mean of the single 'Value' column: that yields a Series,
# which is what a new column expects. Rolling over the whole DataFrame returns
# a DataFrame and stops working as soon as the frame has more than one column.
df['MA'] = df['Value'].rolling(window=5).mean()
print(df)
# Value MA
# Date
# 1989-01-02 6.11 NaN
# 1989-01-03 6.08 NaN
# 1989-01-04 6.11 NaN
# 1989-01-05 6.15 NaN
# 1989-01-09 6.25 6.14
# 1989-01-10 6.24 6.17
# 1989-01-11 6.26 6.20
# 1989-01-12 6.23 6.23
# 1989-01-13 6.28 6.25
# 1989-01-16 6.31 6.27
A moving average can also be calculated and visualized directly in a line chart by using the following code:
Example using stock price data:
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import datetime
plt.style.use('ggplot')

# Input variables
# (day written as 1: a leading-zero literal such as 01 is a syntax error in Python 3)
start = datetime.datetime(2016, 1, 1)
end = datetime.datetime(2018, 3, 29)
stock = 'WFC'

# Extracting data
# NOTE(review): the 'morningstar' source may no longer be offered by current
# pandas-datareader releases — confirm and substitute a supported source.
df = web.DataReader(stock, 'morningstar', start, end)
df = df['Close']
print(df)

# Close price together with its 9- and 21-day moving averages on one chart
plt.plot(df[stock], label='Close')
plt.plot(df[stock].rolling(9).mean(), label='MA 9 days')
plt.plot(df[stock].rolling(21).mean(), label='MA 21 days')
plt.legend(loc='best')
plt.title('Wells Fargo\nClose and Moving Averages')
plt.show()
Tutorial on how to do this: https://youtu.be/XWAPpyF62Vg
In case you are calculating more than one moving average:
# One rolling-mean column per window length 2..9. Each mean is taken from the
# 'Value' column only (assumes the price column is named 'Value', as in the
# question's data), so MA columns added by earlier iterations are not fed back
# into later ones; rolling over the whole frame fails once it holds more than
# one column.
for i in range(2, 10):
    df['MA{}'.format(i)] = df['Value'].rolling(window=i).mean()
Then you can do an aggregate average of all the MA
df[[f for f in list(df) if "MA" in f]].mean(axis=1)
To get a cumulative (running) average in pandas we can use cumsum and then divide by the running count; a fixed-window moving average is available through rolling(...).mean().
Here is the working example:
import pandas as pd
import numpy as np
# Demo frame: five ids with values 100, 200, ..., 500.
df = pd.DataFrame({'id': range(5), 'value': range(100, 600, 100)})

# Derived columns, added left to right so later ones can read earlier ones:
#   cum_sum       - running total of 'value'
#   count         - 1-based number of rows seen so far
#   mov_avg       - running total divided by that count
#   rolling_mean2 - mean over a sliding window of two rows
df = df.assign(
    cum_sum=lambda frame: frame['value'].cumsum(),
    count=lambda frame: range(1, len(frame['value']) + 1),
    mov_avg=lambda frame: frame['cum_sum'] / frame['count'],
    rolling_mean2=lambda frame: frame['value'].rolling(window=2).mean(),
)
print(df)
output
id value cum_sum count mov_avg rolling_mean2
0 0 100 100 1 100.0 NaN
1 1 200 300 2 150.0 150.0
2 2 300 600 3 200.0 250.0
3 3 400 1000 4 250.0 350.0
4 4 500 1500 5 300.0 450.0