The reason I am loading the df from the .csv is that another file creates the CSV and this file then accesses it (maybe this is an issue? I'm not sure).
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('MAIN_DATAFRAME.csv')
def plot_graph_1(MAIN_DATAFRAME):
    """Plot Bots, Speed, Weight and Chargers as grouped bars, one group per run.

    Args:
        MAIN_DATAFRAME: frame with one row per metric and one column per run.
            The metric names may be the index, or (as happens when the CSV is
            read without ``index_col=0``) sit in the first column.

    Returns:
        The matplotlib Axes the bars were drawn on.

    Raises:
        KeyError: if any of the four metric rows cannot be found.
    """
    metrics = ['Bots', 'Speed', 'Weight', 'Chargers']
    frame = MAIN_DATAFRAME
    # Without index_col, read_csv leaves a default integer index and the
    # metric names end up in the first column, so .loc['Bots'] raises KeyError.
    if not set(metrics).issubset(frame.index):
        frame = frame.set_index(frame.columns[0])
    # After the transpose each run is a row (one x tick) and each metric is a
    # column, so a single plot call draws every metric in its own color.
    ax = frame.loc[metrics].transpose().plot(kind='bar')
    plt.show()
    return ax
plot_graph_1(df)
So I would like this DataFrame to be plotted; ideally the bar charts will share an axis and be different colors so that they can be distinguished when stacked on each other.
btw here is the dataframe:
Run 1
Run 2
Run 3
Run 4
Run 5
Run 6
Run 7
Run 8
Run 9
Run 10
Bots
5
6
7
8
9
10
11
12
13
14
Speed
1791
2359
2996
3593
4105
4551
4631
4656
4672
4674
Weight
612
733
810
888
978
1059
1079
1085
1090
1092
Chargers
10
10
10
10
10
10
10
10
10
10
I tried changing how I access the dataframe values. I also tried changing brackets from: df2 = MAIN_DATAFRAME.loc[['Speed']] to df2 = MAIN_DATAFRAME.loc['Speed'] and still get a key error.
You can transpose the whole DataFrame and then you can plot it like this:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
# Read data from CSV
df = pd.read_csv(
"3.csv",
index_col=0
)
# Define plotting function
def plot_bars_from_df(df: pd.DataFrame) -> plt.Axes:
    """Draw a grouped bar chart of ``df`` after swapping its rows and columns.

    The frame is transposed first, so each original column label becomes a
    tick on the x-axis and each original row becomes one bar series.
    Returns the Axes produced by the pandas plot call.
    """
    transposed = df.T
    return transposed.plot(kind="bar")
# Call function
plot_bars_from_df(df)
You'll get the following output
However, "Bots" and "Chargers" are a few orders of magnitude smaller than the other columns, so it doesn't make much sense to plot them together.
Related
I have written a program like so:
# Author: Evan Gertis
# Date : 11/09
# program: Linear Regression
# Resource: https://seaborn.pydata.org/generated/seaborn.scatterplot.html
import seaborn as sns
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Step 1: load the data
# NOTE: grades.csv repeats the "Hours" and "GPA" headers three times; pandas
# renames the duplicates on load (Hours.1, GPA.1, Hours.2, GPA.2).
grades = pd.read_csv("grades.csv")
logging.info(grades.head())
# Step 2: plot the data
# Only the first Hours/GPA column pair is referenced here, which is why a
# single relationship shows up in the saved figure.
plot = sns.scatterplot(data=grades, x="Hours", y="GPA")
# scatterplot returns an Axes; fetch its parent Figure to write it to disk.
fig = plot.get_figure()
fig.savefig("out.png")
Using the data set
Hours,GPA,Hours,GPA,Hours,GPA
11,2.84,9,2.85,25,1.85
5,3.20,5,3.35,6,3.14
22,2.18,14,2.60,9,2.96
23,2.12,18,2.35,20,2.30
20,2.55,6,3.14,14,2.66
20,2.24,9,3.05,19,2.36
10,2.90,24,2.06,21,2.24
19,2.36,25,2.00,7,3.08
15,2.60,12,2.78,11,2.84
18,2.42,6,2.90,20,2.45
I would like to plot all of the relationships; at the moment I just get one plot:
Expected:
all relationships plotted
Actual:
I wrote a basic program and I was expecting all of the relationships to be plotted.
The origin of the problem is that the column names in your file are repeated, so when pandas reads the file it appends a numeric suffix to the duplicated names in the loaded data frame:
import seaborn as sns
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
grades = pd.read_csv("grades.csv")
print(grades.columns)
>>> Index(['Hours', 'GPA', 'Hours.1', 'GPA.1', 'Hours.2', 'GPA.2'], dtype='object')
Therefore, when you draw the scatter plot you need to pass the column names that pandas assigned:
# Overlay every Hours/GPA column pair on one shared Axes, then save the figure.
plot = None  # ax=None makes the first call draw on the current Axes
for hours_col, gpa_col in (("Hours", "GPA"), ("Hours.1", "GPA.1"), ("Hours.2", "GPA.2")):
    # Later iterations reuse the Axes returned by the first call.
    plot = sns.scatterplot(data=grades, x=hours_col, y=gpa_col, ax=plot, label=gpa_col)
fig = plot.get_figure()
fig.savefig("out.png")
There are better options than manually creating a plot for each group of columns
Because the columns in the file have redundant names, pandas automatically renames them.
Imports and DataFrame
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# read the data from the file
df = pd.read_csv('d:/data/gpa.csv')
# display(df)
Hours GPA Hours.1 GPA.1 Hours.2 GPA.2
0 11 2.84 9 2.85 25 1.85
1 5 3.20 5 3.35 6 3.14
2 22 2.18 14 2.60 9 2.96
3 23 2.12 18 2.35 20 2.30
4 20 2.55 6 3.14 14 2.66
5 20 2.24 9 3.05 19 2.36
6 10 2.90 24 2.06 21 2.24
7 19 2.36 25 2.00 7 3.08
8 15 2.60 12 2.78 11 2.84
9 18 2.42 6 2.90 20 2.45
Option 1: Chunk the column names
This option can be used to plot the data in a loop without manually creating each plot
Using this answer from How to iterate over a list in chunks will create a list of column name groups:
[Index(['Hours', 'GPA'], dtype='object'), Index(['Hours.1', 'GPA.1'], dtype='object'), Index(['Hours.2', 'GPA.2'], dtype='object')]
# create groups of column names to be plotted together
def chunker(seq, size):
    """Split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: any sliceable sequence with a length (list, str, pandas Index...).
        size: number of items per chunk; the last chunk may be shorter.

    Returns:
        A list of slices of ``seq``, in order. Empty input gives ``[]``.

    Raises:
        ValueError: if ``size`` is not positive. A negative step would make
            the range empty and silently drop every item.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [seq[start:start + size] for start in range(0, len(seq), size)]
# function call: one group of [x, y] column names per pair of columns
col_list = chunker(df.columns, 2)
# iterate through each group of column names to plot
# (reuse col_list rather than chunking the columns a second time)
for x, y in col_list:
    sns.scatterplot(data=df, x=x, y=y, label=y)
Option 2: Fix the data
# filter each group of columns, melt the result into a long form, and get the value
h = df.filter(like='Hours').melt().value
g = df.filter(like='GPA').melt().value
# get the gpa column names
gpa_cols = df.columns[1::2]
# use numpy to create a list of labels with the appropriate length
labels = np.repeat(gpa_cols, len(df))
# otherwise use a list comprehension to create the labels
# labels = [v for x in gpa_cols for v in [x]*len(df)]
# create a new dataframe
dfl = pd.DataFrame({'hours': h, 'gpa': g, 'label': labels})
# save dfl if desired
dfl.to_csv('gpa_long.csv', index=False)
# plot
sns.scatterplot(data=dfl, x='hours', y='gpa', hue='label')
Plot Result
This question already has answers here:
Pandas dataframe groupby plot
(3 answers)
Saving plots (AxesSubPlot) generated from python pandas with matplotlib's savefig
(6 answers)
How to save a Seaborn plot into a file
(10 answers)
Closed 6 months ago.
I have a pandas dataframe as below:
Well Name
READTIME
WL
0
A
02-Jul-20
12
1
B
03-Aug-22
18
2
C
05-Jul-21
14
3
A
03-May-21
16
4
B
01-Jan-19
19
5
C
12-Dec-20
20
6
D
14-Nov-21
14
7
A
01-Mar-22
17
8
B
15-Feb-21
11
9
C
10-Oct-20
10
10
D
14-Sep-21
5
groupByName = df.groupby(['Well Name', 'READTIME'])
After grouping them by 'Well Name' and Readtime, i got the following:
Well Name READTIME WL
A 2020-07-02 12
2021-05-03 16
2022-03-01 17
B 2019-01-01 19
2021-02-15 11
2022-08-03 18
C 2020-10-10 10
2020-12-12 20
2021-07-05 14
D 2021-09-14 5
2021-11-14 14
I have got the following graph by running this code:
sns.relplot(data=df, x="READTIME", y="WL", hue="Well Name",kind="line", height=4, aspect=3)
I want to have a separate graph for each "Well Name" and save each one as a PDF. I would really appreciate your help with this. Thank you.
To separate out the plots, you can iterate over the four unique Well Names in your dataset and filter the dataset for each Well Name before plotting:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# I saved your data as an Excel file
df = pd.read_excel('Book1.xlsx')
print(df)
# Split the frame once per Well Name. groupby yields (name, sub-frame) pairs
# in sorted name order, so the files are produced in a repeatable sequence
# (iterating a set gives no such guarantee).
for wn, this_wn in df.groupby('Well Name'):
    # this_wn holds only the rows for the current well
    # Plot, save, and show
    sns.relplot(data=this_wn, x="READTIME", y="WL", hue="Well Name",kind="line", height=4, aspect=3)
    plt.savefig(f'{wn}.png')
    plt.show(block=True)
This generated the following 4 image files:
For saving in a PDF file, please see this answer.
In this case, specifying a row results in a faceted graph.
sns.relplot(data=df, x="READTIME", y="WL", hue="Well Name", kind="line", row='Well Name', height=4, aspect=3)
I have two data frames that collect historical price series of two different stocks. applying describe () I noticed that the elements of the first stock are 1291 while those of the second are 1275. This difference is due to the fact that the two securities are listed on different stock exchanges and therefore show differences on some dates. What I would like to do is keep the two separate dataframes, but make sure that in the first dataframe, all those rows whose dates are not present in the second dataframe are deleted in order to have the perfect matching of the two dataframes to do the analyzes. I have read that there are functions such as merge () or join () but I have not been able to understand well how to use them (if these are the correct functions). I thank those who will use some of their time to answer my question.
"ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1275 and the array at index 1 has size 1291"
Thank you
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader as web
from scipy import stats
import seaborn as sns
pd.options.display.min_rows= None
pd.options.display.max_rows= None
tickers = ['DISW.MI','IXJ','NRJ.PA','SGOL','VDC','VGT']
wts= [0.19,0.18,0.2,0.08,0.09,0.26]
price_data = web.get_data_yahoo(tickers,
start = '2016-01-01',
end = '2021-01-01')
price_data = price_data['Adj Close']
ret_data = price_data.pct_change()[1:]
port_ret = (ret_data * wts).sum(axis = 1)
benchmark_price = web.get_data_yahoo('ACWE.PA',
start = '2016-01-01',
end = '2021-01-01')
benchmark_ret = benchmark_price["Adj Close"].pct_change()[1:].dropna()
# The benchmark and the portfolio trade on different calendars, so their
# return series have different lengths. Keep only the dates present in both
# (inner join on the date index) and drop any remaining gaps, so the two
# arrays passed below line up one-to-one.
paired_ret = pd.concat(
    [benchmark_ret, port_ret],
    axis=1,
    join="inner",
    keys=["benchmark", "portfolio"],
).dropna()
benchmark_aligned = paired_ret["benchmark"].values
portfolio_aligned = paired_ret["portfolio"].values

# Pass x and y by keyword: recent seaborn releases reject positional data.
sns.regplot(x=benchmark_aligned, y=portfolio_aligned)
plt.xlabel("Benchmark Returns")
plt.ylabel("Portfolio Returns")
plt.title("Portfolio Returns vs Benchmark Returns")
plt.show()

# linregress returns (slope, intercept, ...); slope is beta, intercept alpha.
(beta, alpha) = stats.linregress(benchmark_aligned, portfolio_aligned)[0:2]
print("The portfolio beta is", round(beta, 4))
Let's consider a toy example.
df1 consists of 6 days of data and df2 consists of 5 days of data.
From what I have understood, you want df1 to also have 5 days of data, with dates matching those in df2.
df1
df1 = pd.DataFrame({
'date':pd.date_range('2021-05-17', periods=6),
'px':np.random.rand(6)
})
df1
date px
0 2021-05-17 0.054907
1 2021-05-18 0.192294
2 2021-05-19 0.214051
3 2021-05-20 0.623223
4 2021-05-21 0.004627
5 2021-05-22 0.127086
df2
df2 = pd.DataFrame({
'date':pd.date_range('2021-05-17', periods=5),
'px':np.random.rand(5)
})
df2
date px
0 2021-05-17 0.650976
1 2021-05-18 0.393061
2 2021-05-19 0.985700
3 2021-05-20 0.879786
4 2021-05-21 0.463206
Code
To consider only matching dates in df1 from df2.
df1 = df1[df1.date.isin(df2.date)]
Output df1
date px
0 2021-05-17 0.054907
1 2021-05-18 0.192294
2 2021-05-19 0.214051
3 2021-05-20 0.623223
4 2021-05-21 0.004627
I have these charts that I've created in Excel from dataframes of a structure like such:
so that the chart can be created like this, stacking the 5-Year Range area on top of the Min range (no fill) so that the range area can be shaded. The min/max/range/avg columns all calculate off of 2016-2020.
I know that I can plot lines for multiple years on the same axis by using a date index and applying month labels, but is there a way to replicate the shading of this chart, more specifically if my dataframes are in a simple date index-value format, like so:
Quantity
1/1/2016 6
2/1/2016 4
3/1/2016 1
4/1/2016 10
5/1/2016 7
6/1/2016 10
7/1/2016 10
8/1/2016 2
9/1/2016 1
10/1/2016 2
11/1/2016 3
… …
1/1/2020 4
2/1/2020 8
3/1/2020 3
4/1/2020 5
5/1/2020 8
6/1/2020 6
7/1/2020 6
8/1/2020 7
9/1/2020 8
10/1/2020 5
11/1/2020 4
12/1/2020 3
1/1/2021 9
2/1/2021 7
3/1/2021 7
I haven't been able to find anything similar in the plot libraries.
Two step process
restructure DF so that years are columns, rows indexed by uniform date time
plot using matplotlib
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# straight date as index, quantity as column
d = pd.date_range("1-Jan-2016", "1-Mar-2021", freq="MS")
df = pd.DataFrame({"Quantity":np.random.randint(1, 10, len(d))}, index=d)
# re-structure as multi-index, make year column
# Every timestamp is mapped onto the same (current) calendar year so that all
# years share one month axis; the real year becomes the second index level.
month_key = df.index.map(lambda ts: dt.date(dt.date.today().year, ts.month, ts.day))
by_year = (
    df.set_index(pd.MultiIndex.from_arrays([month_key, df.index.year], names=["month", "year"]))
    .unstack("year")
    .droplevel(0, axis=1)
)
# add calculated columns
# Only years with all 12 months feed the statistics. The selection is made
# once, before assign(): assign evaluates its keyword callables in order
# against the growing frame, so a per-column filter inside each lambda would
# also pick up the freshly added min/max columns and skew the average.
full_years = [c for c in by_year.columns if by_year[c].count() == 12]
dfg = by_year.assign(
    min=by_year[full_years].min(axis=1),
    max=by_year[full_years].max(axis=1),
    avg=by_year[full_years].mean(axis=1).round(1),
)
fig, ax = plt.subplots(1, figsize=[14,4])
# now plot all the parts
ax.fill_between(dfg.index, dfg["min"], dfg["max"], label="5y range", facecolor="oldlace")
ax.plot(dfg.index, dfg[2020], label="2020", c="r")
ax.plot(dfg.index, dfg[2021], label="2021", c="g")
ax.plot(dfg.index, dfg.avg, label="5 yr avg", c="y", ls=(0,(1,2)), lw=3)
# adjust axis
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
ax.legend(loc = 'best')
(This question can be read alone, but is a sequel to: Timeseries from CSV data (Timestamp and events))
I would like to visualize CSV data (from 2 files) as shown below, by a timeseries representation, using python's pandas module (see links below).
Sample data of df1:
TIMESTAMP eventid
0 2017-03-20 02:38:24 1
1 2017-03-21 05:59:41 1
2 2017-03-23 12:59:58 1
3 2017-03-24 01:00:07 1
4 2017-03-27 03:00:13 1
The 'eventid' column always contains the value of 1, and I am trying to show the sum of events for each day in the dataset.
The 2nd dataset, df0, has similar structure but contains only zeros:
Sample data of df0:
TIMESTAMP eventid
0 2017-03-21 01:38:24 0
1 2017-03-21 03:59:41 0
2 2017-03-22 11:59:58 0
3 2017-03-24 01:03:07 0
4 2017-03-26 03:50:13 0
The x-axis label only shows the same date, and my question is: How can the different dates be shown? (What causes the same date to be shown multiple times on x labels?)
script so far:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates

df1 = pd.read_csv('timestamp01.csv', parse_dates=True, index_col='TIMESTAMP')
df0 = pd.read_csv('timestamp00.csv', parse_dates=True, index_col='TIMESTAMP')
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: number of df0 events per calendar day.
ax1.plot(df0.resample('D').size())
ax1.set_xlim([pd.to_datetime('2017-01-27'), pd.to_datetime('2017-04-30')])
# DateFormatter renders each tick from the date at that tick's position.
# A FixedFormatter built from the event index instead hands out its strings
# by tick order (first string to first tick, ...), so the labels repeat the
# first few event dates regardless of where the ticks actually are.
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=15)

# Right panel: number of df1 events per calendar day.
ax2.plot(df1.resample('D').size())
ax2.set_xlim([pd.to_datetime('2017-03-22'), pd.to_datetime('2017-04-29')])
# A formatter instance belongs to one axis, so each panel gets its own.
ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.setp(ax2.xaxis.get_majorticklabels(), rotation=15)
plt.show()
Output: (https://www.dropbox.com/s/z21koflkzglm6c3/figure_1.png?dl=0)
Links I have tried to follow:
http://pandas.pydata.org/pandas-docs/stable/visualization.html
Multiple timeseries plots from Pandas Dataframe
Pandas timeseries plot setting x-axis major and minor ticks and labels
Any help is much appreciated.
Making the example reproducible, we can create the following text file (data/timestamp01.csv):
TIMESTAMP;eventid
2017-03-20 02:38:24;1
2017-03-21 05:59:41;1
2017-03-23 12:59:58;1
2017-03-24 01:00:07;1
2017-03-27 03:00:13;1
(same for data/timestamp00.csv). We can then read them in
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
df1 = pd.read_csv('data/timestamp01.csv', parse_dates=True, index_col='TIMESTAMP', sep=";")
df0 = pd.read_csv('data/timestamp00.csv', parse_dates=True, index_col='TIMESTAMP', sep=";")
Plotting them
# Draw both daily event counts side by side and tilt the date labels.
f, (ax1, ax2) = plt.subplots(1, 2)
for axis, events in ((ax1, df0), (ax2, df1)):
    # resample('D').size() counts the rows that fall on each calendar day
    axis.plot(events.resample('D').size())
    plt.setp(axis.xaxis.get_majorticklabels(), rotation=30, ha="right")
plt.show()
results in
which is the desired plot.