Python novice here. For the dataframe below, I need help writing a function that does the following:
I: select the columns year, state, dept, and revenue(mil)
II: draw a boxplot of revenue(mil) ~ dept for each unique state within each unique year, something along the lines of groupby(['year', 'state'])
III: export the graphs to a PDF file with two figures per page
# the dataset
import pandas as pd
import numpy as np
import seaborn as sns
# Sample records: each row carries an xcode, town, state, department, year,
# revenue (in millions) and tax value. The frame is built in one step.
df1 = pd.DataFrame(
    {
        "xcode": [
            5001, 5001, 5250, 5250, 5425, 5425, 5610, 5610, 5910, 5910,
            5010, 5010, 6110, 6110, 6135, 6135, 6220, 6220, 6550, 6550,
        ],
        "town": [
            "A01", "A01", "A01", "A02", "A01", "A02", "A03", "A03", "A01", "A02",
            "A03", "A04", "A01", "A01", "A01", "A01", "A01", "A01", "A02", "A02",
        ],
        "state": [
            "PA", "PA", "NY", "NY", "DE", "DE", "PA", "PA", "NY", "NY",
            "PA", "PA", "NY", "NY", "DE", "DE", "PA", "PA", "NY", "NY",
        ],
        "dept": [
            "hlth", "edu", "edu", "hlth", "hlth", "edu", "hlth", "edu", "edu", "hlth",
            "edu", "hlth", "hlth", "edu", "hlth", "hlth", "edu", "edu", "hlth", "hlth",
        ],
        "year": [
            2001, 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002, 2002,
            2003, 2003, 2003, 2003, 2003, 2004, 2004, 2004, 2004, 2004,
        ],
        "revenue(mil)": [
            112.9, 123, 124, 523.5, 112, 334, 55, 449, 221.6, 332,
            235, 239, 235, 223, 235.6, 204, 315.5, 614, 512, 514.2,
        ],
        "tax": [
            112.0, 123, 124, 523, 112, 334.5, 55, 449, 221, 332,
            235.6, 239, 235, 223.7, 235, 204, 315, 614, 512, 514.6,
        ],
    }
)
# Bare expression: displays the frame when run in a notebook/REPL cell.
df1
My attempt:
import pandas as pd
import numpy as np
import matplotlib.backends.backend_pdf
def boxplot2pdf(df):
# Asker's draft: meant to write one revenue-by-dept boxplot per (year, state)
# group into a multi-page PDF.
# NOTE(review): as posted, the function body is not indented, and the lines
# flagged below reference names that this snippet never defines.
# select the columns
df = df[['year','state','dept', 'revenue(mil)']]
# set pdf page
pdf = matplotlib.backends.backend_pdf.PdfPages("boxplot2pdf.pdf")
# set number of graphs per page
# NOTE(review): N_plots_per_page is assigned but not used anywhere below.
N_plots_per_page = 2
########## Here is where I need help- grouping by year, grouping each year by state and plotting revenue(mil) by dept ###################################################
# for each unique year and unique state,boxplot revenue per dept
# NOTE(review): `groupby` is called as a bare name; presumably
# df.groupby(["year", "state"]) was intended — confirm.
for group in groupby(["year","state"]):
# NOTE(review): `dept` and `revenue(mil)` are unquoted, so they are evaluated
# as a variable and a function call rather than passed as column labels.
g = sns.boxplot(x = dept,y = revenue(mil),data = df[group])
# the title showing specific year and state
# NOTE(review): `plt` is not imported by this snippet's import lines, and the
# title string has no year/state values filled in.
plt.title("Year: State: ")
# NOTE(review): `fig` is never assigned in this function.
pdf.savefig(g,fig)
pdf.close()
#driver code
boxplot2pdf(df1)
Kindly share your full code with comments explaining your approach :)
#Abuzar, your output looks like this
import pandas as pd
import seaborn as sns
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
def boxplot2pdf(df, nFigPerPage, filename="boxplot2pdf.pdf"):
    """Write per-(year, state) box/swarm plots of revenue by department to a PDF.

    Every (year, state) combination present in ``df`` gets one panel showing
    ``revenue(mil)`` against ``dept``. Panels are laid out side by side,
    ``nFigPerPage`` to a page, in ascending (year, state) order.

    Args:
        df: DataFrame with at least the columns ``year``, ``state``, ``dept``
            and ``revenue(mil)``.
        nFigPerPage: Number of panels drawn on each PDF page (>= 1).
        filename: Path of the PDF to write. Defaults to the previously
            hard-coded ``"boxplot2pdf.pdf"``.

    Raises:
        ValueError: If ``nFigPerPage`` is smaller than 1.
    """
    if nFigPerPage < 1:
        raise ValueError("nFigPerPage must be at least 1")

    # groupby sorts by (year, state) and yields only combinations that have
    # rows, so no empty panels are drawn for missing year/state pairs.
    groups = [
        (key, group[['dept', 'revenue(mil)']])
        for key, group in df.groupby(['year', 'state'])
    ]

    # The context manager finalizes the PDF even if plotting raises.
    with matplotlib.backends.backend_pdf.PdfPages(filename) as pdf:
        for start in range(0, len(groups), nFigPerPage):
            page = groups[start:start + nFigPerPage]
            # squeeze=False keeps `axs` two-dimensional, so indexing also
            # works when nFigPerPage == 1.
            fig, axs = plt.subplots(nrows=1, ncols=nFigPerPage, squeeze=False)
            axes = axs[0]
            for ax, ((year, state), df_year_state) in zip(axes, page):
                title = "Year: {}, State: {}".format(year, state)
                sns.boxplot(ax=ax, x="dept", y="revenue(mil)", hue='dept',
                            data=df_year_state).set_title(title)
                sns.swarmplot(ax=ax, x="dept", y="revenue(mil)", hue='dept',
                              data=df_year_state)
            # A final, partially filled page is still saved; hide the panels
            # that received no data.
            for ax in axes[len(page):]:
                ax.set_visible(False)
            fig.tight_layout()
            # Save and close this specific figure rather than relying on
            # pyplot's implicit "current figure".
            pdf.savefig(fig)
            plt.close(fig)
# Demo input for boxplot2pdf: raw column data first, then the DataFrame.
df = {
    "xcode": [
        5001, 5001, 5250, 5250, 5425, 5425, 5610, 5610, 5910, 5910,
        5010, 5010, 6110, 6110, 6135, 6135, 6220, 6220, 6550, 6550,
    ],
    "town": [
        "A01", "A01", "A01", "A02", "A01", "A02", "A03", "A03", "A01", "A02",
        "A03", "A04", "A01", "A01", "A01", "A01", "A01", "A01", "A02", "A02",
    ],
    "state": [
        "PA", "PA", "NY", "NY", "DE", "DE", "PA", "PA", "NY", "NY",
        "PA", "PA", "NY", "NY", "DE", "DE", "PA", "PA", "NY", "NY",
    ],
    "dept": [
        "hlth", "edu", "edu", "hlth", "hlth", "edu", "hlth", "edu", "edu", "hlth",
        "edu", "hlth", "hlth", "edu", "hlth", "hlth", "edu", "edu", "hlth", "hlth",
    ],
    "year": [
        2001, 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002, 2002,
        2003, 2003, 2003, 2003, 2003, 2004, 2004, 2004, 2004, 2004,
    ],
    "revenue(mil)": [
        112.9, 123, 124, 523.5, 112, 334, 55, 449, 221.6, 332,
        235, 239, 235, 223, 235.6, 204, 315.5, 614, 512, 514.2,
    ],
    "tax": [
        112.0, 123, 124, 523, 112, 334.5, 55, 449, 221, 332,
        235.6, 239, 235, 223.7, 235, 204, 315, 614, 512, 514.6,
    ],
}
df1 = pd.DataFrame.from_dict(df)
# Render the plots two to a page.
boxplot2pdf(df1, nFigPerPage=2)
I am new to analytics, Python, and machine learning, and I am working on time-series forecasting. With the following code I get values for the train and test data, but the graph is plotted blank.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.tsa.api as ExponentialSmoothing
#Importing data
df = pd.read_csv('international-airline-passengers - Copy.csv')
#Printing head
print(df.head())
#Printing tail
print(df.tail())
# Re-read the same file, capped at 11856 rows; this replaces the frame loaded above.
df = pd.read_csv('international-airline-passengers - Copy.csv', nrows = 11856)
#Creating train and test set
#Index 10392 marks the end of October 2013
# NOTE(review): the comment above mentions index 10392, but the split below is
# at row 20 — confirm which boundary is intended.
# The slices are taken before any datetime index is set on df.
train=df[0:20]
test=df[20:]
#Aggregating the dataset at daily level
# Parse the `Month` column as datetimes, use them as the index, then average per day.
# NOTE(review): `df.Timestamp = ...` is attribute-style assignment; presumably it
# does not add a `Timestamp` column unless one already exists — verify.
df.Timestamp = pd.to_datetime(df.Month,format='%m/%d/%Y %H:%M')
df.index = df.Timestamp
# NOTE(review): if the rows are monthly (the source column is named `Month`),
# daily resampling would presumably leave most days as NaN; a line plot does not
# connect points separated by NaN, which could explain an empty-looking graph — verify.
df = df.resample('D').mean()
# Same conversion repeated for the training slice.
train.Timestamp = pd.to_datetime(train.Month,format='%m/%d/%Y %H:%M')
# Debug output: a marker line followed by the parsed training timestamps.
print('1')
print(train.Timestamp)
train.index = train.Timestamp
train = train.resample('D').mean()
# Same conversion repeated for the test slice.
test.Timestamp = pd.to_datetime(test.Month,format='%m/%d/%Y %H:%M')
test.index = test.Timestamp
test = test.resample('D').mean()
# Plot the `Count` column of both slices, then show the figure.
# NOTE(review): assumes the CSV has a numeric `Count` column — confirm against the file.
train.Count.plot(figsize=(15,8), title= 'Result', fontsize=14)
test.Count.plot(figsize=(15,8), title= 'Result', fontsize=14)
plt.show()
I cannot understand why the graph is blank even though the train and test data contain values.
Thanks in advance.
I think I found the issue. You are calling train.Count.plot here while "plt" is still empty. If you go through the matplotlib documentation (link below), you will find that you need to store some value in plt first; since plt is empty here, it gives back an empty plot.
Basically, you are not plotting anything and are just showing a blank plot.
E.g. plt.subplots(values) or plt.scatter(values), or any other of its functions, depending on your requirements. Hope this helps.
https://matplotlib.org/
import holoviews as hv
import pandas as pd
import numpy as np
from bokeh.server.server import Server

# Load the CSV and average "Polarity Score" for each "versionCreated" value.
data = pd.read_csv("C:/Users/Nisarg.Bhatt/Documents/data.csv", engine="python")
grouped = data.groupby(["versionCreated"])
train = grouped["Polarity Score"].mean()

# Wrap the averages in a HoloViews table and print its summary.
table = hv.Table(train)
print(table)

# Bar chart of the table, 1500 wide.
plot_options = {"width": 1500}
bar = hv.Bars(table).opts(plot=plot_options)

# Turn the chart into a Bokeh application and print it.
renderer = hv.renderer('bokeh')
app = renderer.app(bar)
print(app)

# Serve the application at "/" on an automatically chosen port (port=0),
# then open it.
routes = {'/': app}
server = Server(routes, port=0)
server.start()
server.show("/")
This is done with HoloViews, a library for visualisation. If you are building a professional application, you should definitely try it. Here versionCreated is the date and Polarity is similar to your count. Try this.
OR, if you want to stick to matplotlib try this:
# One 16x9 figure with the raw series and both rolling averages as labelled lines.
fig, ax = plt.subplots(figsize=(16, 9))
labelled_series = (
    (msft, 'MSFT'),
    (short_rolling_msft, '20 days rolling'),
    (long_rolling_msft, '100 days rolling'),
)
for series, label in labelled_series:
    ax.plot(series.index, series, label=label)
ax.set(xlabel='Date', ylabel='Adjusted closing price ($)')
ax.legend()
Also this can be used, if you want to stick with matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dts
def use_matplot():
    """Draw the module-level ``df`` as an area chart over ``year`` and save it.

    A ``YearLocator(20)`` is installed as the x-axis major locator and the
    resulting figure is written to ``output.pdf``.
    """
    axes = df.plot(x='year', kind="area")
    every_20_years = dts.YearLocator(20)
    axes.xaxis.set_major_locator(every_20_years)
    axes.get_figure().savefig('output.pdf')
# Yearly datetime64 values for 1990 through 2060.
dates = np.arange(1990, 2061, 1).astype('str').astype('datetime64')
n_years = dates.size
# Three random integer columns, one row per year.
random_values = np.random.randint(0, n_years, size=(n_years, 3))
df = pd.DataFrame(random_values, columns=list('ABC'))
df['year'] = dates
# Rotate the last column ("year") to the front.
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
use_matplot()
In the above code, I get an error, "ValueError: year 0 is out of range" when trying to set the YearLocator so as to ensure the X-Axis has year labels for every 20th year. By default the plot has the years show up every 10 years. What am I doing wrong? Desired outcome is simply a plot with 1990, 2010, 2030, 2050 on the bottom. (Instead of default 1990, 2000, 2010, etc.)
Since the years are simple numbers, you may opt for not using them as dates at all and keeping them as numbers.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Keep the years as plain integers so ordinary numeric ticks can be used.
dates = np.arange(1990, 2061, 1)
df = pd.DataFrame(np.random.randint(0, dates.size, size=(dates.size, 3)), columns=list('ABC'))
df['year'] = dates
# Move the "year" column to the front.
cols = df.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df = df[cols]
ax = df.plot(x='year', kind="area")
# Tick every 20th year counted from the first year in the data, giving
# 1990, 2010, 2030, 2050 (the labels the question asks for) rather than
# 2000, 2020, 2040, 2060.
first_year, last_year = int(dates[0]), int(dates[-1])
ax.set_xticks(range(first_year, last_year + 1, 20))
plt.show()
Apart from that, using Matplotlib locators and formatters on date axes created via pandas will most often fail. This is due to pandas using a completely different datetime convention. In order to have more freedom for setting custom tickers for datetime axes, you may use matplotlib. A stackplot can be plotted with plt.stackplot. On such a matplotlib plot, the use of the usual matplotlib tickers is unproblematic.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dts
# Integer years 1990..2060 and three random integer columns, one row per year.
dates = np.arange(1990, 2061, 1)
n_years = dates.size
random_values = np.random.randint(0, n_years, size=(n_years, 3))
df = pd.DataFrame(random_values, columns=list('ABC'))
# Store the years as pandas datetimes.
df['year'] = pd.to_datetime(dates.astype(str))
# Rotate the last column ("year") to the front.
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

# Plot with matplotlib directly: x is the year values, y is one row per column.
x_values = df["year"].values
y_stacks = df[list('ABC')].values.T
plt.stackplot(x_values, y_stacks)
# Major ticks from a 20-year YearLocator; no horizontal margin.
years = dts.YearLocator(20)
x_axis = plt.gca().xaxis
x_axis.set_major_locator(years)
plt.margins(x=0)
plt.show()
Consider using set_xticklabels to specify values of x axis tick marks:
# Build alternating labels: a year followed by an empty string.
tick_labels = []
for year in range(1990, 2060, 20):
    tick_labels.extend([year, ''])
ax.set_xticklabels(tick_labels)
# tick_labels is [1990, '', 2010, '', 2030, '', 2050, '']