Matplotlib plot is a no-show - python

When I run this code
import pandas as pd
import numpy as np
def add_prop(group):
    """Return *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` value divided by the sum of ``births``
    over the whole group, computed in floating point.

    Parameters
    ----------
    group : pandas.DataFrame
        A frame (typically one group handed over by ``groupby(...).apply``)
        that has a ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns plus ``prop`` as the
        last column. The input frame is left untouched.
    """
    births = group.births.astype(float)
    # assign() builds a new frame instead of writing into the object that
    # groupby.apply passed in; mutating that object inside a UDF is not
    # supported by pandas and can give surprising results or warnings.
    return group.assign(prop=births / births.sum())
pieces = []
columns = ['name', 'sex', 'births']
for year in range(1880, 2012):
path = 'yob%d.txt' % year
frame = pd.read_csv(path, names = columns)
frame['year'] = year
pieces.append(frame)
names = pd.concat(pieces, ignore_index = True)
total_births = names.pivot_table('births', rows = 'year', cols = 'sex', aggfunc = sum)
total_births.plot(title = 'Total Births by sex and year')
I get no plot. This is from Wes McKinney's book on using Python for data analysis.
Can anyone point me in the right direction?

Put
import matplotlib.pyplot as plt
at the top, and
plt.show()
at the end.

In the IPython notebook you could also use %matplotlib inline at the top of the notebook to automatically display the created plots in the output cells.

Your code is correct.
Just put:
import matplotlib.pyplot as plt
at the top, and to display your plot:
plt.show()

Related

Write a function in Python to group by and generate a boxplot in python

Python novice here. In the dataframe below, I need assistance writing a function that does the following:
I: select columns year,state,dept, revenue(mil)
II: boxplot of revenue(mil) ~ dept for each unique state of a unique year, something along the lines of groupby(['year','state'])
III: export the graph as a 2-figure per page pdf file
# the dataset
import pandas as pd
import numpy as np
import seaborn as sns
df1={
'xcode':[5001,5001,5250,5250,5425,5425,5610,5610,5910,5910,5010,5010,6110,6110,6135,6135,6220,6220,6550,6550],
'town':["A01","A01","A01","A02","A01","A02","A03","A03","A01","A02","A03","A04","A01","A01","A01","A01","A01","A01","A02","A02"],
'state':["PA","PA","NY","NY","DE","DE","PA","PA","NY","NY","PA","PA","NY","NY","DE","DE","PA","PA","NY","NY"],
'dept':["hlth",'edu','edu','hlth','hlth','edu','hlth','edu','edu','hlth','edu','hlth','hlth','edu','hlth',"hlth",'edu','edu','hlth','hlth'],
'year':[2001,2001,2001,2001,2001,2002,2002,2002,2002,2002,2003,2003,2003,2003,2003,2004,2004,2004,2004,2004],
'revenue(mil)':[112.9,123,124,523.5,112,334,55,449,221.6,332,235,239,235,223,235.6,204,315.5,614,512,514.2],
'tax':[112.0,123,124,523,112,334.5,55,449,221,332,235.6,239,235,223.7,235,204,315,614,512,514.6]
}
df1 = pd.DataFrame(df1)
df1
My attempt:
import pandas as pd
import numpy as np
import matplotlib.backends.backend_pdf
def boxplot2pdf(df):
# select the columns
df = df[['year','state','dept', 'revenue(mil)']]
# set pdf page
pdf = matplotlib.backends.backend_pdf.PdfPages("boxplot2pdf.pdf")
# set number of graphs per page
N_plots_per_page = 2
########## Here is where I need help- grouping by year, grouping each year by state and plotting revenue(mil) by dept ###################################################
# for each unique year and unique state,boxplot revenue per dept
for group in groupby(["year","state"]):
g = sns.boxplot(x = dept,y = revenue(mil),data = df[group])
# the title showing specific year and state
plt.title("Year: State: ")
pdf.savefig(g,fig)
pdf.close()
#driver code
boxplot2pdf(df1)
Kindly share your full code with comments explaining your approach :)
#Abuzar, your output looks like this
import pandas as pd
import seaborn as sns
import matplotlib.backends.backend_pdf
import matplotlib.pyplot as plt
def boxplot2pdf(df, nFigPerPage):
    """Write box + swarm plots of revenue by department to "boxplot2pdf.pdf".

    One panel is drawn for every combination of the distinct ``year`` and
    ``state`` values found in *df* (years outermost, both sorted ascending).
    Panels are laid out side by side, ``nFigPerPage`` per PDF page.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``state``, ``dept`` and
        ``revenue(mil)``.
    nFigPerPage : int
        Number of panels (subplot columns) per PDF page.
    """
    years = df['year'].drop_duplicates().sort_values()
    states = df['state'].drop_duplicates().sort_values()

    # The context manager closes the PDF even if plotting raises part-way.
    with matplotlib.backends.backend_pdf.PdfPages("boxplot2pdf.pdf") as pdf:
        fig = None
        axs = None
        iFigPerPage = 0  # index of the next free panel on the current page

        for year in years:
            for state in states:
                df_year_state = df.loc[
                    (df['state'] == state) & (df['year'] == year),
                    ['dept', 'revenue(mil)'],
                ]
                title = "Year: {}, State: {}".format(year, state)

                if iFigPerPage == 0:
                    # squeeze=False keeps axs two-dimensional, so indexing
                    # also works when there is a single panel per page.
                    fig, axs = plt.subplots(nrows=1, ncols=nFigPerPage, squeeze=False)

                ax = axs[0, iFigPerPage]
                sns.boxplot(ax=ax, x="dept", y="revenue(mil)", hue='dept', data=df_year_state).set_title(title)
                sns.swarmplot(ax=ax, x="dept", y="revenue(mil)", hue='dept', data=df_year_state)
                iFigPerPage += 1

                if iFigPerPage == nFigPerPage:
                    # Page is full: flush it and start over.
                    fig.tight_layout()
                    pdf.savefig(fig)
                    plt.close(fig)
                    fig = None
                    iFigPerPage = 0

        if fig is not None:
            # The number of panels was not a multiple of nFigPerPage: save
            # the partially filled last page too, hiding the unused panels.
            for unused_ax in axs[0, iFigPerPage:]:
                unused_ax.set_axis_off()
            fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)
df={
'xcode':[5001,5001,5250,5250,5425,5425,5610,5610,5910,5910,5010,5010,6110,6110,6135,6135,6220,6220,6550,6550],
'town':["A01","A01","A01","A02","A01","A02","A03","A03","A01","A02","A03","A04","A01","A01","A01","A01","A01","A01","A02","A02"],
'state':["PA","PA","NY","NY","DE","DE","PA","PA","NY","NY","PA","PA","NY","NY","DE","DE","PA","PA","NY","NY"],
'dept':["hlth",'edu','edu','hlth','hlth','edu','hlth','edu','edu','hlth','edu','hlth','hlth','edu','hlth',"hlth",'edu','edu','hlth','hlth'],
'year':[2001,2001,2001,2001,2001,2002,2002,2002,2002,2002,2003,2003,2003,2003,2003,2004,2004,2004,2004,2004],
'revenue(mil)':[112.9,123,124,523.5,112,334,55,449,221.6,332,235,239,235,223,235.6,204,315.5,614,512,514.2],
'tax':[112.0,123,124,523,112,334.5,55,449,221,332,235.6,239,235,223.7,235,204,315,614,512,514.6]
}
df1 = pd.DataFrame(df)
boxplot2pdf(df1, nFigPerPage=2)

Matplotlib plots turn out blank even having values

I am new to analytics, Python, and machine learning, and I am working on time series forecasting. Using the following code, I get values for the train and test data, but the plot comes out blank.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.tsa.api as ExponentialSmoothing
#Importing data
df = pd.read_csv('international-airline-passengers - Copy.csv')
#Printing head
print(df.head())
#Printing tail
print(df.tail())
df = pd.read_csv('international-airline-passengers - Copy.csv', nrows = 11856)
#Creating train and test set
#Index 10392 marks the end of October 2013
train=df[0:20]
test=df[20:]
#Aggregating the dataset at daily level
df.Timestamp = pd.to_datetime(df.Month,format='%m/%d/%Y %H:%M')
df.index = df.Timestamp
df = df.resample('D').mean()
train.Timestamp = pd.to_datetime(train.Month,format='%m/%d/%Y %H:%M')
print('1')
print(train.Timestamp)
train.index = train.Timestamp
train = train.resample('D').mean()
test.Timestamp = pd.to_datetime(test.Month,format='%m/%d/%Y %H:%M')
test.index = test.Timestamp
test = test.resample('D').mean()
train.Count.plot(figsize=(15,8), title= 'Result', fontsize=14)
test.Count.plot(figsize=(15,8), title= 'Result', fontsize=14)
plt.show()
I can't understand why the graph is blank even though the train and test data contain values.
Thanks in advance.
I think I found the issue here. The thing is, you are using train.Count.plot here while the value of "plt" is still empty. If you go through the matplotlib documentation (link below), you will find that you need to store some value in plt first; since plt is empty here, it gives back an empty plot.
Basically, you are not plotting anything and are just showing a blank plot.
E.g.: plt.subplots(values) or plt.scatter(values), or any of its functions, depending on your requirements. Hope this helps.
https://matplotlib.org/
import holoviews as hv
import pandas as pd
import numpy as np
data=pd.read_csv("C:/Users/Nisarg.Bhatt/Documents/data.csv", engine="python")
train=data.groupby(["versionCreated"])["Polarity Score"].mean()
table=hv.Table(train)
print(table)
bar=hv.Bars(table).opts(plot=dict(width=1500))
renderer = hv.renderer('bokeh')
app = renderer.app(bar)
print(app)
from bokeh.server.server import Server
server = Server({'/': app}, port=0)
server.start()
server.show("/")
This is done using HoloViews, which is used for visualisation. If you are building a professional application, you should definitely try it. Here versionCreated is a date and Polarity is similar to a count. Try this.
OR, if you want to stick to matplotlib try this:
fig, ax = plt.subplots(figsize=(16,9))
ax.plot(msft.index, msft, label='MSFT')
ax.plot(short_rolling_msft.index, short_rolling_msft, label='20 days rolling')
ax.plot(long_rolling_msft.index, long_rolling_msft, label='100 days rolling')
ax.set_xlabel('Date')
ax.set_ylabel('Adjusted closing price ($)')
ax.legend()
Also this can be used, if you want to stick with matplotlib

How do I change the year interval on a Pandas DataFrame area plot?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dts
def use_matplot():
ax = df.plot(x='year', kind="area" )
years = dts.YearLocator(20)
ax.xaxis.set_major_locator(years)
fig = ax.get_figure()
fig.savefig('output.pdf')
dates = np.arange(1990,2061, 1)
dates = dates.astype('str').astype('datetime64')
df = pd.DataFrame(np.random.randint(0, dates.size, size=(dates.size,3)), columns=list('ABC'))
df['year'] = dates
cols = df.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df = df[cols]
use_matplot()
In the above code, I get an error, "ValueError: year 0 is out of range" when trying to set the YearLocator so as to ensure the X-Axis has year labels for every 20th year. By default the plot has the years show up every 10 years. What am I doing wrong? Desired outcome is simply a plot with 1990, 2010, 2030, 2050 on the bottom. (Instead of default 1990, 2000, 2010, etc.)
Since the years are simple numbers, you may opt for not using them as dates at all and keeping them as numbers.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dates = np.arange(1990,2061, 1)
df = pd.DataFrame(np.random.randint(0,dates.size,size=(dates.size,3)),columns=list('ABC'))
df['year'] = dates
cols = df.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df = df[cols]
ax = df.plot(x='year', kind="area" )
ax.set_xticks(range(2000,2061,20))
plt.show()
Apart from that, using Matplotlib locators and formatters on date axes created via pandas will most often fail. This is due to pandas using a completely different datetime convention. In order to have more freedom for setting custom tickers for datetime axes, you may use matplotlib. A stackplot can be plotted with plt.stackplot. On such a matplotlib plot, the use of the usual matplotlib tickers is unproblematic.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dts
dates = np.arange(1990,2061, 1)
df = pd.DataFrame(np.random.randint(0,dates.size,size=(dates.size,3)),columns=list('ABC'))
df['year'] = pd.to_datetime(dates.astype(str))
cols = df.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df = df[cols]
plt.stackplot(df["year"].values, df[list('ABC')].values.T)
years = dts.YearLocator(20)
plt.gca().xaxis.set_major_locator(years)
plt.margins(x=0)
plt.show()
Consider using set_xticklabels to specify values of x axis tick marks:
ax.set_xticklabels(sum([[i,''] for i in range(1990, 2060, 20)], []))
# [1990, '', 2010, '', 2030, '', 2050, '']

Plotting pandas dataframe with two groups

I'm using Pandas and matplotlib to try to replicate this graph from tableau:
So far, I have this code:
group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
total_price.nlargest(5).plot(kind="bar")
Which produces this graph:
It correctly groups the data, but is it possible to get it grouped similar to how Tableau shows it?
You can create some lines and labels using the respective matplotlib methods (ax.text and ax.axhline).
import pandas as pd
import numpy as np; np.random.seed(5)
import matplotlib.pyplot as plt
a = ["West"]*25+ ["Central"]*10+ ["East"]*10
b = ["Mattz","McDon","Jeffs","Warf","Utter"]*5 + ["Susanne","Lokomop"]*5 + ["Richie","Florence"]*5
c = np.random.randint(5,55, size=len(a))
df=pd.DataFrame({"Region":a, "Rep":b, "Total Price":c})
group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
gtp = total_price.nlargest(5)
ax = gtp.plot(kind="bar")
#draw lines and titles
# Number of bars drawn for each region, in the order the bars appear.
count = gtp.groupby("Region").count()
# Plain NumPy arrays avoid integer-key lookups on a Series whose index is
# made of region names (positional `series[i]` is deprecated in pandas).
sizes = count.to_numpy()
right_edges = np.cumsum(sizes)
for title, size, right_edge in zip(count.index.values, sizes, right_edges):
    # Separator just after the last bar of this region.
    ax.axvline(right_edge - .5, lw=0.8, color="k")
    # Region name centred above its bars; x is in data coordinates and
    # y in axes coordinates (1.02 = slightly above the top of the axes).
    ax.text(right_edge - (size + 1) / 2., 1.02, title, ha="center",
            transform=ax.get_xaxis_transform())
# shorten xticklabels
ax.set_xticklabels([l.get_text().split(", ")[1][:-1] for l in ax.get_xticklabels()])
plt.show()

Date removed from x axis on overlaid plots matplotlib

I am trying to show time series lines representing an effort amount using matplotlib and pandas.
I've got all my DataFrames to overlay in one plot; however, when I do, Python seems to strip the dates from the x axis and substitute some numbers. (I'm not sure where these come from, but at a guess, not all days contain the same data, so Python has reverted to using an index id number.) If I plot any one of these on its own, it comes up with dates on the x-axis.
Any hints or solutions to make the x axis show date for the multiple plot would be much appreciated.
This is the single figure plot with time axis:
Code I'm using to plot is
fig = pl.figure()
ax = fig.add_subplot(111)
ax.plot(b342,color='black')
ax.plot(b343,color='blue')
ax.plot(b344,color='red')
ax.plot(b345,color='green')
ax.plot(b346,color='pink')
ax.plot(fi,color='yellow')
plt.show()
This is the multiple plot fig with weird x axis:
One option would be to manually specify the x-axis based on the DataFrame index, and then plot directly using matplotlib.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# make up some data
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["FishEffort"] )
df.df_name = str(i)
dfs.append(df)
# plot it directly using matplotlib instead of through the DataFrame
fig = plt.figure()
ax = fig.add_subplot()
for df in dfs:
plt.plot(df.index,df["FishEffort"], label = df.df_name)
plt.legend()
plt.show()
Another option would be to concatenate your DataFrames and plot using Pandas. If you give your "FishEffort" field the correct label name when loading the data or via DataFrame.rename then the labels will be specified automatically.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["DataFrame #" + str(i) ] )
df.df_name = str(i)
dfs.append(df)
df = pd.concat(dfs, axis = 1)
df.plot()
I've found an answer that does what I want, it seems that calling plt.plot wasn't using the date as the x axis, however calling it using the pandas documentation did the trick.
ax = b342.plot(label='342')
b343.plot(ax=ax, label='test')
b344.plot(ax=ax)
b345.plot(ax=ax)
b346.plot(ax=ax)
fi.plot(ax=ax)
plt.show()
I was wondering if anyone knew how to change the labels here?

Categories