Iterate through DataFrame categorical columns to create subplots - python

I am trying to create a grid of subplots for predetermined x and y data. The function should iterate through a pandas DataFrame, identify the categorical variables, and plot the x and y data with one line for each level of a given categorical variable. The number of plots is equal to the number of categorical variables, and the number of lines on each plot should reflect the number of categories for that variable.
I initially tried to group the DataFrame in a for loop on a given categorical variable, but I have had mixed results. I think my issue is in how I am assigning which axis the lines are drawn on.
def grouping_for_graphs(df, x_col, y_col, category, func):
    """Aggregate ``y_col`` within each (``x_col``, ``category``) group.

    Args:
        df: DataFrame containing the three named columns.
        x_col: Name of the column used as the first grouping key.
        y_col: Name of the column whose values are aggregated.
        category: Name of the column used as the second grouping key.
        func: Callable handed to ``GroupBy.apply``; it receives the
            ``y_col`` values of one group at a time.

    Returns:
        The grouped result after ``reset_index()``, so both grouping keys
        are ordinary columns again (order: ``x_col``, ``category``,
        ``y_col``).

    Raises:
        KeyError: If one of the named columns is missing from ``df``.
    """
    # The column labels are used directly; looking them up through
    # ``df[col].name`` only returned the same label again.
    grouped = df.groupby([x_col, category])[y_col].apply(func)
    return grouped.reset_index()
# Collect the object-dtype columns that have fewer than 7 distinct values;
# these are the categorical variables that get a subplot each.
col_list = list(df.select_dtypes(include=['object']).columns)
cat_list = [col for col in col_list if len(df[col].unique()) < 7]
# create plots and axes
fig, axs = plt.subplots(2, 2, figsize=(30,24))
axs = axs.flatten()
# pick plot function
# NOTE: plot_func is the module-level plt.plot, not a method of any Axes in axs.
plot_func = plt.plot
# plot this
# One pass per (axes, categorical column) pair; zip stops at the shorter of the two.
for ax, category in zip(axs, cat_list):
df_grouped = grouping_for_graphs(df,x_col, y_col,category,agg_func)
# x_col / y_col are rebound here from the grouped frame's first and last column labels.
x_col = df_grouped.columns[0]
y_col = df_grouped.columns[-1]
# NOTE(review): x_lab and y_lab are not defined anywhere in this snippet - confirm they exist elsewhere.
category = str(list(df_grouped.columns.drop([x_lab, y_lab]))[0])
# One line per distinct value of the category column.
for feature in list(df_grouped[category].unique()):
X = df_grouped[df_grouped[category] == feature][x_col]
y = df_grouped[df_grouped[category] == feature][y_col]
# NOTE: this calls plt.plot and then assigns its return value to the ax.plot attribute,
# replacing the Axes method instead of calling it (the 'list' object error reported below).
ax.plot = plot_func(X,y)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_title(feature)
Other than getting an error that ax.plot is a 'list' object and is not callable, all the lines drawn are put on the final plot of the subplots.

I am confused by your plot_func. Remove it and plot directly on each axes with ax.plot(X, y). The modified line is marked with a comment:
fig, axs = plt.subplots(2, 2, figsize=(30, 24))
axs = axs.flatten()
# One subplot per categorical variable; zip stops at the shorter of axs / cat_list.
for ax, category in zip(axs, cat_list):
    # The grouped frame has the columns [x_col, category, y_col], so the
    # names passed in can be reused as-is (no x_lab / y_lab lookup needed).
    df_grouped = grouping_for_graphs(df, x_col, y_col, category, agg_func)
    # Draw one line per level of this categorical variable.
    for feature in df_grouped[category].unique():
        level_rows = df_grouped[df_grouped[category] == feature]
        ax.plot(level_rows[x_col], level_rows[y_col], label=feature)  # <--- Modified here
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    # Title the subplot with the variable; the legend identifies its levels.
    ax.set_title(category)
    ax.legend(title=category)

Related

Multiple barh subplots automatically from loop

I am trying to create multiple bar charts automatically in a loop using a subplot.
I have created a function to create the parameters for the plot according to how many plots I need like so:
def create_parameters(parameters):
    """Build a do-nothing function whose signature lists ``parameters``.

    Args:
        parameters: Iterable of strings; they are joined with ``", "`` and
            used verbatim as the parameter list of the generated function.

    Returns:
        A function named ``f_create_parameters`` that accepts the given
        parameters and returns ``None``.

    Raises:
        SyntaxError: If the joined text is not a valid parameter list.
    """
    # NOTE(security): the parameter text is compiled with exec(); never pass
    # strings that come from an untrusted source.
    source = "def f_create_parameters({}): pass".format(', '.join(parameters))
    # exec() into an explicit dict: reading the result back through locals()
    # only worked by accident and raises KeyError on Python 3.13+.
    namespace = {}
    exec(source, namespace)
    return namespace['f_create_parameters']
and the code that uses the function:
parList = []
names = []
# Counters for the generated names: ax1/ax2, then ax3/ax4, and so on.
odd, even = 1, 2
for _ in range(len(listOfCategoriesEN) * 2):
    pair = [f"ax{odd}", f"ax{even}"]
    parList.append(create_parameters(pair))
    names.extend(pair)
    odd, even = odd + 2, even + 2
This is the code where I am trying to create a single figure with multiple plots. All the plots end up overlaid on the last bar graph. Any idea how to fix it?
# Index into names; advanced after each bar chart is drawn.
val = 0
# NOTE: parList is rebound here to the second value returned by plt.subplots,
# discarding the list built earlier.
fig2, (parList) = plt.subplots(len(listOfCategoriesEN)*2,2,figsize=(20,20))
for name,dict_ in categoriesDict.items():
df = pd.DataFrame.from_dict(dict_, orient='index', columns=["Title", "Pageviews"])
df = df.sort_values(by=['Pageviews'], ascending=False)
df[ "Pageviews"] = df[ "Pageviews"].astype(int)
#get top 5
df1 = df.head(5)
# Re-sort the top five in ascending order of Pageviews.
df1 = df1.sort_values(by=['Pageviews'], ascending=True)
# Replace the typographic apostrophe with a plain ASCII one.
df1['Title'] = df1['Title'].str.replace('’','\'')
if(not df1.empty):
x = df1['Title']
y = df1['Pageviews']
# NOTE(review): names[val] is looked up in locals(), but nothing in this snippet binds
# those names to the axes in parList - confirm where ax1, ax2, ... are meant to come from.
locals()[names[val]].barh(x, y, color=colours)
#locals()[names[val]].set_title(name+" TOP 5 PAGES")
val+=1
plt.show()
parList is a list of all your subplots.
By using plt.sca(ax) (sca = set current axis) you select the active axis to be ax and then plot your data:
val = 0
for name, dict_ in categoriesDict.items():
    # do your data stuff
    if not df1.empty:
        x = df1['Title']
        y = df1['Pageviews']
        ax = parList[val // 2, val % 2]  # needs to be changed if you rearrange your plots
        # Make ax the current axes, then plot through pyplot so the bars land
        # on it. The generated ax1, ax2, ... names are not needed any more.
        plt.sca(ax)
        plt.barh(x, y, color=colours)
        # plt.title(name + " TOP 5 PAGES")
        # Advance only after a chart was drawn, so subplots are filled in order.
        val += 1

Create a function to get x number of most frequent column in a DF

I am trying to create a function in Python that plots a bar chart of the x most frequent values in a DataFrame column.
def counting(df, label, number):
    """Return the ``number`` most frequent values of column ``label``.

    Args:
        df: DataFrame that contains the column ``label``.
        label: Name of the column whose values are counted.
        number: How many of the most frequent values to keep.

    Returns:
        A one-column DataFrame indexed by the column's values and holding
        their counts, most frequent first.
    """
    # .iloc keeps the slice positional; a plain [:number] is interpreted as a
    # label slice when the counted values (the index) are floats.
    top_counts = df[label].value_counts().iloc[:number]
    return top_counts.to_frame()
# Draw a plot for the `number` most frequent values of column `label`
# (default 3), title it with the column name and show it. Returns None.
def barplot(df, label, number = 3):
fig = plt.figure() #Plotting the graph
fig.set_size_inches([10, 5]) #Size of the figure
# counting() returns a frame that already holds the counts, indexed by the column's values.
# NOTE(review): sns.countplot presumably counts rows itself, so giving it pre-aggregated
# counts is the likely cause of the axis mix-up described below - confirm, and consider
# a plain bar plot of the counts instead.
sns.countplot(x = label, data=counting(df, label, number))
plt.title(label)
plt.show()
return
I am seeing the label's values on the x axis. But the values should go on the Y axis, and the label names should go on X axis.

Different y axes per row, but otherwise equal row-wise y axes in column and row faceted plotly chart

This is a problem I encounter very often. I have a plotly figure with column and row facets. I have already unlinked the y axes using fig.update_yaxes(matches=None). However, while this is useful to scale axes between rows 1 and 2 as they exist in quite different domains, it breaks the ability to compare among column facets. You can see this issue in the plot below:
So my question is, how can I have the same y axes across all column facets in each row, while having different y axes for row 1 and row 2?
In order to ensure a row-wise matching you'll have to specify the following for the first row:
# Tie the three y-axes of the first row to the row's first axis, 'y'.
for axis_name in ('yaxis', 'yaxis2', 'yaxis3'):
    fig.layout[axis_name].matches = 'y'
And this for the second:
# Tie the three y-axes of the second row to the row's first axis, 'y4'.
for axis_name in ('yaxis4', 'yaxis5', 'yaxis6'):
    fig.layout[axis_name].matches = 'y4'
As you can see, all y-axes are tied to the first y-axis of each corresponding row.
For those of you who would like to try it out, here's an example that builds on a facet plot
Complete code:
import plotly.express as px

df = px.data.gapminder()
fig = px.scatter(
    df,
    x='gdpPercap',
    y='lifeExp',
    color='continent',
    size='pop',
    facet_col='year',
    facet_col_wrap=4,
)

# Each group of four y-axes is matched to the first axis of that group.
axis_groups = {
    'y': ('yaxis', 'yaxis2', 'yaxis3', 'yaxis4'),
    'y5': ('yaxis5', 'yaxis7', 'yaxis6', 'yaxis8'),
    'y9': ('yaxis9', 'yaxis10', 'yaxis11', 'yaxis12'),
}
for anchor, axis_names in axis_groups.items():
    for axis_name in axis_names:
        fig.layout[axis_name].matches = anchor

fig.show()
nrows = df.row_var.nunique()  # or find a way to get number of rows from fig object..
# For k = 1..nrows: show tick labels and set matches='y<k>' on the y-axes
# selected by col=k.
# NOTE(review): the count comes from the row variable but is used as a column
# selector - confirm this is the intended pairing.
for axis_no in range(1, nrows + 1):
    fig.update_yaxes(showticklabels=True, matches=f'y{axis_no}', col=axis_no)
https://github.com/plotly/plotly_express/issues/147#issuecomment-537814046

Plot subplots using seaborn pairplot

If I draw the plot using the following code, it works and I can see all the subplots in a single row. I can specifically break the number of cols into three or two and show them. But I have 30 columns and I wanted to use a loop mechanism so that they are plotted in a grid of say 4x4 sub-plots
# Feature columns to plot against 'price'.
regressionCols = [
    'col_a',
    'col_b',
    'col_c',
    'col_d',
    'col_e',
]
sns.pairplot(
    numerical_df,
    x_vars=regressionCols,
    y_vars='price',
    height=4,
    aspect=1,
    kind='scatter',
)
plt.show()
The code using loop is below. However, I don't see anything rendered.
# Grid dimensions for the intended layout.
nr_rows = 4
nr_cols = 4
li_cat_cols = list(regressionCols)
fig, axs = plt.subplots(nr_rows, nr_cols, figsize=(nr_cols*4,nr_rows*4), squeeze=False)
for r in range(0, nr_rows):
for c in range(0,nr_cols):
# Flatten the (row, column) position into an index into li_cat_cols.
i = r*nr_cols+c
# Grid cells beyond the number of columns to plot are skipped.
if i < len(li_cat_cols):
sns.set(style="darkgrid")
# NOTE(review): axs[r][c] is never passed to or used by the calls below, so nothing here
# targets the grid created above - presumably why the grid stays empty; confirm.
bp=sns.pairplot(numerical_df, x_vars=li_cat_cols[i], y_vars='price',height=4, aspect=1, kind='scatter')
bp.set(xlabel=li_cat_cols[i], ylabel='Price')
plt.tight_layout()
plt.show()
Not sure what I am missing.
I think you didn't connect the scatter plots generated in the loop to the subplot axes of your grid.
Maybe this solution, which uses pandas' built-in plotting, will work for you:
For example,
1. Let's simply define an empty pandas DataFrame.
# Start from an empty DataFrame; the columns are added one by one afterwards.
numerical_df = pd.DataFrame([])
2. Create some random features and price depending on them:
# Six random feature columns with 100 samples each, on different scales.
n_samples = 100
numerical_df['A'] = np.random.randn(n_samples)
numerical_df['B'] = np.random.randn(n_samples) * 10
numerical_df['C'] = np.random.randn(n_samples) * -10
numerical_df['D'] = np.random.randn(n_samples) * 2
numerical_df['E'] = 20 * (np.random.randn(n_samples) ** 2)
numerical_df['F'] = np.random.randn(n_samples)

# 'price' is a linear combination of A, B, C, E and D; F does not enter it.
numerical_df['price'] = (
    2 * numerical_df['A']
    + 0.5 * numerical_df['B']
    - 9 * numerical_df['C']
    + numerical_df['E']
    + numerical_df['D']
)
3. Define number of rows and columns. Create a subplots space with nr_rows and nr_cols.
# Grid dimensions: 2 x 4 = 8 cells, enough for the six feature columns.
nr_rows = 2
nr_cols = 4
# Create the empty grid only; the scatter plots are drawn in step 4, so the
# plotting loop is not repeated here (it would draw every plot twice).
fig, axes = plt.subplots(nrows=nr_rows, ncols=nr_cols, figsize=(15, 8))
4. Enumerate each feature in dataframe and plot a scatterplot with price:
# Every column except the last one ('price') is plotted against 'price'.
for idx, feature in enumerate(numerical_df.columns[:-1]):
    # Row-major position in the grid. nr_cols is 4 here, so this is
    # axes[idx // 4, idx % 4]; using nr_cols keeps it right if the grid changes.
    target_ax = axes[idx // nr_cols, idx % nr_cols]
    numerical_df.plot(feature, "price", kind="scatter", ax=target_ax)
where axes[idx // 4, idx % 4] defines the location of each scatterplot in a matrix you create in (3.)
So, we got a matrix plot:
Scatterplot matrix

Represent a column other than x, y on plot using matplotlib, python 2.7

I want to simulate some data and display the points in different colors for different categories. I have three columns: two of them are used as x and y, and the third, which has two categories, should be reflected in the plot.
# Coordinates as NumPy arrays; p and q are defined outside this snippet.
y = np.array(q)
x = np.array(p)
fig = plt.figure(figsize = (18,18))
# NOTE(review): plt.show() is called before any of the plotting calls below - confirm this order is intended.
plt.show()
for t in range(6000):
# NOTE(review): add_subplot(2,1,1) is requested again on every iteration.
ax = fig.add_subplot(2,1,1)
for i in s[t:t+4]: # s is a list that contains the third column
# NOTE(review): i is an element of s (compared with the string 'Match'), yet it is also used
# to index x and y - an integer position is presumably what was intended; confirm.
if i == 'Match':
ax.plot(x[i], y[i], 'bs')
else:
ax.plot(x[i],y[i],'ro')
There are lots of ways to do this, here is one using Pandas
#generate data
# Generate demo data: 100 random (x, y) points, each labelled 'Match' or '-'.
df = pd.DataFrame(np.random.random(size=(100, 2)), columns=['x', 'y'])
df.loc[:, 'cat'] = ['Match' if np.random.randint(0, 2) == 1 else '-' for i in range(100)]

# One boolean mask, reused for x and y so both always come from the same rows.
# (The second plot used to pair the x of non-matching rows with the y of
# matching rows.)
is_match = df['cat'] == 'Match'
plt.plot(df.loc[is_match, 'x'], df.loc[is_match, 'y'], 'bs')
plt.plot(df.loc[~is_match, 'x'], df.loc[~is_match, 'y'], 'ro')

Categories