I have a pandas DataFrame containing NaN values. I want to make a bar plot with the index values on the x axis and one bar per column, grouped by index value. I would like to plot only the bars that have an actual value.
This is what I have tried so far, starting from this example:
# Three rows (A, B, C) and three columns; None marks a missing value.
df = pandas.DataFrame(
    {
        'foo': [1, None, None],
        'bar': [None, 2, 0.5],
        'col': [1, 1.5, None],
    },
    index=["A", "B", "C"],
)
# Default grouped bar chart: one group per index value, one bar per column.
df.plot.bar()
plt.show()
I can produce this plot:
What I would like is to remove the blank spaces left for the NaN columns, i.e. to compact the bars and center each group above its x tick.
You can do something like the code below, by going through each row of the dataframe
and checking each column for NaNs.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
    {"foo": [1, None, None], "bar": [None, 2, 0.5], "col": [1, 1.5, None]},
    index=["A", "B", "C"],
)

# One fixed colour per column, so a column looks the same in every group.
colors = {"foo": "blue", "bar": "orange", "col": "green"}

fig, ax = plt.subplots(figsize=(10, 6))

# Width of a single bar, and the empty space left between two row groups.
width = 1
group_gap = width

# Tick labels (row names) and tick positions (centre of each row's group).
x_ticks, x_ticks_pos = [], []

# Centre of the next bar to be drawn.
next_pos = 0.0

# Go through each row of the dataframe and draw only the non-NaN cells,
# packing them side by side so no slot is reserved for a missing value.
for row_name, row in df.iterrows():
    present = row.dropna()
    if present.empty:
        # A row without any value gets neither bars nor a tick.
        continue

    # Bar centres are spaced by `width`, so the bars touch for any width.
    positions = [next_pos + i * width for i in range(len(present))]
    for pos, (column, value) in zip(positions, present.items()):
        ax.bar(pos, value, color=colors[column], width=width, label=column)

    # The tick sits in the middle of the group, i.e. halfway between the
    # first and the last bar centre.
    x_ticks_pos.append((positions[0] + positions[-1]) / 2)
    x_ticks.append(row_name)

    # Leave the same gap after every group, regardless of the row number.
    next_pos = positions[-1] + width + group_gap

# Now set the x ticks.
ax.set_xticks(x_ticks_pos)
ax.set_xticklabels(x_ticks)

# Every bar carries its column name as label, so the raw legend would
# repeat entries; keep a single handle per label.
# The de-duplication below is taken from:
# https://stackoverflow.com/questions/13588920/stop-matplotlib-repeating-labels-in-legend
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys())
plt.show()
Result:
Related
I draw a normal distribution plot based on the dataframe, then I want to annotate some data points in the plot as the index or column value of that data points. For example, here we have a dataframe:
df = pd.DataFrame({'col1': ['A', 'B', 'C'], 'col2': [3, 7, 9]})
Then I draw a normal distribution plot using the values in 'col2'.
Now, I want to annotate or label some data points with the values in 'col1'.
For example, I want to show the 'col1' value (or text) which is 'B' on the data point '7' in the normal distribution plot.
Use Matplotlib's built-in annotate feature:
# ---------------------- Just creating a mock dataframe ----------------------
import random
import string
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (20, 10)

col1 = [random.choice(string.ascii_uppercase) for _ in range(500)]
col2 = [random.choice(range(0, 25)) for _ in range(500)]
df = pd.DataFrame({'col1': col1, 'col2': col2})
# -----------------------------------------------------------------------------

# The col1 value whose data points should be annotated.
# (Named `target` rather than `string` so the `string` module imported above
# is not shadowed.)
target = 'A'

# Boolean mask of the rows whose col1 equals the target.
# If you are looking for a word or phrase inside a longer string, use
# df.col1.str.contains(target) instead of df.col1.eq(target).
matches = df.col1.eq(target)

# Plot the histogram and grab the bar heights and the bin edges.
counts, bin_edges, _ = plt.hist(df.col2)

# Leave head room above the tallest bar and place the annotations inside it.
plt.ylim(0, counts.max() + 10)
label_y = counts.max() + 5

# Several matching rows can share the same col2 value; annotate each distinct
# x position once instead of stacking identical texts on top of each other.
for x_value in df.loc[matches, 'col2'].unique():
    plt.annotate(target, (x_value, label_y), fontsize=20)
plt.show()
I have a dataframe with 4 columns. I want to plot it with sns.displot as follows:
# KDE facet grid: one column of subplots per Session, one row per Timepoint,
# with a filled density curve per Testset inside each subplot.
g = sns.displot(
    data=dataframe,
    x="value",
    hue="Testset",
    col="Session",
    row="Timepoint",
    kind="kde",
    fill=True,
    height=25,
)
It produces the following plot with empty subplots, because I don't have all the combinations of values. Is there a way to remove the empty plots and stack the remaining ones under one another?
import pandas as pd
import matplotlib.pyplot as plt
from random import random
# Two repeated experiments (rows "k=1", "k=2") for four groups (columns A-D);
# every cell holds a (male, female) pair of random observations.
DF = pd.DataFrame(
    {group: [(random(), random()) for _ in range(2)] for group in "ABCD"},
    index=["k=1", "k=2"],
)
Each row of my dataframe contains the observations for one of the repeated experiments, and each column represents a group of subjects. An entry is a tuple of male datum and female datum. So I wanna plot the observations against groups, differentiating the genders by marker shape and color. I tried the below but it yielded too long a legend. How can I only display two labels (Male, Female)?
plt.figure()
# One pair of plot calls per experiment. Each call passes a label, so the
# legend ends up with one "Male" and one "Female" entry per row.
for experiment in DF.index:
    observations = DF.loc[experiment]
    male_values = [male for male, _ in observations]
    female_values = [female for _, female in observations]
    plt.plot(DF.columns, male_values, 'b.', label="Male")
    plt.plot(DF.columns, female_values, 'g+', label="Female")
plt.legend(loc="upper right")
plt.show()
BTW, since the values are rather close, can I add a little bit random noise to the x-coordinate of each point (like jittering in R) so they can be more discernible?
Let's try reshaping your data before plot:
# Flatten the frame to one entry per (group, experiment) cell, then split each
# (male, female) tuple into two numeric columns indexed by the group name.
stacked = DF.unstack()
plot_df = pd.DataFrame(
    stacked.tolist(),
    columns=['Male', 'Female'],
    index=stacked.index.get_level_values(0),
)

# One marker style per gender column, in column order.
style_by_column = dict(zip(plot_df.columns, ['b.', 'g+']))

plt.figure(figsize=(10, 6))
# A single plot call per gender, hence a single legend entry per gender.
for column_name, marker in style_by_column.items():
    plt.plot(plot_df.index, plot_df[column_name], marker, label=column_name)
plt.legend(loc="upper right")
plt.show()
Output:
A quick solution would be to only add the label for the first row.
To add jitter to the x-positions, you can loop through generated elements, fetch their x-positions, add some random value and put them back. As the current distance is one, adding a number between -0.4 and 0.4 should work. The limits of the x axis need to be recalculated via relim and autoscale.
from random import random

import matplotlib.pyplot as plt
import numpy as np  # needed for the jitter below
import pandas as pd

# Two repeated experiments (rows) for four groups (columns); every cell holds
# a (male, female) pair of random observations.
DF = {"A": [(random(), random()), (random(), random())],
      "B": [(random(), random()), (random(), random())],
      "C": [(random(), random()), (random(), random())],
      "D": [(random(), random()), (random(), random())]}
DF = pd.DataFrame(DF, index=["k=1", "k=2"])

fig, ax = plt.subplots()
for position, row in enumerate(DF.index):
    # Only the first row contributes legend entries; label=None keeps the
    # remaining lines out of the legend, so each gender shows up once.
    is_first = position == 0
    observations = DF.loc[row]
    ax.plot(DF.columns, [a for a, b in observations], 'b.',
            label="Male" if is_first else None)
    ax.plot(DF.columns, [b for a, b in observations], 'g+',
            label="Female" if is_first else None)

# Jitter: shift every point horizontally by a random amount. Neighbouring
# groups are one unit apart, so offsets within +/-0.4 keep them separated.
for line in ax.lines:
    # get_xdata() returns strings, but get_xydata() is fully numeric
    xs = line.get_xydata()[:, 0]
    line.set_xdata(xs + np.random.uniform(-0.4, 0.4, xs.shape))

# The x data changed after plotting, so recompute the data limits and rescale.
ax.relim()
ax.autoscale(enable=True)
ax.legend(loc="upper right")
plt.show()
How to sort the values on x axis in an ascending order in a scatter plot?
fig, ax = plt.subplots(figsize=(18, 8))
ax.scatter(data=ipl, x='budget', y='player')

# Thin out the x tick labels: keep every 4th label, hide the others.
keep_every = 4
tick_labels = ax.xaxis.get_ticklabels()
for position, tick_label in enumerate(tick_labels):
    if position % keep_every:
        tick_label.set_visible(False)
In the image, the values on the x axis are randomly arranged.
You can use a small regex to extract the values, convert them to int, and sort by the integer values:
# Dummy data: ten budgets formatted like "$123M" and one random integer each.
ipl = pd.DataFrame(
    {
        'budget': ['${}M'.format(amount) for amount in np.random.randint(0, 200, 10)],
        'player': np.random.randint(-10, 10, 10),
    }
)

# Pull the digits out of every budget string, store them as integers, and
# order the rows by that number so the budgets ascend numerically.
ipl['values'] = (
    ipl['budget']
    .str.extract('([0-9]+)', expand=False)
    .astype(int)
)
ipl = ipl.sort_values(by='values')
I am trying to create a grid of subplots for predetermined x & y data. The function should iterate through a pandas DataFrame, identify categorical variables and plot the x & y data with a line for each level of a given categorical variable. The number of plots is equal to the number of categorical variables, and the number of lines on each plot should reflect the number of categories for that variable.
I initially tried to group the DataFrame in a for loop on a given categorical variable, but I have had mixed results. I think my issue is in how I am assigning which axis the lines are drawn on.
def grouping_for_graphs(df, x_col, y_col, category, func):
    """Aggregate ``y_col`` for every (``x_col``, ``category``) combination.

    Args:
        df: DataFrame holding the three columns named below.
        x_col: Label of the column used for the x axis.
        y_col: Label of the column whose values are aggregated.
        category: Label of the categorical column to split on.
        func: Callable applied to the ``y_col`` values of each group.

    Returns:
        The grouped result with the group keys moved back into ordinary
        columns. When ``func`` returns one value per group, the columns
        are ``[x_col, category, y_col]``.

    Raises:
        KeyError: If one of the column labels is not present in ``df``.
    """
    grouped = df.groupby([x_col, category])[y_col].apply(func)
    # reset_index turns the (x_col, category) group keys into plain columns.
    return grouped.reset_index()
# Categorical variables to plot: the object-dtype columns that have fewer
# than 7 distinct values.
col_list = list(df.select_dtypes(include=['object']).columns)
cat_list = [name for name in col_list if len(df[name].unique()) < 7]
# create plots and axes
# (a 2x2 grid, flattened so the axes can be paired with the categories)
fig, axs = plt.subplots(2, 2, figsize=(30,24))
axs = axs.flatten()
# pick plot function
# NOTE: plt.plot is the pyplot-level function, not a method of one of the axes in axs
plot_func = plt.plot
# plot this
# zip() stops at the shorter input, so at most four categories get a subplot
for ax, category in zip(axs, cat_list):
df_grouped = grouping_for_graphs(df,x_col, y_col,category,agg_func)
# re-read the x / y column names from the grouped frame (first and last column)
x_col = df_grouped.columns[0]
y_col = df_grouped.columns[-1]
# NOTE(review): x_lab and y_lab are not defined in this snippet -- presumably x_col / y_col were meant; confirm
category = str(list(df_grouped.columns.drop([x_lab, y_lab]))[0])
for feature in list(df_grouped[category].unique()):
# x and y values of the rows that belong to this level of the category
X = df_grouped[df_grouped[category] == feature][x_col]
y = df_grouped[df_grouped[category] == feature][y_col]
# NOTE: this rebinds the attribute ax.plot to the return value of plot_func(X, y) instead of calling ax.plot
ax.plot = plot_func(X,y)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_title(feature)
Other than getting an error that ax.plot is a 'list' object and is not callable, all the lines drawn are put on the final plot of the subplots.
I am confused with your plot_func. Remove this and just directly plot using ax.plot(X, y). The modified line is highlighted by a comment
# One subplot per categorical variable, one line per level of that variable.
fig, axs = plt.subplots(2, 2, figsize=(30, 24))
axs = axs.flatten()

# zip() stops at the shorter input, so at most four categories are drawn.
for ax, category in zip(axs, cat_list):
    # grouping_for_graphs returns the columns [x_col, category, y_col], so the
    # names passed in can be reused directly; nothing needs to be re-derived
    # from the grouped frame.
    df_grouped = grouping_for_graphs(df, x_col, y_col, category, agg_func)

    for feature in df_grouped[category].unique():
        level_rows = df_grouped[df_grouped[category] == feature]
        # <--- Modified here: draw on this subplot's own axes
        ax.plot(level_rows[x_col], level_rows[y_col], label=str(feature))

    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    # The subplot represents the whole categorical variable, so it is titled
    # with the variable name; the individual levels are named in the legend.
    ax.set_title(category)
    ax.legend()