import pandas as pd
import numpy as np
# Reproducible sample data: 100 random (Month, Code) rows with two integer measures.
np.random.seed(365)
rows = 100
# The dict entries are evaluated top to bottom, so the random draws happen in this order.
data = {
    'Month': np.random.choice(['2014-01', '2014-02', '2014-03', '2014-04'], size=rows),
    'Code': np.random.choice(['A', 'B', 'C'], size=rows),
    'ColA': np.random.randint(5, 125, size=rows),
    'ColB': np.random.randint(0, 51, size=rows),
}
df = pd.DataFrame(data)

# Drop the (A, 2014-04) and (C, 2014-03) combinations.
is_a_april = (df['Code'] == 'A') & (df['Month'] == '2014-04')
is_c_march = (df['Code'] == 'C') & (df['Month'] == '2014-03')
df = df[~(is_a_april | is_c_march)]

# Sum ColA / ColB for every remaining (Code, Month) pair.
dfg = df.groupby(['Code', 'Month']).sum()
For the data above, I wish to draw a stacked bar plot:
# unstack(level=0) moves the 'Code' index level into the columns, leaving 'Month' as the index.
dfg.unstack(level=0).plot(kind='bar', stacked =True)
I wish to stack over the 'Code' column, but the above stacks over 'Month'. Why? Is there a better way to draw a stacked plot with this data?
The index of the input dataframe is used by default as x-value in plot.bar
IIUC, you need:
# Move the 'Month' index level into the columns so that 'Code' stays as the index.
dfg.unstack(level='Month').plot.bar(stacked=True)
legend position:
# Plot without the per-axes legend, then attach a single legend to the figure,
# anchored just outside the right edge and vertically centred.
by_code = dfg.unstack(level=1)
ax = by_code.plot(kind='bar', stacked=True, legend=False)
ax.figure.legend(bbox_to_anchor=(1, 0.5), loc='center left')
Related
I have this dataframe, for that I'm trying to create the piechart similar to the attached image.
| Index | Category |
|-------|----------|
| SE    | 3        |
| COL   | 2        |
| PE    | 1        |
| DP-PD | 1        |
| COL   | 1        |
| OTH   | 1        |
I have tried the following, it's creating a pie chart, but not as expected.
import matplotlib.pyplot as plt
# Category codes ('index') and their observation counts ('Category').
data = {'index': ['SE', 'COL', 'PE', 'OTH', 'DP-PD'], 'Category': [3, 2, 1, 1,1]}
# Build the DataFrame from the dict above.
df = pd.DataFrame(data)
# NOTE(review): labels are taken from the numeric "Category" column, so each wedge is
# labelled with its count rather than its category code, and no colors are passed here.
plt.pie(df["Category"], labels = df["Category"],startangle=90)
plt.title("Observation statistics", fontsize = 24)
I need the same color which is mentioned in the legend for each category. These are the color codes:
{'DP-PD': '#1E90FF', 'ID': '#FFA500', 'ENC': '#D3D3D3', 'SE': '#FFFF00',
'COL': '#FF0000', 'GL': '#32CD32', 'COT': '#0000CD', 'PE': '#A52A2A',
'FI': '#000000', 'OTH': '#00BFFF'}
I'd like the following output:
My dataframe has 6 columns. I would like to draw a scatter plot of each column against every other column. Is there a way to loop through all columns of the dataframe to draw the scatter plots, rather than manually selecting x and y?
Example
import pandas as pd
import numpy as np
# 100 rows of random values in six columns named 'a' to 'f'.
df = pd.DataFrame(np.random.randn(100, 6), columns=['a', 'b', 'c', 'd', 'e', 'f'])
df.head()
# Goal: avoid choosing x and y by hand; instead, pair each column with the
# other columns automatically. The calls below are the manual version.
df.plot(kind='scatter', x='a', y='b', color='r')
df.plot(kind='scatter', x='a', y='c', color='r')
df.plot(kind='scatter', x='a', y='d', color='r')
df.plot(kind='scatter', x='a', y='e', color='r')
df.plot(kind='scatter', x='a', y='f', color='r')
Of course, you can iterate over columns like this
# Iterating over a DataFrame yields its column labels, one per pass.
for column in df:
    print(df[column])
I have a slightly odd csv file where the month column is repeated as such. My goal is to create a bar graph where each month has two columns of y (from both a and b). I have tried to approach this by separating the data frame into two - a only and b only - but the repetition of the month column gets in the way. Fairly new to Python and Pandas so perhaps there is a function I'm not aware of? Any help is appreciated.
month cond. y
Jan a 4
Jan b 8
Feb a 2
Feb b 9
March a 3
March b 7
Perhaps the most common way to approach this problem is to reshape the long-form data to wide-form via pivot and then DataFrame.plot:
import pandas as pd
from matplotlib import pyplot as plt
# Long-form sample data: one row per (month, condition) pair.
records = [
    ('Jan', 'a', 4),
    ('Jan', 'b', 8),
    ('Feb', 'a', 2),
    ('Feb', 'b', 9),
    ('March', 'a', 3),
    ('March', 'b', 7),
]
df = pd.DataFrame(records, columns=['month', 'cond.', 'y'])

# Reshape to wide form (one column per condition) and draw grouped bars.
wide = df.pivot(index='month', columns='cond.', values='y')
wide.plot.bar(rot=0)
plt.tight_layout()
plt.show()
There is a noticeable issue in that the x-axis columns appear out of order as they are alphabetically ordered and not ordered by Date. One option would be to reindex before plotting. There would be more options if the month column was regular, but since it contains both full month names and abbreviations manually reindexing is likely the best option.
import pandas as pd
from matplotlib import pyplot as plt
# Long-form sample data: one row per (month, condition) pair.
df = pd.DataFrame(
    {
        'month': ['Jan', 'Jan', 'Feb', 'Feb', 'March', 'March'],
        'cond.': ['a', 'b', 'a', 'b', 'a', 'b'],
        'y': [4, 8, 2, 9, 3, 7],
    }
)

# Reshape to wide form, then put the months back into calendar order so they
# appear correctly on the x-axis before plotting.
month_order = ['Jan', 'Feb', 'March']
wide = df.pivot(index='month', columns='cond.', values='y')
wide = wide.reindex(month_order)
wide.plot(kind='bar', rot=0)

plt.tight_layout()
plt.show()
Seaborn is highly popular in solving these types of questions as the hue argument allows the reshaping step to be avoided. Additionally x will be in order of appearance in the frame so reindex is also unnecessary (assuming the data appears in the correct order in the source DataFrame)
sns.barplot:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# (optional) Switch matplotlib to the seaborn theme.
sns.set_theme()

# Long-form sample data, kept as-is: `hue` does the grouping, so no pivot is needed.
months = ['Jan', 'Jan', 'Feb', 'Feb', 'March', 'March']
conditions = ['a', 'b', 'a', 'b', 'a', 'b']
values = [4, 8, 2, 9, 3, 7]
df = pd.DataFrame({'month': months, 'cond.': conditions, 'y': values})

sns.barplot(x='month', y='y', hue='cond.', data=df)
plt.tight_layout()
plt.show()
Using the hue attribute to categorize also works
import seaborn as sns
# Column names must match the frame's columns exactly ('month', 'cond.', 'y').
sns.barplot(data=df, x='month', y='y', hue='cond.')
result_plot
I am using the following code to produce a pie plot.
My question is, how do I mask/hide the numbers inside the pie chart?
I do not want the numbers 0.62, 0.31 and 0.02 inside the pie chart to be visible.
Thanks in advance.
import pandas as pd
import matplotlib.pyplot as plt
# Three labelled shares; 'Perc' holds fractions, scaled by 100 when plotted.
df99 = pd.DataFrame({
'Data': ['A', 'B', 'C'],
'Perc': [0.62, 0.31, 0.02]})
plt.pie(df99['Perc']*100, colors=['#002c4b','#392e2c','#92847a','#ccc2bb','#6b879d','#7FBAA4','#8E654C','#006CB8','#CBBBE9','#9778D3'],counterclock=False,startangle=-270,pctdistance=1.2,labeldistance=1.2,labels=df99['Data'],
# autopct formats p scaled by sum(Perc)/100 to two decimals; this callback is
# what produces the numbers (0.62, 0.31, 0.02) drawn on the chart.
autopct=lambda p: f"{p*df99['Perc'].sum()/100:.2f}")
plt.show()
IIUC,
import pandas as pd
import matplotlib.pyplot as plt
# Three labelled shares; 'Perc' holds fractions, scaled by 100 when plotted.
df99 = pd.DataFrame({'Data': ['A', 'B', 'C'], 'Perc': [0.62, 0.31, 0.02]})

palette = [
    '#002c4b', '#392e2c', '#92847a', '#ccc2bb', '#6b879d',
    '#7FBAA4', '#8E654C', '#006CB8', '#CBBBE9', '#9778D3',
]

# autopct=None means no numeric text is drawn for the wedges.
plt.pie(
    df99['Perc'] * 100,
    labels=df99['Data'],
    colors=palette,
    counterclock=False,
    startangle=-270,
    pctdistance=1.2,
    labeldistance=1.2,
    autopct=None,
)
plt.show()
Output:
Let's use pandas plot also,
# Index by label and scale the fractions by 100, then let pandas draw the pie.
scaled = df99.set_index('Data').mul(100)
scaled.plot.pie(
    y='Perc',
    colors=[
        '#002c4b', '#392e2c', '#92847a', '#ccc2bb', '#6b879d',
        '#7FBAA4', '#8E654C', '#006CB8', '#CBBBE9', '#9778D3',
    ],
    counterclock=False,
    startangle=-270,
)
Output:
I'm trying to build a heatmap to illustrate the correlation between indexes and a range (string).
# Sample report metrics plus a string 'Range' label for each report.
column_order = ['Report', 'Hours', 'Wage', 'Worker', 'Buyer', 'Range']
data = {
    'Report': [1, 2, 3, 4],
    'Hours': [30, 45, 85, 24],
    'Wage': [100, 446, 245, 632],
    'Worker': [321, 63, 456, 234],
    'Buyer': [36, 53, 71, 52],
    'Range': ['High', 'Medium', 'Low', 'Low'],
}
# Build the frame, then select the columns in the intended order.
df = pd.DataFrame(data)[column_order]
My expected result would be a heatmap with 'Hours', 'Wage', 'Worker', and 'Buyer' on the left as indexes and three categories in 'Range' on the bottom.
How do I achieve the desired result using seaborn heatmap?
Thanks in advance!
I appreciate any help!!
# Sample report metrics plus a string 'Range' label for each report.
data = {'Report': [1,2,3,4],
        'Hours': [30,45,85,24],
        'Wage': [100,446,245,632],
        'Worker': [321,63,456,234],
        'Buyer': [36,53,71,52],
        'Range': ['High', 'Medium', 'Low', 'Low']
        }
df = pd.DataFrame(data, columns = ['Report', 'Hours', 'Wage', 'Worker', 'Buyer', 'Range'])
# 'Range' holds strings, so keep only the numeric columns before correlating:
# recent pandas versions raise on non-numeric data instead of dropping it silently.
df_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12, 9))
# Pin the colour scale to the full correlation range [-1, 1], centred on 0.
sns.heatmap(df_corr, square=True, vmax=1, vmin=-1, center=0)
print(df_corr)
Report Hours Wage Worker Buyer
Report 1.000000 0.103434 0.774683 0.103496 0.595586
Hours 0.103434 1.000000 -0.333933 0.548300 0.845140
Wage 0.774683 -0.333933 1.000000 -0.542259 0.208270
Worker 0.103496 0.548300 -0.542259 1.000000 0.356177
Buyer 0.595586 0.845140 0.208270 0.356177 1.000000
Just calculate the correlation coefficients and draw them with a heatmap.