A little info: I'm very new to programming and this is a small part of my first script. The goal of this particular segment is to display a seaborn heatmap with vertical depth on the y-axis, time on the x-axis, and the intensity of a scientific measurement as the heat value.
I'd like to apologize if this has been answered elsewhere, but my searching abilities must have failed me.
# Plot capf for a single well as a heatmap: depth on the y-axis, date on the x-axis.
sns.set()
nametag = 'Well_4_all_depths_capf'
Dp = D[D.well == 'well4']
print(Dp.date)
### depth, date and capf are all columns of a pandas dataframe
# The pivot arguments are passed by keyword: pandas 2.0 made them
# keyword-only, so the positional form raises a TypeError there.
heat = Dp.pivot(index="depth", columns="date", values="capf")
plt.title(nametag)
sns.heatmap(heat, linewidths=.25)
# The file name is the output path prefix followed by the name tag.
plt.savefig('%s%s.png' % (pathheatcapf, nametag), dpi = 600)
This is what `print(Dp.date)` prints, so I'm pretty sure the dates in the dataframe are already in the format I want, namely year, month, day:
0 2016-08-09
1 2016-08-09
2 2016-08-09
3 2016-08-09
4 2016-08-09
5 2016-08-09
6 2016-08-09
...
But, when I run it the date axis always prints with blank times (00:00 etc) that I don't want.
Is there a way to remove these from the date axis?
Is the problem that in a cell above I used this function to scan the file name and make a column with the date??? Is it wrong to use datetime instead of just a date function?
# Build the 'date' column from the first eight characters of each file name:
# characters 0-3, 4-5 and 6-7 are joined with dashes and parsed by pd.to_datetime.
# Presumably the file names start with YYYYMMDD; verify against the actual files.
D['date']=pd.to_datetime(['%s-%s-%s' %(f[0:4],f[4:6],f[6:8]) for f in
D['filename']])
You have to format the dates with the strftime function to get the x-tick labels you want:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random
# Sample timestamps: each one is "now" shifted back by either 0 or x days.
# range() replaces xrange(), which only exists in Python 2.
dates = [datetime.today() - timedelta(days=x * random.getrandbits(1)) for x in range(25)]

df = pd.DataFrame({
    'depth': [0.1, 0.05, 0.01, 0.005, 0.001] * 5,
    'date': dates,
    'value': [-4.1808639999999997, -9.1753490000000006, -11.408113999999999, -10.50245, -8.0274750000000008,
              -0.72260200000000008, -6.9963940000000004, -10.536339999999999, -9.5440649999999998, -7.1964070000000007,
              -0.39225599999999999, -6.6216390000000001, -9.5518009999999993, -9.2924690000000005, -6.7605589999999998,
              -0.65214700000000003, -6.8852289999999989, -9.4557760000000002, -8.9364629999999998, -6.4736289999999999,
              -0.96481800000000006, -6.051482, -9.7846860000000007, -8.5710630000000005, -6.1461209999999999],
})

pivot = df.pivot(index='depth', columns='date', values='value')

sns.set()
# The labels must come from the pivoted columns (one per heatmap column, in
# plotted order), not from df['date'], which is in row order. Passing them as
# xticklabels also stops seaborn from thinning the ticks, so the number of
# labels always matches the number of ticks.
ax = sns.heatmap(pivot, xticklabels=pivot.columns.strftime('%d-%m-%Y'))
plt.xticks(rotation=-90)

plt.show()
Example with standard heatmap datetime labels
import pandas as pd
import seaborn as sns
# One row per date from 2019-01-01 to 2020-12-01, with four columns of random integers in [0, 100).
dates = pd.date_range('2019-01-01', '2020-12-01')
df = pd.DataFrame(np.random.randint(0, 100, size=(len(dates), 4)), index=dates)
# Heatmap with seaborn's default tick labels for a datetime index.
sns.heatmap(df)
We can create some helper classes/functions to get to some better looking labels and placement. AxTransformer enables conversion from data coordinates to tick locations, set_date_ticks allows custom date ranges to be applied to plots.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections.abc import Iterable
from sklearn import linear_model
class AxTransformer:
    """Convert data values into tick locations on an existing axis.

    ``fit`` regresses the current tick locations of an axis on the values
    shown in its tick labels; ``transform`` then predicts the location for
    any other value.
    """

    def __init__(self, datetime_vals=False):
        """Create an unfitted transformer.

        Args:
            datetime_vals: When truthy, tick values are parsed with
                ``pd.to_datetime`` and converted to 64-bit integers before
                fitting and predicting.
        """
        self.datetime_vals = datetime_vals
        self.lr = linear_model.LinearRegression()

    def process_tick_vals(self, tick_vals):
        """Return ``tick_vals`` as a NumPy array.

        A non-iterable value or a single string is wrapped in a one-element
        list first, so the result is always array-like.
        """
        if not isinstance(tick_vals, Iterable) or isinstance(tick_vals, str):
            tick_vals = [tick_vals]

        if self.datetime_vals:
            # Ask for int64 explicitly: the platform default ``int`` can be
            # 32-bit (e.g. Windows with older NumPy), which cannot hold
            # epoch timestamps.
            tick_vals = pd.to_datetime(tick_vals).astype('int64').values

        return np.array(tick_vals)

    def fit(self, ax, axis='x'):
        """Fit the value -> location regression from the ticks of ``ax``.

        Args:
            ax: Axes object exposing ``get_xaxis()`` / ``get_yaxis()``.
            axis: ``'x'`` or ``'y'``; selects which axis is read.
        """
        axis_obj = getattr(ax, f'get_{axis}axis')()

        tick_locs = axis_obj.get_ticklocs()
        # get_text() is the public accessor for the label string.
        tick_vals = self.process_tick_vals(
            [label.get_text() for label in axis_obj.get_ticklabels()]
        )

        self.lr.fit(tick_vals.reshape(-1, 1), tick_locs)

    def transform(self, tick_vals):
        """Return the predicted tick locations for ``tick_vals``."""
        tick_vals = self.process_tick_vals(tick_vals)
        return self.lr.predict(np.array(tick_vals).reshape(-1, 1))
def set_date_ticks(ax, start_date, end_date, axis='y', date_format='%Y-%m-%d', **date_range_kwargs):
    """Replace the ticks of one axis with a regular range of dates.

    Args:
        ax: Axes whose ticks are replaced; its current tick labels are used
            to fit the date -> location mapping.
        start_date: First date passed to ``pd.date_range``.
        end_date: Last date passed to ``pd.date_range``.
        axis: ``'x'`` or ``'y'``; the axis to relabel.
        date_format: ``strftime`` format used for the new tick labels.
        **date_range_kwargs: Forwarded to ``pd.date_range`` (e.g. ``freq``).

    Returns:
        The same ``ax`` object.
    """
    dt_rng = pd.date_range(start_date, end_date, **date_range_kwargs)

    ax_transformer = AxTransformer(datetime_vals=True)
    ax_transformer.fit(ax, axis=axis)

    getattr(ax, f'set_{axis}ticks')(ax_transformer.transform(dt_rng))
    getattr(ax, f'set_{axis}ticklabels')(dt_rng.strftime(date_format))

    # tick_params() discards bottom/top/labelbottom when axis='y', so the
    # side keywords have to match the axis being configured.
    if axis == 'y':
        ax.tick_params(axis=axis, which='both', left=True, right=False, labelleft=True)
    else:
        ax.tick_params(axis=axis, which='both', bottom=True, top=False, labelbottom=True)

    return ax
These provide us a lot of flexibility, e.g.
# Heatmap of df with custom date ticks; freq='3MS' is forwarded to pd.date_range,
# and set_date_ticks works on the y-axis by default.
fig, ax = plt.subplots(dpi=150)
sns.heatmap(df, ax=ax)
set_date_ticks(ax, '2019-01-01', '2020-12-01', freq='3MS')
or if you really want to get weird you can do stuff like
# Same plot with a narrower date range, freq='2MS', and a custom strftime label format.
fig, ax = plt.subplots(dpi=150)
sns.heatmap(df, ax=ax)
set_date_ticks(ax, '2019-06-01', '2020-06-01', freq='2MS', date_format='%b `%y')
For your specific example you'll have to pass axis='x' to set_date_ticks
First, the 'date' column must be converted to a datetime dtype with pandas.to_datetime
If the desired result is to only have the dates (without time), then the easiest solution is to use the .dt accessor to extract the .date component. Alternatively, use dt.strftime to set a specific string format.
strftime() and strptime() Format Codes
df.date.dt.strftime('%H:%M') would extract hours and minutes into a string like '14:29'
In the example below, the extracted date is assigned to the same column, but the value can also be assigned as a new column.
pandas.DataFrame.pivot_table is used to apply an aggregation function when there are multiple values in a column for each index; pandas.DataFrame.pivot should be used if there is only a single value.
This is better than .groupby because the dataframe is correctly shaped to be easily plotted.
Tested in python 3.8.11, pandas 1.3.2, matplotlib 3.4.3, seaborn 0.11.2
import pandas as pd
import numpy as np
import seaborn as sns
# create sample data
# The day is zero-padded so every timestamp is a valid ISO 8601 string
# ('2016-08-09T...', not '2016-08-9T...') and all strings share one format.
dates = [f'2016-08-{d:02d}T00:00:00.000000000' for d in range(9, 26, 2)] + ['2016-09-09T00:00:00.000000000']
depths = np.arange(1.25, 5.80, 0.25)
np.random.seed(365)
p1 = np.random.dirichlet(np.ones(10), size=1)[0]  # random probabilities for random.choice
p2 = np.random.dirichlet(np.ones(19), size=1)[0]  # random probabilities for random.choice
data = {'date': np.random.choice(dates, size=1000, p=p1), 'depth': np.random.choice(depths, size=1000, p=p2), 'capf': np.random.normal(0.3, 0.05, size=1000)}
df = pd.DataFrame(data)
# display(df.head())
date depth capf
0 2016-08-19T00:00:00.000000000 4.75 0.339233
1 2016-08-19T00:00:00.000000000 3.00 0.370395
2 2016-08-21T00:00:00.000000000 5.75 0.332895
3 2016-08-23T00:00:00.000000000 1.75 0.237543
4 2016-08-23T00:00:00.000000000 5.75 0.272067
# parse the date strings into datetimes, then keep only the calendar-date part
df['date'] = pd.to_datetime(df['date']).dt.date
# average capf for every (depth, date) pair, laid out as depth rows by date columns;
# plain .pivot(...) is enough when no aggregation is needed
dfp = pd.pivot_table(df, index='depth', columns='date', values='capf', aggfunc='mean')
# display(dfp.head())
date 2016-08-09 2016-08-11 2016-08-13 2016-08-15 2016-08-17 2016-08-19 2016-08-21 2016-08-23 2016-08-25 2016-09-09
depth
1.50 0.334661 NaN NaN 0.302670 0.314186 0.325257 0.313645 0.263135 NaN NaN
1.75 0.305488 0.303005 0.410124 0.299095 0.313899 0.280732 0.275758 0.260641 NaN 0.318099
2.00 0.322312 0.274105 NaN 0.319606 0.268984 0.368449 0.311517 0.309923 NaN 0.306162
2.25 0.289959 0.315081 NaN 0.302202 0.306286 0.339809 0.292546 0.314225 0.263875 NaN
2.50 0.314227 0.296968 NaN 0.312705 0.333797 0.299556 0.327187 0.326958 NaN NaN
# plot
# draw the pivoted depth-by-date table as a heatmap with the 'GnBu' colormap
sns.heatmap(dfp, cmap='GnBu')
I had a similar problem, but the date was the index. I've just converted the date to string (pandas 1.0) before plotting and it worked for me.
# Cast the dates to pandas' 'string' dtype before plotting.
# NOTE(review): this line reads and writes a 'date' column, while the surrounding
# text describes the dates as the index; confirm which one applies.
heat['date'] = heat.date.astype('string')
Related
I'm trying to plot a time series that has monthly dates from 1959 to 2019, and when I plot it I get a cluttered x-axis where the dates are not shown properly. How can I remove the months and show only the years on the x-axis, so that it isn't as cluttered and the years are displayed properly?
# Two stacked subplots: a histogram on top and a line plot of the same data below.
fig,ax = plt.subplots(2,1)
ax[0].hist(pca_function(sd_Data))
ax[0].set_ylabel ('Frequency')
ax[1].plot(pca_function(sd_Data))
ax[1].set_xlabel ('Years')
fig.suptitle('Histogram and Time series of Plot Factor')
plt.tight_layout()
# fig.savefig('factor1959.pdf')
# NOTE(review): pca_function(sd_Data) is evaluated three times in this snippet
# (histogram, line plot, and the display call below).
pca_function(sd_Data)
comp_0
sasdate
1959-01 -0.418150
1959-02 1.341654
1959-03 1.684372
1959-04 1.981473
1959-05 1.242232
...
2019-08 -0.075270
2019-09 -0.402110
2019-10 -0.609002
2019-11 0.320586
2019-12 -0.303515
[732 rows x 1 columns]
From what I see, you do have years on your second subplot; they just overlap because there are too many of them placed horizontally. Try increasing figsize and rotating the ticks:
# Example frame: an empty two-column frame filled with a monthly date range
# and one standard-normal value per date.
df = pd.DataFrame(columns=['Years', 'Frequency'])
df['Years'] = pd.date_range(start='1/1/1959', end='1/1/2023', freq='M')
df['Frequency'] = np.random.normal(0, 1, size=len(df))

# A wide figure leaves room for the tick labels.
fig, ax = plt.subplots(2, 1, figsize=(20, 5))
hist_ax, line_ax = ax

hist_ax.hist(df.Frequency)
hist_ax.set_ylabel ('Frequency')

line_ax.plot(df.Years, df.Frequency)
line_ax.set_xlabel('Years')

# Slant the x tick labels of both subplots and right-align them.
for axes in (hist_ax, line_ax):
    for label in axes.get_xticklabels():
        label.set_rotation(45)
        label.set_ha('right')

fig.suptitle('Histogram and Time series of Plot Factor')
plt.tight_layout()
p.s. if the x-labels still overlap, try to increase your step size.
First off, you need to store the result of the call to pca_function into a variable. E.g. called result_pca_func. That way, the calculations (and possibly side effects or different randomization) are only done once.
Second, the dates should be converted to a datetime format. For example using pd.to_datetime(). That way, matplotlib can automatically put year ticks as appropriate.
Here is an example, starting from a dummy test dataframe:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Dummy data: one 'YYYY-MM' string per month from 1959 through 2018, plus a random-walk value column.
df = pd.DataFrame({'Date': [f'{y}-{m:02d}' for y in range(1959, 2019) for m in range(1, 13)]})
df['Values'] = np.random.randn(len(df)).cumsum()
df = df.set_index('Date')
# Stand-in for the stored result of pca_function (same object as df, not a copy).
result_pca_func = df
# Convert the string index to datetimes before plotting.
result_pca_func.index = pd.to_datetime(result_pca_func.index)
fig, ax2 = plt.subplots(figsize=(10, 3))
ax2.plot(result_pca_func)
plt.tight_layout()
plt.show()
My problem is similar to the one encountered on this topic: Change heatmap's yticks for multi-index dataframe
I would like to have yticks every 6 months, with them being the index of my dataframe. But I can't manage to make it work.
The issue is that my dataframe is 13500*290 and the answer given in the link takes a long time and doesn't really work (see image below).
This is an example of my code without the solution from the link, this part works fine for me:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# One row per day from 1984-06-10 to 2021-01-14 and 290 columns (Pt0..Pt289)
# of uniform random values. The frame is built from a single 2-D array:
# inserting 290 columns one at a time fragments the frame and makes pandas
# emit a PerformanceWarning.
daily_index = pd.date_range(datetime(1984, 6, 10), datetime(2021, 1, 14), freq='1D')
df = pd.DataFrame(
    np.random.random(size=(len(daily_index), 290)),
    index=daily_index,
    columns=['Pt{0}'.format(i) for i in range(0, 290)],
)

f, ax = plt.subplots(figsize=(20,20))
# The colour limits span the full data range; .values.min()/.max() reduce over
# the whole array in one step instead of nesting np.min/np.max calls.
sns.heatmap(df, cmap='PuOr', vmin=df.values.min(), vmax=df.values.max(), cbar_kws={"label": "Ice Velocity (m/yr)"})
This part does not work for me and produces the figure below, which shouldn't have the stack of ylabels on the yaxis:
f, ax = plt.subplots(figsize=(20,20))
# Level 0 of the index supplies the candidate labels.
years = df.index.get_level_values(0)
# Only positions 2, 7 and 12 keep their label; every other row gets an empty string.
ytickvalues = [year if index in (2, 7, 12) else '' for index, year in enumerate(years)]
sns.heatmap(df, cmap='PuOr', vmin = np.min(np.min(df)), vmax = np.max(np.max(df)), cbar_kws={"label": "Ice Velocity (m/yr)"}, yticklabels = ytickvalues)
Here are a couple ways to adapt that link for your use case (1 label per 6 months):
Either: Show an empty string except on Jan 1 and Jul 1 (i.e., when %m%d evals to 0101 or 0701)
# Keep the date as the label only on 1 January and 1 July; every other row gets an empty string.
labels = [date if date.strftime('%m%d') in ['0101', '0701'] else ''
for date in df.index.date]
Or: Show an empty string except every ~365/2 days (i.e., when row % 183 == 0)
# Keep the date as the label on every 183rd row (rows 0, 183, 366, ...); every other row gets an empty string.
labels = [date if row % 183 == 0 else ''
for row, date in enumerate(df.index.date)]
Note that you don't have a MultiIndex, so you can just use df.index.date (no need for get_level_values).
Here is the output with a minimized version of your df:
# Heatmap with the colour limits set to the data range and the prepared list as y tick labels.
sns.heatmap(df, cmap='PuOr', cbar_kws={'label': 'Ice Velocity (m/yr)'},
vmin=df.values.min(), vmax=df.values.max(),
yticklabels=labels)
I need to show the values in a dataframe column as a binary colormap, based on whether they are above or below a threshold. I also need to show the raw numbers.
I am using seaborn heatmap.
sample data frame
Month Raw_value
Jan 3.72
feb 2.51
Mar 1.82
...
I have converted the column to binary, then retained only the index & binary column & removed other columns from dataframe, then plotted as binary. But this won't help to show the actual numbers (see figure).
# status is 1 where raw_value exceeds the threshold and 0 otherwise.
kpi2['status'] = np.where(kpi2['raw_value'] > thres, 1,0) # convert to binary
kpi2 = kpi2.set_index('month')
kpi2.drop(['raw_value'], axis=1, inplace=True) # drop other columns
kpi_transposed = kpi2.transpose() # convert column to row for plotting horizontally
rdgn = sns.diverging_palette(h_neg=130, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
# Only the 0/1 status row is left to plot, so the raw numbers cannot be shown here.
sns.heatmap(kpi_transposed, linecolor='white', linewidths=1.3,vmin=0, vmax=1, cmap=rdgn, cbar=False)
plt.show()
But if I work with the raw data column, I am unsure how to color it as binary.
Any idea?
Instead of dropping raw_value, transpose both columns; then, when calling the heatmap, use loc to select status as the data and pass raw_value to the annot parameter of sns.heatmap:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# Sample frame with one raw value per month.
kpi2 = pd.DataFrame({'month': ['Jan', 'Feb', 'Mar'],
                     'raw_value': [3.72, 2.51, 1.82]})
thres = 3

# status is 1 where the raw value exceeds the threshold and 0 otherwise.
kpi2['status'] = np.where(kpi2['raw_value'] > thres, 1, 0)
kpi2 = kpi2.set_index('month')

# After transposing, months are the columns and raw_value / status are rows.
kpi_transposed = kpi2.transpose()
status_row = kpi_transposed.loc[['status']]    # drives the cell colours
raw_row = kpi_transposed.loc[['raw_value']]    # shown as the cell text

rdgn = sns.diverging_palette(h_neg=130, h_pos=10, s=99, l=55, sep=3,
                             as_cmap=True)

heatmap_options = dict(
    fmt='.2f',  # annotation number format
    linecolor='white',
    linewidths=1.3,
    vmin=0,
    vmax=1,
    cmap=rdgn,
    cbar=False,
)
ax = sns.heatmap(status_row, annot=raw_row, **heatmap_options)

# Append a percent sign to every annotation.
for cell_text in ax.texts:
    cell_text.set_text(cell_text.get_text() + "%")

plt.show()
I would like to have a plot for each case in my dataframe. My x-axis is a range between 10 and 500, and my y-axis is the row values of each column for each case (prob-10, prob-20 and so on up to prob-500).
What I really want is a plot like this, for example for case1:
x-axis = 10 , 20 ,30 , .....
y-axis = 0,35, 0,24, 0,44 .....
my dataframe looks like this
cases
year
HH
prob-10
prob-20
prob-30
case1
2018
200
0,35
0,24
0,44
case2
2017
299
0,33
0,25
0,33
case3
2019
200
0,45
0,22
0,33
In case that you meant a row for a specific value of case you could try this:
import pandas as pd
import matplotlib.pyplot as plt
import logging
# Sample frame: one row per case, with three prob-* columns holding the values to plot.
df = pd.DataFrame({"cases": ["case1", "case_2", "case_3"],
"year": ["2017", "2018", "2019"],
"HH": [200, 299, 200],
"prob-10": [0.35, 0.33, 0.45],
"prob-20": [0.24, 0.25, 0.22],
"prob-30": [0.44, 0.33, 0.33]})
def plot_row_as_bars(df: pd.DataFrame,
                     case: str="case1",
                     exclude_list: "list | None"=None):
    """Plot one row of a pandas DataFrame as a bar chart.

    Args:
        df: Frame with a "cases" column identifying each row.
        case: Value of the "cases" column that selects the row to plot.
            If several rows match, the first one is plotted.
        exclude_list: Columns left out of the chart. Defaults to
            ["cases", "year", "HH"].

    Returns:
        None. A warning is logged and nothing is drawn when no row matches
        ``case``.
    """
    # Resolve the default here rather than in the signature, so no mutable
    # list is shared between calls.
    if exclude_list is None:
        exclude_list = ["cases", "year", "HH"]

    temp = df.loc[df["cases"] == case]

    # A length is never negative, so a `len(temp) < 0` check could not fire
    # and a missing case would crash on iloc[0] below; test for an empty
    # selection instead.
    if temp.empty:
        logging.warning(f"Case: {case} is not avialable.")
        return

    # Exclude columns
    temp = temp.drop(exclude_list, axis = 1)

    # The remaining column names are the bar positions, the row values the heights.
    x = temp.columns
    y = temp.iloc[0].to_list()

    plt.bar(x, y)
    plt.grid()
    plt.show()
plot_row_as_bars(df)
Set index to cases, this will ease the df.plot function later and [filter] only the columns starting with prob-. Convert the comma used in the original dataframe to dot so that the dataframe can be interpreted as float with astype(float). Then, transpose the dataframe to get the y-axis with the row values of each column for each case, plot the chart with df.plot(kind='bar').
import pandas as pd
import matplotlib.pyplot as plt
# The sample file is whitespace-separated. The raw string keeps '\s' a regex
# escape instead of an invalid string escape.
df = pd.read_csv('sample.csv', sep=r'\s+')
df = df.set_index('cases')

# get only prob-* columns
# The pattern is anchored: 'prob-*' as a regex means "prob" followed by any
# number of dashes, which matches every column name containing "prob".
df = df.filter(regex=r'^prob-')

# replace original comma with dot (plain text replacement, not a regex)
df = df.apply(lambda col: col.str.replace(',', '.', regex=False))
df = df.astype(float)

# Transpose so the prob-* columns become the x positions, with one bar per case.
df = df.T
df.plot(kind='bar')
plt.xticks(rotation=45)
plt.show()
I'm not certain what you mean by "draw the first row", but if you want to simply save the first row as a variable you could try casting the dataframe to a dictionary like this.
# Take the first row by position and turn it into a {column: value} dictionary.
# dict(df)[0] would look up a *column* labelled 0, which raises KeyError for a
# frame whose columns are named.
row1 = df.iloc[0].to_dict()
print(row1)
I am trying to plot a simple pandas Series object, its something like this:
2018-01-01 10
2018-01-02 90
2018-01-03 79
...
2020-01-01 9
2020-01-02 72
2020-01-03 65
It includes only the first month of each year, so it only contains the month January and all its values through the days.
When i try to plot it
# suppose the name of the series is dates_and_values
dates_and_values.plot()
It returns a plot like this (made using my current data)
It is clearly plotting by year and then the month, so it looks pretty squished and small, since i don't have any other months except January, is there a way to plot it by the year and day so it outputs a better plot to observe the days.
the x-axis is the index of the dataframe
dates are a continuous series, x-axis is continuous
changing the index to a string of values means it is no longer continuous and squishes your graph
have generated some sample data that only has January to demonstrate
import matplotlib.pyplot as plt
# Custom offset in which all seven weekdays count as business days and every
# non-January date in the 365*50 days starting 1990-01-01 is a holiday.
cf = pd.tseries.offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu Fri Sat",
holidays=[d for d in pd.date_range("01-jan-1990",periods=365*50, freq="D")
if d.month!=1])
# 200 dates generated with that offset, starting from 1 January 2015.
d = pd.date_range("01-jan-2015", periods=200, freq=cf)
df = pd.DataFrame({"Values":np.random.randint(20,70,len(d))}, index=d)
fig, ax = plt.subplots(2, figsize=[14,6])
# Top: index converted to "year day" strings. Bottom: the original datetime index.
df.set_index(df.index.strftime("%Y %d")).plot(ax=ax[0])
df.plot(ax=ax[1])
I suggest that you convert the series to a dataframe and then pivot it to get one column for each year. This lets you plot the data for each year with a separate line, either in the same plot using different colors or in subplots. Here is an example:
import numpy as np # v 1.19.2
import pandas as pd # v 1.2.3
# Create sample series
rng = np.random.default_rng(seed=123) # random number generator
dt = pd.date_range('2018-01-01', '2020-01-31', freq='D')
# Keep only the January dates.
dt_jan = dt[dt.month == 1]
series = pd.Series(rng.integers(20, 90, size=dt_jan.size), index=dt_jan)
# Convert series to dataframe and pivot it
df_raw = series.to_frame()
# Rows are the day of the month, columns are the year.
df_pivot = df_raw.pivot_table(index=df_raw.index.day, columns=df_raw.index.year)
# Drop the outer column level so the columns are just the years.
df = df_pivot.droplevel(axis=1, level=0)
df.head()
# Plot all years together in different colors
# One line per year column, all on the same axes.
ax = df.plot(figsize=(10, 4))
ax.set_xlabel('January', labelpad=10, size=12)
ax.set_xlim(1, 31)
# Frameless legend anchored at the right edge of the axes.
ax.legend(frameon=False, bbox_to_anchor=(1, 0.65))
# Hide the top and right borders of the plot area.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
# Plot years separately
# One subplot per year column, stacked vertically and sharing the y-axis.
axs = df.plot(subplots=True, color='tab:blue', sharey=True,
              figsize=(10,8), legend=None)
for ax in axs:
    ax.set_xlim(1, 31)
    ax.grid(axis='x', alpha=0.3)
    # The legend is switched off, so the line's label (the year) is written
    # inside the subplot instead.
    _, labels = ax.get_legend_handles_labels()
    ax.text(28.75, 80, *labels, size=14)
    # Only the bottom subplot gets the x-axis label. Axes.is_last_row() was
    # removed from matplotlib, so compare against the last axes directly.
    if ax is axs[-1]:
        ax.set_xlabel('January', labelpad=10, size=12)
# Remove the vertical gap between the stacked subplots.
ax.figure.subplots_adjust(hspace=0)