I am plotting some time series from .nc files using pandas, xarray and matplotlib. I have two datasets:
Sea Surface Temperature from 1982 to 2019, from which I plot the monthly mean for my area and represent the monthly temperature variation for those 37 years.
Sea Surface Temperature from 2020 to 2021, where I plot the monthly variation for each of the years.
To plot this, I use the following code (PLEASE NOTE THAT DUE TO MEMORY ALLOCATION ISSUES I HAD WHILE LOOPING THROUGH THE VARIABLES, I WROTE VERY BASIC CODE WITH NO LOOPS, SORRY FOR THAT!):
import xarray as xr
import matplotlib.pyplot as plt
from matplotlib import dates as md
import pandas as pd
import numpy as np
import netCDF4
import seaborn as sns
import marineHeatWaves as mhw
import datetime
# Apply seaborn's default theme to every figure created afterwards.
sns.set()


def _time_indexed_frame(dataset):
    """Flatten an xarray dataset into a DataFrame indexed by its 'time' column."""
    flat = dataset.to_dataframe().reset_index()
    return flat.set_index('time')


# Source files: one is sliced to 1982-2019 below, the other to 2020 and 2021.
ds_original = xr.open_dataset('sst_med_f81_to21_L4.nc')
ds_original_last = xr.open_dataset('sst_med_f20_to21_L4.nc')

# Today's date as YYYY-MM-DD; used as the open end of the 2021 slice.
extract_date = datetime.datetime.today()
date = f"{extract_date:%Y-%m-%d}"

# Time windows: the 1982-2019 record, all of 2020, and 2021 up to today.
ds1 = ds_original.sel(time=slice('1982-01-01', '2019-12-31'))
ds2 = ds_original_last.sel(time=slice('2020-01-01', '2020-12-31'))
ds3 = ds_original_last.sel(time=slice('2021-01-01', date))

# Convert to pandas DataFrames (one explicit call per dataset, no loop).
df1 = _time_indexed_frame(ds1)
df2 = _time_indexed_frame(ds2)
df3 = _time_indexed_frame(ds3)
#Converting to Celsius
def kelvin_to_celsius(temp_k):
    """Convert a temperature from kelvin to degrees Celsius.

    Args:
        temp_k: Temperature in K. Anything that supports subtracting a float
            works (a scalar, a NumPy array, a pandas Series, ...).

    Returns:
        ``temp_k - 273.15``, i.e. the temperature in °C.
    """
    return temp_k - 273.15
# Add a Celsius column next to the original 'analysed_sst' values.
df1['analysed_sst_C'] = kelvin_to_celsius(df1['analysed_sst'])
df2['analysed_sst_C'] = kelvin_to_celsius(df2['analysed_sst'])
df3['analysed_sst_C'] = kelvin_to_celsius(df3['analysed_sst'])

# Indexing by month and day of year (both taken from the time index).
df1['month'] = df1.index.month
df1['yearday'] = df1.index.dayofyear
df2['month'] = df2.index.month
df2['yearday'] = df2.index.dayofyear
df3['month'] = df3.index.month
df3['yearday'] = df3.index.dayofyear

# Mean temperature for each day of the year, over every row sharing that day.
# .mean() is used directly: passing np.mean to .agg() is deprecated in
# pandas 2.x and emits a FutureWarning, while giving the same numbers.
daily_sst_82_19 = df1.groupby('yearday')['analysed_sst_C'].mean()
daily_sst_2020 = df2.groupby('yearday')['analysed_sst_C'].mean()
daily_sst_2021 = df3.groupby('yearday')['analysed_sst_C'].mean()
# Quick plot of the three day-of-year climatologies on a single axes.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 7))

# Month ticks labelled with abbreviated month names.
# NOTE: the x values are day-of-year numbers (the series index), not dates;
# the date locator/formatter read them as days counted from Matplotlib's date
# epoch, which is why the ticks end up in 1970.
month_axis = ax.xaxis
month_axis.set_major_locator(md.MonthLocator())
month_axis.set_major_formatter(md.DateFormatter('%b'))
ax.margins(x=0)  # no horizontal padding around the data

ax.plot(daily_sst_82_19, label='1982-2019')
ax.plot(daily_sst_2020, label='2020')
ax.plot(daily_sst_2021, label='2021', c='black')
ax.legend(loc='upper left')
I obtain the following plot:
I want my plot to start with Jan and end with Dec, but I cannot figure out where the problem is. I have tried to set the x-axis limits between two specific dates, but this creates a conflict, as one of the time series covers 37 years and the other two cover 1 year only.
Any help would be very appreciated!!
UPDATE
I figured out how to move the months by specifying the following:
# Workaround from the update: put each month tick on the 2nd day of the month.
# MonthLocator is reached through the `md` alias (matplotlib.dates), the only
# name under which the script imports it.
ax.xaxis.set_major_locator(md.MonthLocator(bymonthday=2))
So I obtained this:
But I still need to delete that last Jan, and I cannot figure out how to do it.
Okay, so I figured out how to solve the issue.
While fine-tuning the plot parameters, I switched the DateFormatter to %D to see the year as well. To my surprise, the year was set to 1970, and I have no idea why, because my oldest dataset starts in 1981. Once I discovered this, I set the x-limits to the ones you can read below, and it worked pretty well:
#Add to plot settings:
# The day-of-year x values are rendered as dates in 1970 (see the text above),
# so the axis is pinned to that calendar year to show exactly Jan..Dec.
ax.set_xlim(np.datetime64('1970-01-01'), np.datetime64('1970-12-31'))
# MonthLocator is reached through the `md` alias; the bare name is never imported.
ax.xaxis.set_major_locator(md.MonthLocator(bymonthday=1))
ax.xaxis.set_major_formatter(md.DateFormatter('%b'))
Result:
Related
I have read in a monthly temperature anomalies csv file using the pandas read_csv() function. Years are from 1881 to 2022 (I excluded the last 3 months of 2022 to avoid -999 values). Date format is yyyy-mm-dd. How can I plot just the year, and only one label per year instead of 12 on the x-axis (i.e., I don't need 12 1851s, 1852s, etc.)?
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
import matplotlib.dates as mdates
# Read the CSV, skipping the last 3 rows. The 'Date' column is not parsed
# here, so it is presumably still plain strings when plotted (see the answer).
ds = pd.read_csv('path_to_file.csv', header='infer', engine='python', skipfooter=3)
dates = ds['Date']
tAnoms = ds[' Berkeley Earth 2m Air Temperature (degree C) 0N-90N;0E-360E']
fig = plt.figure(figsize=(10,10))
ax = plt.subplot(111)
# Raw monthly anomalies plus a centred 60-sample rolling mean.
ax.plot(dates,tAnoms)
ax.plot(dates,tAnoms.rolling(60, center=True).mean())
# One major tick per year; the closing parenthesis was missing in the original.
ax.xaxis.set_major_locator(mdates.YearLocator(month=1))  # EDIT
years_fmt = mdates.DateFormatter('%Y') # EDIT 2
ax.xaxis.set_major_formatter(years_fmt) # EDIT 2
plt.show()
EDIT: adding the following gives me the 2nd plot
EDIT 2: Gives me yearly values, but only from 1970-1975. 3rd plot
You could:
Create a new column year from your Date column.
Compute the average temperature for each year (using mean or median): df.groupby(['year']).mean()
So, I found a good, but maybe not perfect, solution. The first thing I needed to do was use parse_dates and infer_datetime_format when reading in the csv file, and then convert the dates with to_pydatetime(). mdates.AutoDateLocator() was what I needed, along with set_major_formatter. I am not sure how I could manually change the interval, however (e.g., change to every 10 years or 25 years instead of using the default). This does work well enough, though.
# Parse the 'Date' column while reading, so it holds datetimes, not strings.
ds = pd.read_csv(
    'path_to_file.csv',
    parse_dates=['Date'],
    infer_datetime_format=True,
    header='infer',
    engine='python',
    skipfooter=3,
)
dates = ds['Date'].dt.to_pydatetime()  # Convert to pydatetime()
tAnoms = ds[' Berkeley Earth 2m Air Temperature (degree C) 0N-90N;0E-360E']

fig, ax = plt.subplots(figsize=(10, 10))

# Produce plot: centred 60-sample rolling mean of the anomalies.
ax.plot(dates, tAnoms.rolling(60, center=True).mean())

# Let AutoDateLocator (matplotlib.dates) choose the tick positions and
# label every tick with its four-digit year.
years_fmt = mdates.DateFormatter('%Y')
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(years_fmt)
plt.show()
I am trying to represent CDC Delay of Care data as a line graph but am having some trouble formatting the y axis so that it is a percentage to the hundredths place. I would also like for the x axis to show every year in the range selected.
Here is my code:
import pandas as pd
from isolation import isolate_total_stub, isolate_age_stub
import matplotlib.pyplot as plt
# Load the raw CDC table.
cdc_data = pd.read_csv('CDC_Delay_of_Care_Data.csv')

# Keep only the rows of the "delay ... due to cost" panel.
delay_of_medical_care = cdc_data.loc[
    cdc_data['PANEL'] == 'Delay or nonreceipt of needed medical care due to cost'
]

# Reduce to the totals stub via the project helper.
total_delay_of_medical_care = isolate_total_stub(delay_of_medical_care)

x_axis = total_delay_of_medical_care.YEAR
y_axis = total_delay_of_medical_care.ESTIMATE

# Line plot of the estimate against the year, with axis titles.
plt.xlabel('Year')
plt.ylabel('Percentage')
plt.plot(x_axis, y_axis)
plt.show()
The graph that displays looks like this:
line graph
Excuse me for being a novice, I have been googling for an hour now and instead of continue to search for an answer I thought it would be more productive to ask StackOverflow.
Thank you for your time.
To change the format of Y-axis, you can use set_major_formatter
To change X-axis to date in year format, you will need to use set_major_locator, assuming that your date is in datetime format
To change format of X-axis, you can again use the set_major_formatter
I am showing a small example below with dummy data. Hope this works.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import matplotlib.dates as mdate
# Dummy data: one estimate every four years.
estimate = [8, 7.1, 11, 10.6, 8, 8.3]
year = pd.to_datetime(
    ['2000-01-01', '2004-01-01', '2008-01-01', '2012-01-01', '2016-01-01', '2020-01-01']
)  ## Convert string to datetime

fig, ax = plt.subplots(figsize=(12, 5))  ## Wide figure so the years don't overlap
ax.plot(year, estimate)
ax.set_xlabel('Year')
ax.set_ylabel('Percentage')

## Y-axis tick labels with two decimal places
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

## X-axis: one tick per year, labelled with the four-digit year
locator = mdate.YearLocator()
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(mdate.DateFormatter('%Y'))

fig.autofmt_xdate()  ## Rotates X-labels, if you want to use it
plt.show()
Output plot
I am trying to create a plot with an amount (int) in the y-axis and days in the x-axis.
I want the plot to always have the whole month in the x-axis although I dont have data for all days.
This is the code I tried:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
import datetime as dt
df = get_pandas_data(datab)  # Taking data from database in pandas DataFrame

# resample() needs real datetimes in 'date'; this cast leaves an existing
# datetime column unchanged.
df['date'] = df['date'].astype('datetime64[ns]')

fig = plt.figure(figsize=(10, 10))  # Initialize plot
ax1 = fig.add_subplot(1, 1, 1)

# Total amount per calendar day. Daily resampling also yields the days that
# have no rows (their sum is 0), so the index is ordered and has no gaps.
s = df.resample('D', on='date')['amount'].sum()

# Use the resampled index as x positions: it has the same order and length
# as `s`, unlike a list built from an unordered set of dates.
dates = s.index
ax1.bar(dates, s)  # Bar plot for dates and amount
ax1.set(xlabel="Date",
        ylabel="Balance (€)",
        title="Total Monthly balance")  # Plot information

if len(s):
    # Stretch the axis from the first to the last day of the month(s) covered,
    # padded by half a day so the outermost bars are not clipped.
    first_day = dates.min().replace(day=1)
    last_day = dates.max()
    last_day = last_day.replace(day=last_day.days_in_month)
    half_day = dt.timedelta(hours=12)
    ax1.set_xlim(first_day - half_day, last_day + half_day)

ax1.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
# One major tick per day, so every day of the month gets a label.
ax1.xaxis.set_major_locator(mdates.DayLocator(interval=1))
fig.autofmt_xdate()
plt.show()
The result I get from this is a plot but only with those days that have data.
How can I make the plot to have all days in the month and plot the bar on those who have data?
This works fine with bare datetimes and matplotlib, so you must be malforming your data somehow when doing your pandas manipulations. But we can't really help because we don't have your dataframe. It's always preferable to create a standalone example with dummy data and as little code as possible to recreate the issue. a) 90% of the time you will realize your problem; b) if not, we can help...
import numpy as np
import matplotlib.pyplot as plt
import datetime
# Dummy data: a few days of February 2000 and a value for each of them.
x = np.array([1, 3, 7, 8, 10])
y = 2 * x
dates = [datetime.datetime(2000, 2, int(day)) for day in x]

# Bars are drawn only on the listed dates.
fig, ax = plt.subplots()
ax.bar(dates, y)
fig.autofmt_xdate()
plt.show()
I have a dataset containing information related to COVID-19 data with columns = ['total_cases', 'new_cases', 'date']. The data increases monotonically, with at least no sudden spikes in new_cases in the month of January. The dataset can be found here: https://fnvuusdqoptinxntjrmodi.coursera-apps.org/edit/CovidIndiaData.csv with lots of columns, out of which I use only ['total_cases', 'new_cases', 'date'].
First 10 days data is 0 for 'new_cases' as shown in this image:
I use this code to plot bar plot for 'date' vs 'new_cases':
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.dates import DateFormatter
# Needed for WeekdayLocator below; the snippet's imports only bring in DateFormatter.
import matplotlib.dates as mdates

# NOTE(review): the answer below says the file's dates are day-first
# (DD-MM-YYYY); without an explicit format, ambiguous values may be parsed
# month-first. Confirm against the data.
df = pd.read_csv("CovidIndiaData.csv", parse_dates=['date'], index_col=['date'])
df = df[['new_cases', 'total_cases']]
# fillna returns a new frame, so the result must be assigned to take effect.
df = df.fillna(0)

fig = plt.figure()
ax = plt.gca()
ax.bar(df.index.values,
       df['new_cases'],
       color='purple')
# Explicit timestamps for the limits, so they go through the same date
# conversion as the plotted data.
ax.set(xlabel="Date",
       ylabel="New Cases",
       title="New Cases per day",
       xlim=[pd.Timestamp("2020-01-01"), pd.Timestamp("2020-07-18")])

# Month-day tick labels, one major tick per week, rotated for readability.
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()
The final plot looks like this:
The plot shows some spikes at 7th January ('01-07' on plot) where clearly in dataset the new_cases are 0. This is continued approximately after every one month interval.
Where does this data come from? How can I plot a correct graph for this data?
Thanks to Davis Herring for pointing out my mistake.
In case anyone faces similar issue, the solution is to specify date format when your date isn't in standardized format.
What I did is:
def mydateparser(x):
    """Parse a day-first ``DD-MM-YYYY`` string into a ``datetime``.

    Args:
        x: Date text such as ``"07-01-2020"`` (7 January 2020).

    Returns:
        The corresponding ``datetime.datetime`` (time set to midnight).

    Raises:
        ValueError: If ``x`` does not match the ``%d-%m-%Y`` format.
    """
    # The stdlib class is used directly: the ``pd.datetime`` alias was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    from datetime import datetime

    return datetime.strptime(x, "%d-%m-%Y")
# Read the file again, parsing the 'date' column with the explicit day-first
# parser and using it as the index.
df = pd.read_csv(
    "CovidIndiaData.csv",
    parse_dates=['date'],
    date_parser=mydateparser,
    index_col=['date'],
)
I am running into some issues adding Matplotlib lines into Pandas plot. I am trying to plot a straight line using the slope to determine what the start and end-points are. But the resultant graph does not look like a straight line at all.
I have simplified the case to the MVCE below. The initial part is for setup to replicate the key feature of the complicated dataframe I have.
import pandas as pd
import matplotlib.pyplot as plt
LEN_SER = 23
# Business-day index (weekends are absent) carrying the values 1..LEN_SER.
dates = pd.date_range('2015-07-03', periods=LEN_SER, freq='B')
df = pd.DataFrame(range(1,LEN_SER+1), index=dates)
ts = df.iloc[:,0]
# The above is the setup of the MVCE to replicate the issue.
fig = plt.figure()
ax1 = plt.subplot2grid((1, 1), (0, 0))
# Segment from the 6th to the 21st sample with a slope of 1 per sample.
# .iloc is used for positional access: integer keys passed to ts[...] on a
# datetime-indexed Series rely on a deprecated positional fallback.
ax1.plot([ts.index[5], ts.index[20]],
         [ts.iloc[5], ts.iloc[5] + (1.0 * (20 - 5))], 'o-')
ts.plot(ax=ax1)
plt.show()
This gives a graph that has a wavy line due to the weekends. The Matplotlib is affecting how Pandas is plotting the series. If I take out the ax1.plot() line, then it becomes a straight line.
So the question is: How do I draw straight lines on my Pandas plot with Matplotlib? Put it another way, I want the plot to treat the axis labels as categories so weekends will be ignored. That way, I am hoping that Matplotlib and Pandas will both give a straight line.
As you correctly observe, if you delete the line ax1.plot(), then matplotlib treats your dates as categories, and the pandas plot is a nice straight line. However, in the command
# Positional access goes through .iloc; ts[5] with an integer key on a
# non-integer index is a deprecated fallback.
ax1.plot([ts.index[5], ts.index[20]],
         [ts.iloc[5], ts.iloc[5] + (1.0 * (20 - 5))], 'o-')
you ask matplotlib to interpolate between two points, and in the process of interpolating, matplotlib recognizes dates on the x-axis. That is why the straight-line pandas plot with respect to date categories (5 a week) becomes a wavy line with respect to dates (7 a week). This is correct as well, because with respect to dates your data simply isn't represented by a straight line.
You can force the category interpretation replacing dates by strings through
# dates -> categories: replace the datetime index by its 'YYYY-MM-DD' strings.
# DatetimeIndex.strftime does this in one vectorized call, with no row-wise apply.
df.index = df.index.strftime('%Y-%m-%d')
before defining ts. That results in the plot
Now the matplotlib plot is just two categories against two values and matplotlib does not bother to realize that the two categories are among the categories in the pandas plot. (Changing the order of the two plots saves your x-axis at least.) Modifying the matplotlib plot to
# x coordinates are the category positions 5 and 20; the values are read
# positionally with .iloc (ts[5] on a string index is a deprecated fallback).
ax1.plot([5, 20], [ts.iloc[5], ts.iloc[5] + (1.0 * (20 - 5))], 'o-')
plots a line between categories 5 and 20, and finally gives you two straight lines with respect to a categories x-axis.
Full code:
import pandas as pd
import matplotlib.pyplot as plt
# (optional - style was set when I produced my graph)
# The style called 'seaborn' was renamed 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name was later removed, so use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
LEN_SER = 23
dates = pd.date_range('2015-07-03', periods=LEN_SER, freq='B')
df = pd.DataFrame(range(1,LEN_SER+1), index=dates)
# dates -> categories (string)
df.index = df.index.strftime('%Y-%m-%d')
ts = df.iloc[:,0]
# The above is the setup of the MVCE to replicate the issue.
fig = plt.figure()
ax1 = plt.subplot2grid((1, 1), (0, 0))
# x coordinates 'categories' 5 and 20; values are read positionally with
# .iloc (integer keys in ts[...] on a string index are a deprecated fallback).
ax1.plot([5, 20], [ts.iloc[5], ts.iloc[5] + (1.0 * (20 - 5))], 'o-')
ts.plot(ax=ax1)
plt.show()
You already answered the question: " probably due to the weekends"
replace:
# Business-day frequency ('B').
dates = pd.date_range(start='2015-07-03', periods=LEN_SER, freq='B')
with
# Calendar-day frequency ('D').
dates = pd.date_range(start='2015-07-03', periods=LEN_SER, freq='D')
B - business day frequency
D - calendar day frequency
And your lines are straightened.
You're right - it is due to weekends. You can tell by the slope - five consecutive days have a sharper incline (+1 each day), than the three consecutive days (+1 total). So, what exactly do you want to plot? If you want to literally plot the blue line, you can interpolate the points between your two points like this:
...
# ts.plot(ax=ax1)
# Keep only the two end points, upsample them to calendar days and fill the
# gaps by (default, linear) interpolation. The former how='mean' argument was
# dropped: it is not a parameter of Resampler.interpolate.
ts.iloc[[5,20]].resample('1D').interpolate().plot(ax=ax1)
plt.show()
For simplicity I started from 2015-07-04. Does it work for you?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
LEN_SER = 21
dates = pd.date_range('2015-07-04', periods=LEN_SER, freq='B')

# One [monday, friday] pair per week; zip() stops at the shorter of the two
# selections, so a trailing Monday without a matching Friday is dropped.
the_axes = [
    [monday.date(), friday.date()]
    for monday, friday in zip(dates[dates.weekday == 0], dates[dates.weekday == 4])
]

x = dates
y = range(1, LEN_SER + 1)

n_Axes = len(the_axes)
fig, axes = plt.subplots(1, n_Axes, sharey=True, figsize=(15, 8))
# plt.subplots returns a bare Axes when only one subplot is requested;
# make it a 1-D array so the loop below works for any number of weeks.
axes = np.atleast_1d(axes)

# Draw the full series in every panel and zoom each panel onto one week.
for ax, (week_start, week_end) in zip(axes, the_axes):
    ax.plot(x, y)
    ax.set_xlim(week_start, week_end)

fig.autofmt_xdate()
print(dates)
plt.show()