I have a massive CSV file containing a time series that spans 3 years and looks something like this:
Date Company1 Company2
2020-01-01 00:00:00 100 200
2020-01-01 01:00:00 110 180
2020-01-01 02:00:00 90 210
2020-01-01 03:00:00 100 200
.... ... ...
2020-12-31 21:00:00 100 200
2020-12-31 22:00:00 80 230
2020-12-31 23:00:00 120 220
Except I have 10 companies.
Anyway, I managed to plot 3 plots for one month for each year, looks like this
newMatrix.plot(x='Date', y='Company4', xlim=('2020-01-01 00:00:00', '2020-01-31 23:00:00'))
newMatrix.plot(x='Date', y='Company4', xlim=('2021-01-01 00:00:00', '2021-01-31 23:00:00'))
newMatrix.plot(x='Date', y='Company4', xlim=('2022-01-01 00:00:00', '2022-01-31 23:00:00'))
Now the problem is that I can't figure out how to make one figure where I can see better how the trends differ between years (for example during January each year). The best outcome would be to have the days/months on the x axis and each plotted line representing each year.
I have been experimenting with combining matplotlib and the pandas plot API, but so far I get either no plot or three separate figures. How can I solve this?
I think you need to do something like this (keep in mind I don't have the entire df):
import pandas as pd
import matplotlib.pyplot as plt
# create DataFrame
data = {'Date': ['2020-01-01 00:00:00', '2020-01-01 01:00:00', '2020-01-01 02:00:00', '2020-01-01 03:00:00', '2020-12-31 21:00:00', '2020-12-31 22:00:00', '2020-12-31 23:00:00'],
        'Company1': [100, 110, 90, 100, 100, 80, 120],
        'Company2': [200, 180, 210, 200, 200, 230, 220]}
df = pd.DataFrame(data)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Keep only the January rows. Selecting by month (instead of comparing against
# '<year>-01-31', which means midnight at the START of the 31st) keeps every
# hourly sample of January 31st.
january = df[df.index.month == 1]
# Derive the years from the data so nothing is hard-coded.
years = sorted(january.index.year.unique())

# One subplot per company, five per row; squeeze=False keeps `axes` 2-D even
# when there is only a single row.
n_cols = 5
n_rows = max(1, -(-len(df.columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 3 * n_rows),
                         sharey=True, squeeze=False)
panels = axes.ravel()

for ax, company in zip(panels, df.columns):
    for year in years:
        series = january.loc[january.index.year == year, company]
        # Put every year on the same x axis: fractional day of the month
        # (1.0 = Jan 1st 00:00, 1.5 = Jan 1st 12:00, ...), so the lines overlay
        # instead of sitting a whole year apart.
        month_start = pd.Timestamp(year=int(year), month=1, day=1)
        day_of_month = (series.index - month_start) / pd.Timedelta(days=1) + 1
        ax.plot(day_of_month, series.to_numpy(), label=str(year))
    ax.set_title(company)
    ax.legend()

# Hide grid cells that have no company assigned to them.
for ax in panels[len(df.columns):]:
    ax.set_visible(False)

fig.text(0.04, 0.5, 'Value', va='center', rotation='vertical')
fig.text(0.5, 0.04, 'Day of January', ha='center')
fig.suptitle('Company trends for January of each year')
plt.subplots_adjust(wspace=0.3, hspace=0.5)
plt.show()
which gives:
Related
I have a dataframe containing a time series with two columns as follows:
dailyp kind
date
2015-01-01 165.0 national
2015-01-02 210.0 not_festive
2015-01-03 222.0 not_festive
2015-01-04 190.0 not_festive
2015-01-05 200.0 not_festive
... ... ...
2019-12-28 260.0 not_festive
2019-12-29 226.0 not_festive
2019-12-30 216.0 not_festive
2019-12-31 189.0 not_festive
2020-01-01 237.0 not_festive
I have written a function to plot the time series differing on the value of kind that goes as follows:
def plot_timeseries_by_category(df, category_col):
    """Draw one subplot per calendar year, with one line per category value.

    Args:
        df: DataFrame whose index exposes ``.year`` (a datetime-like index)
            and which contains a ``'dailyp'`` column plus ``category_col``.
        category_col: name of the column whose values select the line colour.

    Each distinct category becomes its own line in the year's subplot; the
    function does not draw a single line that changes colour.
    """
    # Get the unique years from the index
    years = df.index.year.unique()
    # Create a subplot for each year
    fig, axes = plt.subplots(len(years), 1, figsize=(10, len(years) * 5))
    # With a single year plt.subplots returns one Axes rather than an array;
    # wrap it so the zip below works in both cases.
    if len(years) == 1:
        axes = [axes]
    for year, ax in zip(years, axes):
        # Filter the data for the current year
        df_year = df[df.index.year == year]
        # Create all necessary colors (one 'C<n>' colour per category found in
        # the whole frame, so a category keeps its colour across years)
        colors = {category: f'C{index}' for index, category in enumerate(df[category_col].unique())}
        # Debug output: the dates that belong to this year
        print(df_year.index)
        # Groupby category and plot
        for category, group in df_year.groupby(category_col):
            # NOTE(review): 'index' is passed as the x column name; the sample
            # frame keeps the dates in the index, so confirm a column called
            # 'index' actually exists at this point.
            group.plot('index', 'dailyp', marker='o', ax=ax, color=colors[category], label=category)
        ax.set_title(str(year))
        ax.legend()
The code also breaks the time series by years, but that works just fine. But instead of plotting a single line with different colors depending on the category, it plots a line for each category. I want to achieve what is shown in the approved answer of this post Plot Multicolored line based on conditional in python, but couldn't make it work.
Any help is appreciated!
You can do this
import pandas as pd
import matplotlib.pyplot as plt
data = {
'date': ['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04', '2015-01-05',
'2019-12-28', '2019-12-29', '2019-12-30', '2019-12-31', '2020-01-01'],
'dailyp': [165.0, 210.0, 222.0, 190.0, 200.0,
260.0, 226.0, 216.0, 189.0, 237.0],
'kind': ['national', 'not_festive', 'not_festive', 'not_festive', 'not_festive',
'not_festive', 'not_festive', 'not_festive', 'not_festive', 'not_festive']
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
def plot_timeseries_by_category(df, category_col, value_col='dailyp'):
    """Plot one subplot per calendar year, one marker line per category.

    Args:
        df: DataFrame with a datetime-like index (``df.index.year`` is used to
            split the data by year).
        category_col: name of the column whose values pick the line colour.
        value_col: name of the column plotted on the y axis. Defaults to
            ``'dailyp'``.

    The x axis of each subplot is the row position inside that year
    (0, 1, 2, ...), not the date itself. Nothing is returned; the figure is
    created on the current pyplot state and shown by the caller.
    """
    # Get the unique years from the index
    years = df.index.year.unique()

    # squeeze=False always yields a 2-D array of Axes, so the single-year case
    # needs no special handling.
    fig, axes = plt.subplots(len(years), 1, figsize=(10, len(years) * 5), squeeze=False)

    # The colour map depends only on the whole frame, so build it once instead
    # of once per year. A category keeps the same colour in every subplot.
    colors = {category: f'C{index}' for index, category in enumerate(df[category_col].unique())}

    for year, ax in zip(years, axes[:, 0]):
        # Rows of the current year, re-indexed 0..n-1 so the positional index
        # can serve as the x coordinate.
        df_year = df[df.index.year == year].reset_index(drop=True)
        # Groupby category and plot
        for category, group in df_year.groupby(category_col):
            ax.plot(group.index, group[value_col], marker='o', color=colors[category], label=category)
        ax.set_title(str(year))
        ax.legend()
plot_timeseries_by_category(df, 'kind')
plt.show()
which will return a plot for each year
I have to find the difference in data provided at 00:00:00 and 23:59:59 per day for seven days.
How to find the difference in the data frame, which is given on the start date and end date?
Sample Data
Date Data
2018-12-01 00:00:00 2
2018-12-01 12:00:00 5
2018-12-01 23:59:59 10
2018-12-02 00:00:00 12
2018-12-02 12:00:00 15
2018-12-02 23:59:59 22
Expected Output
Date Data
2018-12-01 8
2018-12-02 10
Example
data = {
'Date': ['2018-12-01 00:00:00', '2018-12-01 12:00:00', '2018-12-01 23:59:59',
'2018-12-02 00:00:00', '2018-12-02 12:00:00', '2018-12-02 23:59:59'],
'Data': [2, 5, 10, 12, 15, 22]
}
df = pd.DataFrame(data)
Code
df['Date'] = pd.to_datetime(df['Date'])
# Per calendar day: last reading minus first reading (rows are assumed to be in
# chronological order within each day). resample('D') also creates bins for
# days that have no rows at all; those yield NaN instead of raising IndexError.
out = (df.resample('D', on='Date')['Data']
         .agg(lambda x: (x.iloc[-1] - x.iloc[0]) if len(x) else float('nan'))
         .reset_index())
out
Date Data
0 2018-12-01 8
1 2018-12-02 10
Update
A more efficient way
You can get the same result with the following code:
g = df.resample('D', on='Date')['Data']
out = g.last().sub(g.first()).reset_index()
You can use groupby and iterate over with min-max range.
import pandas as pd
df = pd.DataFrame({
    'Date': ['2018-12-01 00:00:00', '2018-12-01 12:00:00', '2018-12-01 23:59:59',
             '2018-12-02 00:00:00', '2018-12-02 12:00:00', '2018-12-02 23:59:59'],
    'Data': [2, 5, 10, 12, 15, 22]
})
df['Date'] = pd.to_datetime(df['Date'])
# Calendar day of every reading, used as the grouping key.
df['Date_Only'] = df['Date'].dt.date

# Daily range of 'Data'. Aggregating the selected column directly is vectorised
# and avoids handing whole sub-frames to a Python lambda through
# groupby(...).apply.
# NOTE: max - min equals "end of day minus start of day" only while the values
# never decrease within a day, as in this sample.
daily = df.groupby('Date_Only')['Data']
result = daily.max() - daily.min()
print(result)
I have this dataframe:
I need to get the values of the single days between time 05:00:00 and 06:00:00 (so, in this example, ignore 07:00:00)
And create a separate dataframe for each day considering the last 3 days.
This is the result I want to achieve: (3 dataframes considering 3 days and Time between 05 and 06)
I tried this: (without success)
df.sort_values(by = "Time", inplace=True)
df_of_yesterday = df[ (df.Time.dt.hour > 4)
& (df.Time.dt.hour < 7)]
You can use:
from datetime import date, time, timedelta
# Reference day: the three frames below are taken relative to the current date.
today = date.today()
# Row mask on the time-of-day part only: keeps 05:00:00 through 06:00:00
# (both ends appear in the sample output), whatever the calendar day.
m = df['Time'].dt.time.between(time(5), time(6))
# Combine the time-of-day mask with a per-day mask: yesterday, two days ago
# and three days ago, each in its own DataFrame.
df_yda = df.loc[m & (df['Time'].dt.date == today - timedelta(1))]
df_2da = df.loc[m & (df['Time'].dt.date == today - timedelta(2))]
df_3da = df.loc[m & (df['Time'].dt.date == today - timedelta(3))]
Output:
>>> df_yda
Time Open
77 2022-03-09 05:00:00 0.880443
78 2022-03-09 06:00:00 0.401932
>>> df_2da
Time Open
53 2022-03-08 05:00:00 0.781377
54 2022-03-08 06:00:00 0.638676
>>> df_3da
Time Open
29 2022-03-07 05:00:00 0.838719
30 2022-03-07 06:00:00 0.897211
Setup a MRE:
import pandas as pd
import numpy as np
rng = np.random.default_rng()
dti = pd.date_range('2022-03-06', '2022-03-10', freq='H')
df = pd.DataFrame({'Time': dti, 'Open': rng.random(len(dti))})
Use Series.between with offsets.DateOffset to select the datetimes between these times, inside a list comprehension that builds a list of DataFrames:
# Current timestamp truncated to midnight.
now = pd.to_datetime('now').normalize()
# One DataFrame per day for i = 1, 2, 3 days back, each limited to the window
# from hour 5 to hour 6 of that day.
# NOTE(review): this relies on DateOffset treating the plural `days=i` as a
# relative shift and the singular `hour=5` as "set the hour to 5" - confirm
# against the pandas DateOffset documentation for the installed version.
dfs = [df[df.Time.between(now - pd.DateOffset(days=i, hour=5),
now - pd.DateOffset(days=i, hour=6))] for i in range(1,4)]
# dfs[0] is yesterday, dfs[1] two days ago, dfs[2] three days ago.
print (dfs[0])
print (dfs[1])
print (dfs[2])
I've manually copied your data into a dictionary and then converted it to your desired output.
First you should probably edit your question to use the text version of the data instead of an image, here's a small example:
data = {
'Time': [
'2022-03-06 05:00:00',
'2022-03-06 06:00:00',
'2022-03-06 07:00:00',
'2022-03-07 05:00:00',
'2022-03-07 06:00:00',
'2022-03-07 07:00:00',
'2022-03-08 05:00:00',
'2022-03-08 06:00:00',
'2022-03-08 07:00:00',
'2022-03-09 05:00:00',
'2022-03-09 06:00:00',
'2022-03-09 07:00:00'
],
'Open': [
'13823.6',
'13786.6',
'13823.6',
'13823.6',
'13786.6',
'13823.6',
'13823.6',
'13786.6',
'13823.6',
'13823.6',
'13786.6',
'13823.6'
]
}
df = pd.DataFrame(data)
Then you can use this code to get all the dates that are on the same day and in between the hours 4 and 7 and then create your dataframes as follows:
import pandas as pd
from datetime import datetime
# Collect the 'Open' values per calendar day, keeping only the rows whose hour
# is strictly between 4 and 7 (i.e. 05:xx and 06:xx). A single pass with
# setdefault replaces the nested scan over all known days, parses each
# timestamp only once, and no longer shadows the builtin `dict`.
opens_by_day = {}
for _, row in df.iterrows():
    stamp = datetime.strptime(row['Time'], '%Y-%m-%d %H:%M:%S')
    if 4 < stamp.hour < 7:
        opens_by_day.setdefault(stamp.strftime('%Y-%m-%d'), []).append(row['Open'])

# Build and print one single-column DataFrame per day. A separate name is used
# so the input frame `df` is not overwritten.
for day, opens in opens_by_day.items():
    day_df = pd.DataFrame({day: opens})
    print(day_df)
This should be very easy, but I'm having several issues. The thing is, I want to do something like this post, but (1) I have a datetime field, so I have the hour, minutes and seconds in my date column, (2) I want to plot a line graph by day.
So, this is my data:
date col1 col2
2020-01-01 00:01:020 20 500
2020-01-02 00:01:020 10 500
2020-01-02 00:01:000 20 500
2020-01-02 00:01:021 20 500
2020-02-05 20:11:010 30 500
2020-02-05 10:01:020 10 500
.
.
.
So, as I mentioned above, what I want is to plot the daily average of col1.
I started with this:
df.groupby('date')['col1'].mean()
That didn't work because of the hours, minutes and seconds.
Later, I tried this:
df["day"] = df["date"].dt.day
df.groupby("day")["col1"].mean().plot(kind="line")
I almost did it, but the column day is not actually the day, but a number which represents the position of the day in the year, I guess. So any ideas on how to make this plot?
IIUC:
groupby date instead of day:
df.groupby(df['date'].dt.date)["col1"].mean().plot(kind="line",rot=25)
#you don't need to create a column date for this directly pass date in groupby()
OR
df.groupby(df['date'].dt.normalize())["col1"].mean().plot(kind="line",rot=25)
Optional(you can also do this by these 2 but the above 2 fits best for your data and condition since the below ones will create unnecessary dates and NaN's):
#via pd.Grouper():
df.groupby(pd.Grouper(key='date',freq='1D'))["col1"].mean().dropna().plot(kind="line")
#OR
#via dt.floor():
df.groupby(df['date'].dt.floor('1D'))["col1"].mean().dropna().plot(kind="line")
output(for given sample data):
Since this question has seaborn and plotly tags as well,
sns.lineplot performs this operation without the need for groupby mean, as the default estimator computes the mean value per x instance. To remove the error shading set ci=None (in seaborn 0.12 and later the equivalent is errorbar=None).
Imports and setup:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
df = pd.DataFrame({
'date': ['2020-01-01 00:01:020', '2020-01-02 00:01:020',
'2020-01-02 00:01:000', '2020-01-02 00:01:021',
'2020-02-05 20:11:010', '2020-02-05 10:01:020'],
'col1': [20, 10, 20, 20, 30, 10],
'col2': [500, 500, 500, 500, 500, 500]
})
df['date'] = pd.to_datetime(df['date'])
Plotting Code:
# Seaborn Line Plot x is the date, y is col1 default estimator is mean
ax = sns.lineplot(data=df, x=df['date'].dt.date, y='col1', ci=None)
ax.tick_params(axis='x', rotation=45) # Make X ticks easier to read
plt.tight_layout()
plt.show()
For plotly take the groupby mean and create a px.line.
Imports and setup:
import pandas as pd
import plotly.express as px
df = pd.DataFrame({
'date': ['2020-01-01 00:01:020', '2020-01-02 00:01:020',
'2020-01-02 00:01:000', '2020-01-02 00:01:021',
'2020-02-05 20:11:010', '2020-02-05 10:01:020'],
'col1': [20, 10, 20, 20, 30, 10],
'col2': [500, 500, 500, 500, 500, 500]
})
df['date'] = pd.to_datetime(df['date'])
Plotting code:
plot_values = df.groupby(df['date'].dt.date)["col1"].mean()
fig = px.line(plot_values)
fig.show()
What do you want exactly? the date without time?
try this:
# Calendar date (time of day dropped) of every timestamp; the vectorised .dt
# accessor replaces a Python-level lambda applied to each element.
df["day"] = df["date"].dt.date
I have one dataframe df as below:
df = pd.DataFrame({'date': [20121231,20130102, 20130105, 20130106, 20130107, 20130108],'price': [25, 163, 235, 36, 40, 82]})
How to make df['date'] as date type and make 'price' as y-label and 'date' as x-label?
Thanks a lot.
Use to_datetime with parameter format, check http://strftime.org/:
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
print (df)
date price
0 2012-12-31 25
1 2013-01-02 163
2 2013-01-05 235
3 2013-01-06 36
4 2013-01-07 40
5 2013-01-08 82
And then plot:
df.plot(x='date', y='price')
import pandas as pd
%matplotlib inline
df = pd.DataFrame({'date': [20121231,20130102, 20130105, 20130106, 20130107,
20130108],'price': [25, 163, 235, 36, 40, 82]})
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df.plot(x='date', y='price')
With pandas you can directly convert the date column to datetime type. And then you can plot with matplotlib. Take a look at this answer and also this one.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
df = pd.DataFrame(
    {'date': [20121231, 20130102, 20130105, 20130106, 20130107, 20130108],
     'price': [25, 163, 235, 36, 40, 82]
     })

fig, ax = plt.subplots()

# Date plot with matplotlib. Axes.plot handles datetime values directly, so the
# deprecated Axes.plot_date is not needed; 'v-' draws triangle markers joined
# by a solid line.
ax.plot(
    pd.to_datetime(df["date"], format="%Y%m%d"),
    df["price"],
    'v-'
)

# Days and months and the horizontal locators:
# minor ticks on every day (day number + weekday), major ticks on every month
# (month + year, pushed down with newlines so they sit below the day labels).
ax.xaxis.set_minor_locator(dates.DayLocator())
ax.xaxis.set_minor_formatter(dates.DateFormatter('%d\n%a'))
ax.xaxis.set_major_locator(dates.MonthLocator())
ax.xaxis.set_major_formatter(dates.DateFormatter('\n\n\n%b\n%Y'))

# Vertical grid lines on the daily (minor) ticks, plus the default y grid.
ax.xaxis.grid(True, which="minor")
ax.yaxis.grid()

plt.tight_layout()
plt.show()
Result: