I'm playing around with a Kaggle dataset to practice using matplotlib.
I was creating bar graphs one by one, but the figures keep piling up.
When I called plt.show(), about 10 figure windows suddenly showed up.
Is it possible to combine 4 of those figures into 1 window?
These parts all belong to the same segment, "Time Analysis", so I want to combine these 4 figures in 1 window.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Accident time analysis: count accidents per hour, weekday, month and year
# of `Start_Time`, print each table and draw one figure per breakdown.
dataset = 'accidents_data.csv'
df = pd.read_csv(dataset)

# --- Time Analysis ---
# Analyze the time that accidents happen for various patterns and trends.
df.Start_Time = pd.to_datetime(df.Start_Time)  # convert the start time column to datetime
df['Hour_of_Accident'] = df.Start_Time.dt.hour  # extract the hour from the time data
hour_accident = df['Hour_of_Accident'].value_counts()
hour_accident_df = hour_accident.to_frame()  # DataFrame so the index can be named and sorted
hour_accident_df.index.names = ['Hours']  # naming the index column
hour_accident_df.sort_index(ascending=True, inplace=True)
print(hour_accident_df)

# Plotting the hour of accidents data in a bar graph
hour_accident_df.plot(kind='bar', figsize=(8, 4), color='blue', title='Hour of Accident')

# --- Accident frequency per day of the week ---
df['Day_of_the_week'] = df.Start_Time.dt.day_of_week
day_of_accident = df['Day_of_the_week'].value_counts()
day_of_accident_df = day_of_accident.to_frame()
day_of_accident_df.index.names = ['Day']  # renaming the index column
day_of_accident_df.sort_index(ascending=True, inplace=True)
print(day_of_accident_df)

f, ax = plt.subplots(figsize=(8, 5))
# dt.day_of_week numbers the days 0 (Monday) .. 6 (Sunday)
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_codes = day_of_accident_df.index.values
# Select the single counts column by position: its name depends on the pandas
# version (the source column name before 2.0, 'count' from 2.0 on).
day_counts = day_of_accident_df.iloc[:, 0]
ax.bar(day_codes, day_counts, color='green')
ax.set_title('Day of the week vs total number of accidents')
ax.set_ylabel("No. of accidents recorded")
ax.set_xticks(day_codes)
# Label only the weekdays that actually occur, so the number of labels always
# matches the number of ticks.
ax.set_xticklabels([day_names[code] for code in day_codes])

# --- Analysis for the months ---
df['Month'] = df.Start_Time.dt.month
accident_month = df['Month'].value_counts()
accident_month_df = accident_month.to_frame()
accident_month_df.index.names = ['Month']  # renaming the index column
accident_month_df.sort_index(ascending=True, inplace=True)
print(accident_month_df)

# Plotting the bar graph
accident_month_df.plot(kind='bar', figsize=(8, 5), color='purple', title='Month of Accident')

# --- Yearly analysis ---
df['Year_of_accident'] = df.Start_Time.dt.year
# Check the yearly trend
yearly_count = df['Year_of_accident'].value_counts()
yearly_count_df = pd.DataFrame({'Year': yearly_count.index, 'Accidents': yearly_count.values})
yearly_count_df.sort_values(by='Year', ascending=True, inplace=True)
print(yearly_count_df)

# Creating line plot
yearly_count_df.plot.line(x='Year', color='red', title='Yearly Accident Trend ')
plt.show()
Related
I would like to plot a bar chart using pandas and plotly that shows the frequency of players by day while at the same time, I can filter bars shown by Level so there has to be a "Level Legend" on the graph for me to filter the result shown. I would also like all the 7-days to be shown on the x-axis in an orderly manner. Help is much appreciated!
Below is my code:
# Raw example columns: one entry per player record
day = ['Monday', 'Wednesday', 'Sunday', 'Wednesday']
level = ['Level 5', 'Level 2', 'Level 1', 'Level 2']
score = ['50', '20', '10', '25']
player = ['Tom', 'Sam', 'Bill', 'Max']

# Assemble the lists into a DataFrame, pairing each column name with its list
column_names = ('day', 'level', 'score', 'player')
column_values = (day, level, score, player)
df = pd.DataFrame(dict(zip(column_names, column_values)))
df
I'd like the output to be something like this:
The complete snippet below will filter your dataset by level using buttons, and display the count of players by day on the x-axis. The essence of how this is done is:
Set day as an ordered category using pd.Categorical(df['day'], categories=new_order, ordered=True),
split the data by df['level'].unique() and assign traces using fig.add_bar(),
set the visibility of each trace through visibility = [list(s) for s in [e==1 for e in np.eye(len(levels))]]
Plot 1
Plot 2
As you can see I've added a few details to your dataset to make it a bit more interesting.
Complete code:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
# Bar chart of player counts per weekday, with one trace per level and a
# dropdown that shows exactly one level's trace at a time.

# data lists
day = ['Monday', 'Wednesday', 'Sunday', 'Wednesday', 'Tuesday']
level = ['Level 5', 'Level 2', 'Level 1', 'Level 2', 'Level 2']
score = ['50', '20', '10', '25', '25']
player = ['Tom', 'Sam', 'Bill', 'Max', 'Sam']

# create new dataframe and insert desired columns
df = pd.DataFrame({'day': day,
                   'level': level,
                   'score': score,
                   'player': player})

# plotly setup
fig = go.Figure()

# data management
# - grouping
# - setting up visibility attributes for groups
# - treat days as categorical variable
levels = df['level'].unique()[::-1]
nlevels = len(levels)
# Row i of the boolean identity matrix is True only at position i, i.e. it
# makes trace i visible and hides every other trace.
visibility = np.eye(nlevels, dtype=bool).tolist()
new_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day'] = pd.Categorical(df['day'], categories=new_order, ordered=True)

buttons = []  # container for buttons in updatemenu

# split data by levels,
# and add button per level
# with counts of players per day
# in an orderly fashion
for i, level_name in enumerate(levels):
    ds = df[df['level'] == level_name]
    # observed=False keeps every weekday category in the result (count 0 for
    # days without players) regardless of the pandas default for `observed`.
    dg = ds.groupby('day', observed=False).agg({'player': 'count'}).reset_index()
    # Only the first trace starts visible; it matches the first dropdown entry.
    fig.add_bar(x=dg['day'], y=dg['player'], visible=(i == 0))

    # one button per level to trigger the visibility of that level's trace
    button = dict(label=level_name,
                  method='restyle',
                  args=['visible', visibility[i]])
    buttons.append(button)

# include dropdown updatemenu in layout
fig.update_layout(updatemenus=[dict(type="dropdown",
                                    direction="down",
                                    buttons=buttons)])
fig.show()
Thank you in advance for the assistance!
I am trying to create a heat map from time-series data. The data begins mid-year, which causes the top row of my heat map to be shifted to the left so that it does not line up with the rest of the plot (shown below). How would I go about shifting just the top row over so that the visualization of the data syncs up with the rest of the plot?
(Code Provided Below)
import pandas as pd
import matplotlib.pyplot as plt
# Location of the daily air temperature CSV
url1 = 'https://raw.githubusercontent.com/the-datadudes/deepSoilTemperature/master/minotDailyAirTemp.csv'

# Read the CSV into a DataFrame (not a Series), parsing the 'Date' column
# and using it as the index
df1 = pd.read_csv(url1, index_col=['Date'], parse_dates=['Date'])

# One row per calendar year, holding that year's Temp values as a list
dfg1 = df1.groupby(df1.index.year).agg({'Temp': list})

# Spread each year's list across columns to get a wide (year x position) table
yearly_temp_lists = dfg1['Temp'].tolist()
df1_wide = pd.DataFrame(yearly_temp_lists, index=dfg1.index)

# Draw the wide table as a heat map; the trailing ';' is kept from the
# original (presumably to suppress the echoed return value in a notebook)
fig, ax1 = plt.subplots(ncols=1, figsize=(20, 5))
ax1.matshow(df1_wide, aspect='auto', interpolation=None);
The problem lies in the dates of the dataset: if you look at the data, it starts on
`1990-4-24,15.533`
To solve this, it is necessary to add the dates between 1990-01-01 and 1990-04-23 and to delete the 29th of February.
# Pad the days before the first record (1990-04-24) with NaN temperatures so
# the first year starts on 1 January like every other year.
rng = pd.date_range(start='1990-01-01', end='1990-04-23', freq='D')
# date_range already returns a DatetimeIndex, so no extra conversion is needed.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = pd.DataFrame({'Temp': np.nan}, index=rng)
frames = [df, df1]
result = pd.concat(frames)
# Drop 29 February so every year contributes the same number of days
result = result[~((result.index.month == 2) & (result.index.day == 29))]
With this data
# Collect the Temp values of each calendar year of `result` into one list
year_key = result.index.year
dfg1 = result.groupby(year_key).agg({'Temp': list})

# One row per year, one column per position within the year
df1_wide = pd.DataFrame(dfg1['Temp'].tolist(), index=dfg1.index)

# Plot the wide table as a heat map
fig, ax1 = plt.subplots(ncols=1, figsize=(20, 5))
ax1.matshow(df1_wide, aspect='auto', interpolation=None);
The unfilled portions are a consequence of the NaN values in your dataset. In this case you can choose to replace the NaN values either with the column mean or with the row mean.
Other ways of replacing the NaN values are also available.
# Replace each NaN with the mean of its own column (the means are matched to
# the columns by label)
df1_wide = df1_wide.fillna(df1_wide.mean())
Thank you in advance! (Image provided below)
I am trying to have the Y-Axis of my heatmap reflect the year associated with the data it is pulling. What is happening is that the Y-Axis is merely counting the number of years (0, 1, 2, ....30) when it should be appearing as 1990, 1995, 2000, etc.
How do I update my code (provided below) so that the Y-Axis shows the actual year instead of the year count?
# Heat map of daily air temperature for one station: one row per year, one
# column per day position within the year.

# Link to the all-stations data, pulled from the web
url2 = 'https://raw.githubusercontent.com/the-datadudes/deepSoilTemperature/master/allStationsDailyAirTemp1.csv'
raw_data = pd.read_csv(url2, index_col=1, parse_dates=True)
df_all_stations = raw_data.copy()
selected_station = 'Minot'

# Keep only the rows of the selected station
df1 = df_all_stations[df_all_stations['Station'] == selected_station]

# adding the data between 1990/01/01 -/04/23 and delete the 29th of Feb
rng = pd.date_range(start='1990-01-01', end='1990-04-23', freq='D')
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0
df = pd.DataFrame({'Temp': np.nan}, index=rng)
frames = [df, df1]
result = pd.concat(frames)
result = result[~((result.index.month == 2) & (result.index.day == 29))]

# groupby year and aggregate Temp into a list
dfg1 = result.groupby(result.index.year).agg({'Temp': list})
# create a wide format dataframe with all the temp data expanded
df1_wide = pd.DataFrame(dfg1['Temp'].tolist(), index=dfg1.index)

# Setting all leftover empty fields to the average of that time in order to fill in the gaps
df1_wide = df1_wide.apply(lambda x: x.fillna(x.mean()), axis=0)

# plotting the data
fig, ax1 = plt.subplots(ncols=1, figsize=(20, 5))
ax1.set_xlabel('Day of the year')
ax1.set_ylabel('Years since start of data collection')
# Setting the title so that it changes based off of the selected station
ax1.set_title(f'Average Air Temp for {selected_station}')

# Drawing the heat map; the returned image is the mappable for the colorbar
cbm = ax1.matshow(df1_wide, interpolation=None, aspect='auto')
# Plotting the colorbar
cb = plt.colorbar(cbm, ax=ax1)
cb.set_label('Temp in Celsius')
Add the following at the end of your code:
# Row i of the heat map is the i-th year in df1_wide.index. Pin a tick on
# every 5th row first, then label those ticks with the matching years, so the
# labels cannot drift from the rows they describe.
ax1.set_yticks(range(0, len(df1_wide.index), 5))
ax1.set_yticklabels(df1_wide.index[::5].tolist())
I have created a code, which shows a heatmap of the data in the CSV file.
The code is as follows:
import pandas as pd
import matplotlib.pyplot as plt
# Load the semicolon-separated table; the first row is the header and the
# 'Date' column becomes the row index
data = pd.read_csv("data.csv", sep=';', header=0, index_col='Date')

# Draw the table as a heat map. Note that `fig` holds the image object
# returned by imshow, not a Figure.
fig = plt.imshow(data, cmap='YlOrBr', interpolation='nearest')
plt.colorbar(fig)

# Label the axes of the heat map and display it
heatmap_axes = fig.axes
heatmap_axes.set_xlabel("Time (UTC)")
heatmap_axes.set_ylabel("Date")
plt.show()
The dataset is as follows:
The time range varies from 00:00 till 23:50 with steps of 10 minutes.
I want the x axis to show the time from 00:00 till 23:50 in steps per hour.
The index is set as date. The date range is from 29-Oct-2017 till 24-Mar-2018.
I want the Y axis to show the date range in steps of months.
You can stack columns, then groupby month and hour and then unstack it back (I'm taking mean values here when aggregating, but you can change to sum or whatever aggregation should be done there):
# Demo table: one row per date, one column per 10-minute time slot,
# filled with random integers in [0, 100)
time_columns = pd.date_range('00:00', '23:50', freq='10min')
date_index = pd.date_range('2017-10-29', '2018-03-24')
df = pd.DataFrame(np.nan, index=date_index, columns=time_columns)
df[df.columns] = np.random.randint(0, 100, df.shape)

fig, ax = plt.subplots(2, figsize=(10, 6))
top_axes, bottom_axes = ax

# Upper panel: the table as it is
top_axes.imshow(df, cmap='YlOrBr')

# Lower panel: mean per (month of the row, hour of the column), with months
# as rows and hours as columns
stacked = df.stack()
month_of_value = stacked.index.get_level_values(0).month
hour_of_value = stacked.index.get_level_values(1).hour
grouped_means = stacked.groupby([month_of_value, hour_of_value], sort=False).mean()
df2 = grouped_means.unstack(1)
bottom_axes.imshow(df2, cmap='YlOrBr')
Output (original DataFrame above, processed below):
Update:
If the goal is just to put monthly and hourly labels on the same plot, please see below:
# Demo table with string labels: one row per date, one column per 10-minute
# time slot, filled with standard-normal random values
time_labels = pd.date_range('00:00', '23:50', freq='10min').astype(str)
date_labels = pd.date_range('2017-10-29', '2018-03-24').astype(str)
df = pd.DataFrame(np.nan, index=date_labels, columns=time_labels)
df[df.columns] = np.random.randn(*df.shape)

fig, ax = plt.subplots(1, figsize=(10, 6))

# Month of every row and hour of every column. After drop_duplicates, the
# remaining index values are the positions where each month / hour first
# occurs, and the values are the labels to show there.
row_months = pd.Series(pd.to_datetime(df.index).month)
column_hours = pd.Series(pd.to_datetime(df.columns).hour)
first_hours = column_hours.drop_duplicates()
first_months = row_months.drop_duplicates()

ax.imshow(df, cmap='YlOrBr')
ax.set_xticks(first_hours.index)
ax.set_xticklabels(first_hours)
ax.set_yticks(first_months.index)
ax.set_yticklabels(first_months)
Output:
This produces a graph of all these stock prices plotted against the date. If you zoom in, all the tiny ticks have labels for the dates. I wanted to reduce the frequency of ticks so that it only displayed tick marks at the month and year. I have tried using locators and formatters, but whenever I add them all of the ticks and tick labels completely disappear. All that's left at the x-axis is the x-axis label.
Does any of the issue lie within the fact that I extract the date and use that for the x-axis plot for every new batch of stock prices I want to plot? Any advice would be appreciated. I am a beginner programmer.
from iexfinance import get_historical_data
import pandas as pd
import matplotlib.pyplot as plt
def tester():
    """Download, plot and show one year of prices for a fixed list of symbols.

    For every symbol the historical data is fetched with
    ``get_historical_data``, reduced to a ``date`` / ``open`` / ``close``
    table and handed to ``plot_data``. All symbols are drawn into the same
    figure, which is shown once at the end.

    Returns:
        The ``date`` / ``open`` / ``close`` DataFrame of the last symbol.
    """
    start_date = '20170828'
    end_date = '20180828'
    symbols = ['GOOG', 'IBM', 'CRON']
    for symbol in symbols:
        f_temp = get_historical_data(symbol, start_date, end_date, output_format='pandas')
        df_close = pd.DataFrame(f_temp['close'])
        df_open = pd.DataFrame(f_temp['open'])
        # Dates of the rows as 'YYYYMMDD' strings, used as the merge key
        df_date_string = pd.to_datetime(f_temp.index).strftime("%Y%m%d").astype(str)
        df = pd.merge(df_open, df_close, on=df_date_string)
        df.columns = ['date', 'open', 'close']
        plot_data(df)
    # Add the legend once every line exists; calling it before plotting labels
    # only the lines drawn so far, so the last symbol would get no entry.
    plt.legend(symbols)
    plt.show()
    return df
def normalize_data(df):
    """Scale the data so that its first row equals 1.

    Args:
        df: A pandas Series or DataFrame of numeric values.

    Returns:
        An object of the same kind in which every value is divided by the
        first row's value (column by column for a DataFrame).
    """
    # Positional .iloc[0] works for both Series and DataFrame; the former
    # .ix indexer has been removed from pandas.
    return df / df.iloc[0]
def plot_data(df):
    """Plot the normalized close prices of ``df`` against its dates.

    Draws onto the current pyplot figure and does not call ``plt.show()``.

    Args:
        df: DataFrame with a ``date`` column and a ``close`` column.
            NOTE(review): ``date`` is assumed to hold datetimes or strings
            that ``pd.to_datetime`` can parse (e.g. 'YYYYMMDD') - confirm
            against the caller.
    """
    normalized = normalize_data(df['close'])
    # Plot against real datetimes rather than strings: string x-values are
    # treated as categories and get one tick per row, while datetimes let
    # matplotlib pick date ticks and accept date locators/formatters.
    dates = pd.to_datetime(df['date'])
    plt.plot(dates, normalized)
    plt.title("Normalized close stock prices")
    plt.xlabel("Dates")
    plt.ylabel("Close prices")
    plt.tight_layout()
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script
    df = tester()