How to create a Boxplot with Timestamp using Matplotlib and Seaborn? - python

I have been trying to get a boxplot with each box representing an emotion over a period of time.
The data frame used to plot this contains timestamp and emotion name. I have tried converting the timestamp into a string first and then to datetime and finally to int64. This resulted in the gaps between x labels as seen in the plot. I have tried the same without converting to int64, but the matplotlib doesn't seem to allow the dates in the plot.
I'm attaching the code I have used here:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib qt
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
data = pd.read_csv("TX-governor-sentiment.csv")
## check data types
data.dtypes
# drop rows with all missing values
data = data.dropna(how='all')
## transforming the timestamp column
#convert from obj type to string then to date type
data['timestamp2'] = data['timestamp']
data['timestamp2'] = pd.to_datetime(data['timestamp2'].astype(str), format='%m/%d/%Y %H:%M')
# convert to number format with the following logic:
# yyyymmddhourmin --> this allows us to treat dates as a continuous variable
data['timestamp2'] = data['timestamp2'].dt.strftime('%Y%m%d%H%M')
data['timestamp2'] = data['timestamp2'].astype('int64')
print (data[['timestamp','timestamp2']])
#data transformation for data from Orange
df = pd.DataFrame(columns=('timestamp', 'emotion'))
for index, row in data.iterrows():
if row['sentiment'] == 0:
df.loc[index] = [row['timestamp2'], 'Neutral']
else:
df.loc[index] = [row['timestamp2'], row['Emotion']]
# Plot using Seaborn & Matplotlib
#convert timestamp in case it's not in number format
df['timestamp'] = df['timestamp'].astype('int64')
fig = plt.figure(figsize=(10,10))
#colors = {"Neutral": "grey", "Joy": "pink", "Surprise":"blue"}
#visualize as boxplot
plot_ = sns.boxplot(x="timestamp", y="emotion", data=df, width=0.5,whis=np.inf);
#add data point on top
plot_ = sns.stripplot(x="timestamp", y="emotion", data=df, alpha=0.8, color="black");
fig.canvas.draw()
#modify ticks and labels
plt.xlim([202003010000,202004120000])
plt.xticks([202003010000, 202003150000, 202003290000, 202004120000], ['2020/03/01', '2020/03/15', '2020/03/29', '2020/04/12'])
#add colors
for patch in plot_.artists:
r, g, b, a = patch.get_facecolor()
patch.set_facecolor((r, g, b, .3))
Please let me know how I can overcome this problem of gaps in the boxplot. Thank you!

Related

How to turn individual points into a kernel density map?

I am trying to recreate in Python a map I made in Tableau. I'm pretty sure it's called a kernel density map (just "density" map in Tableau). Each point is just a single point and doesn't correspond with any kind of value.
I have plotted my points on a map, but am unable to figure out how to give them the contour appearance in the Tableau map. I see some examples that include np.meshgrid or matplotlib's contourf function, but I'm unable to apply it to my data because I don't have a Z coordinate (from what I can tell). Below is what I have currently:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import timedelta, date
import matplotlib
from pandas import Series, DataFrame
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point, mapping
# import data
df = pd.read_csv('./Data/complete.csv', on_bad_lines='skip') # csv had some bad data, had to skip
# drop unneccessary columns
df = df[['datetime', 'latitude', 'longitude']]
# convert 'datetime' to YYYY-MM-DD
df['datetime'] = pd.to_datetime(df['datetime'], dayfirst=True)
# clean the data
# remove values that contain '/' and 'q'
df = df.drop(df[df.latitude.str.contains(r'[/q]')].index)
# convert lat/long to float
df['latitude'] = df.latitude.astype('float')
df['longitude'] = df.longitude.astype('float')
# create GDF from lat/long
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude)).set_crs('EPSG:4326')
# import shapefile
us_map = gpd.read_file(r'./Data/USA_States_(Generalized)/USA_States_Generalized.shp')
#remove AK and HI
us_map = us_map[~us_map['STATE_NAME'].isin(['Alaska', 'Hawaii'])]
# plotting only 2012 sightings
sightings2012 = df[(df['datetime'] > '12/31/2011') & (df['datetime'] < '01/01/2013')]
# create GDF for 2012 sightings
gdf2012 = gpd.GeoDataFrame(sightings2012,
geometry=gpd.points_from_xy(sightings2012.longitude,
sightings2012.latitude)).set_crs('EPSG:4326')
# clip the sightings data
us_sightings_2012 = gpd.clip(gdf2012, us_map)
cmap = matplotlib.cm.get_cmap('plasma')
# plot
fig, ax = plt.subplots(1, 1)
us_map.plot(ax=ax)
us_sightings_2012.plot(ax=ax, cmap=cmap)
plt.show()
And here is my output:

Matplotlib x-axis ticks, fixed location for dates

In the timeline plot I’m making, I want date tickers to show only specified dates. (In my example I show tickers for events ‘A’, but it can be any list on tickers). I found how to do it when x-axis data is numeric (upper subplot in my example), but this won’t work with timestamp date type (bottom plot).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
myData = pd.DataFrame({'date':['2019-01-15','2019-02-10','2019-03-20','2019-04-17','2019-05-23','2019-06-11'],'cnt':range(6),'event':['a','b','a','b','a','b']})
myData['date'] = [pd.Timestamp(j) for j in myData['date']]
start = pd.Timestamp('2019-01-01')
stop = pd.Timestamp('2019-07-01')
inxa = myData.loc[myData['event'] == 'a'].index
inxb = myData.loc[myData['event'] == 'b'].index
# create two plots, one with 'cnt' as x-axis, the other 'dates' on x-axis.
fig, ax = plt.subplots(2,1,figsize=(16,9))
ax[0].plot((0,6),(0,0), 'k')
ax[1].plot((start, stop),(0,0))
for g in inxa:
ax[0].plot((myData.loc[g,'cnt'],myData.loc[g,'cnt']),(0,1),c='r')
ax[1].plot((myData.loc[g,'date'],myData.loc[g,'date']),(0,1),c='r')
for g in inxb:
ax[0].plot((myData.loc[g,'cnt'],myData.loc[g,'cnt']),(0,2),c='b')
ax[1].plot((myData.loc[g,'date'],myData.loc[g,'date']),(0,2),c='b')
xlist0 = myData.loc[myData['event']=='a','cnt']
xlist1 = myData.loc[myData['event']=='a','date']
ax[0].xaxis.set_major_locator(ticker.FixedLocator(xlist0))
# ax[1].xaxis.set_major_locator(**???**)
Couldn't find a sufficient duplicate, maybe I didn't look hard enough. There are a number of ways to do this:
Converting to numbers first or using the underlying values of a Pandas DateTime Series
xticks = [mdates.date2num(z) for z in xlist1]
# or
xticks = xlist1.values
and at least a couple ways to use it/them
ax[1].xaxis.set_major_locator(ticker.FixedLocator(xticks))
ax[1].xaxis.set_ticks(xticks)
Date tick labels
How to set the xticklabels for date in matplotlib
how to get ticks every hour?
...

Uncertain why trendline is not appearing on matplotlib scatterplot

I am trying to plot a trendline for a matplotlib scatterplot and am uncertain why the trendline is not appearing. What should I change in my code to make the trendline appear? Event is a categorical data type.
I've followed what most other stackoverflow questions suggest about plotting a trendline, but am uncertain why my trendline is not appearing.
#import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pandas.plotting import register_matplotlib_converters
#register datetime converters
register_matplotlib_converters()
#read dataset using pandas
dataset = pd.read_csv("UsrNonCallCDCEvents_CDCEventType.csv")
#convert date to datetime type
dataset['Interval'] = pd.to_datetime(dataset['Interval'])
#convert other columns to numeric type
for cols in list(dataset):
if cols != 'Interval' and cols != 'CDCEventType':
dataset[cols] = pd.to_numeric(dataset[cols])
#create pivot of dataset
pivot_dataset = dataset.pivot(index='Interval',columns='CDCEventType',values='AvgWeight(B)')
#create scatterplot with trendline
x = pivot_dataset.index.values.astype('float64')
y = pivot_dataset['J-STD-025']
plt.scatter(x,y)
z = np.polyfit(x,y,1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()
This is the graph currently being output. I am trying to get this same graph, but with a trendline: https://imgur.com/a/o18a5Y3
It's also fine that x axis is not showing dates
A snippet of my dataframe looks like this: https://imgur.com/a/xJAcgEI
I've painted out the irrelvant column names

aggregate tick data to open high low close non time related

I would like to consolidate tick data stored in a pandas dataframe to the open high low close format but not time related, but aggregated for every 100 ticks. After that I would like to display them in a candlestick chart using matlibplot.
I solved this already for a time related aggregation using a pandas dataset with two values: TIMESTAMP and PRICE. The TIMESTAMP already has the pandas date format so I work with that:
df["TIMESTAMP"]= pd.to_datetime(df["TIMESTAMP"])
df = df.set_index(['TIMESTAMP'])
data_ohlc = df['PRICE'].resample('15Min').ohlc()
Is there any function, that resamples datasets in the ohlc format not using a time frame, but a count of ticks?
After that it comes to visualization, so for plotting I have to change date format to mdates. The candlestick_ohlc function requires a mdate format:
data_ohlc["TIMESTAMP"] = data_ohlc["TIMESTAMP"].apply(mdates.date2num)
from mpl_finance import candlestick_ohlc
candlestick_ohlc(ax1,data_ohlc.values,width=0.005, colorup='g', colordown='r',alpha=0.75)
So is there any function to display a candle stick chart without mdates because by aggregating tick data there would be no time relation?
As there seems to be no build in function for this problem I wrote one myself. The given dataframe needs to have the actual values in the column "PRICE":
def get_pd_ohlc(mydf, interval):
## use a copy, so that the new column doesn't effect the original dataset
mydf = mydf.copy()
## Add a new column to name tick interval
interval = [(1+int(x/interval)) for x in range(mydf["PRICE"].count())]
mydf["interval"] = interval
##Step 1: Group
grouped = mydf.groupby('interval')
##Step 2: Calculate different aggregations
myopen = grouped['PRICE'].first()
myhigh = grouped['PRICE'].max()
mylow = grouped['PRICE'].min()
myclose = grouped['PRICE'].last()
##Step 3: Generate Dataframe:
pd_ohlc = pd.DataFrame({'OPEN':myopen,'HIGH':myhigh,'LOW':mylow,'CLOSE':myclose})
return(pd_ohlc)
pd_100 = get_pd_ohlc(df,100)
print (pd_100.head())
I also found a solution to display ist. Module mpl_finance has a function candlestick2_ohlc, that does not need any datetime information. Here is the code:
#Making plot
import matplotlib.pyplot as plt
from mpl_finance import candlestick2_ohlc
fig = plt.figure()
plt.rcParams['figure.figsize'] = (16,8)
ax1 = plt.subplot2grid((6,1), (0,0), rowspan=12, colspan=1)
#Making candlestick plot
candlestick2_ohlc(ax1, pd_ohlc['OPEN'], pd_ohlc['HIGH'],
pd_ohlc['LOW'], pd_ohlc['CLOSE'], width=0.5,
colorup='#008000', colordown='#FF0000', alpha=1)

Date removed from x axis on overlaid plots matplotlib

I am trying to show time series lines representing an effort amount using matplotlib and pandas.
I've got my DF's to all to overlay in one plot, however when I do python seems to strip the x axis of the date and input some numbers. (I'm not sure where these come from but at a guess, not all days contain the same data so python has reverted to using an index id number). If I plot any one of these they come up with date on the x-axis.
Any hints or solutions to make the x axis show date for the multiple plot would be much appreciated.
This is the single figure plot with time axis:
Code I'm using to plot is
fig = pl.figure()
ax = fig.add_subplot(111)
ax.plot(b342,color='black')
ax.plot(b343,color='blue')
ax.plot(b344,color='red')
ax.plot(b345,color='green')
ax.plot(b346,color='pink')
ax.plot(fi,color='yellow')
plt.show()
This is the multiple plot fig with weird x axis:
One option would be to manually specify the x-axis based on the DataFrame index, and then plot directly using matplotlib.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# make up some data
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["FishEffort"] )
df.df_name = str(i)
dfs.append(df)
# plot it directly using matplotlib instead of through the DataFrame
fig = plt.figure()
ax = fig.add_subplot()
for df in dfs:
plt.plot(df.index,df["FishEffort"], label = df.df_name)
plt.legend()
plt.show()
Another option would be to concatenate your DataFrames and plot using Pandas. If you give your "FishEffort" field the correct label name when loading the data or via DataFrame.rename then the labels will be specified automatically.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["DataFrame #" + str(i) ] )
df.df_name = str(i)
dfs.append(df)
df = pd.concat(dfs, axis = 1)
df.plot()
I've found an answer that does what I want, it seems that calling plt.plot wasn't using the date as the x axis, however calling it using the pandas documentation did the trick.
ax = b342.plot(label='342')
b343.plot(ax=ax, label='test')
b344.plot(ax=ax)
b345.plot(ax=ax)
b346.plot(ax=ax)
fi.plot(ax=ax)
plt.show()
I was wondering if anyone knew hwo to change the labels here?

Categories