Unwanted stacked data in date_plot - python

I'm trying to display two curves. The abscissas are dates and the ordinates are double values (in this case power).
The two data sets are not sampled at the same dates. But when two dates match, the second set of data added is stacked on the previous one.
Example 1: FR is added after DE and has 4 times less data
Example 2: DE is added after FR and has 4 times more data.
The code I'm currently running is:
# Start from a fresh figure
fig = plt.figure()
for country in ['DE', 'FR']:
    production = getProduction(
        country=country,
        start=start,
        end=end,
        session=session,
        verbose=False,
        debug=False,
    )
    allTimeseries = production['all']['timeseries']
    print(allTimeseries)
    # Split the {date: record} mapping into parallel x / y sequences.
    timestamps = list(allTimeseries.keys())
    values = [allTimeseries[stamp]['power']['quantity'] for stamp in timestamps]
    # One series per country, all drawn on the same axes
    plt.plot_date(timestamps, values, label=country, antialiased=True)
plt.xticks(rotation=30, ha="right")
plt.legend(loc='upper left', ncol=1)
# plt.show()
plt.tight_layout()
plt.savefig('test.png', dpi=fig.dpi)
How can I prevent the two series from stacking?

Related

Flatten broken horizontal bar chart to line graph or heatmap

I have data for all the time I've spent coding. This data is represented as a dictionary where the key is the date and the value is a list of tuples containing the time I started a coding session and how long the coding session lasted.
I have successfully plotted this on a broken_barh using the below code, where the y-axis is the date, the x-axis is the time in that day and each broken bar is an individual session.
# One row of broken bars per entry of `sessions`; the row index is the
# entry's position in the dictionary.
for row, daySessions in enumerate(sessions.values()):
    plt.broken_barh(daySessions, (row, 1))
# Map every month label ('YYYY-MM') to its offset in days from `start`.
# NOTE(review): assumes getStartMonth() returns a datetime - confirm.
start = getStartMonth()
months = {}
for period in period_range(start=start, end=datetime.today(), freq="M"):
    monthLabel = str(period)
    monthStart = datetime.strptime(monthLabel, '%Y-%m')
    months[monthLabel] = (monthStart - start).days
plt.yticks(list(months.values()), months.keys())
# One x tick per hour, positioned in seconds since midnight.
hourPositions = range(0, 24 * 3600, 3600)
hourLabels = [str(hour) + ":00" for hour in range(24)]
plt.xticks(hourPositions, hourLabels, rotation=45)
plt.gca().invert_yaxis()
plt.show()
I want to use this data to discover what times of the day I spend the most time coding, but it isn't very clear from the above chart so I'd like to display it as a line graph or heatmap where the y-axis is the number of days I spent coding at the time on the x-axis (or, in other words, how many sessions are present in that column of the above chart). How do I accomplish this?
You can find some great examples of how to create a heatmap from matplotlib website.
Here is a basic code with some random data:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Labels: 24 row labels (0..23) and one column label per day of January 2022.
index_labels = np.arange(24)
column_labels = pd.date_range(start='1/1/2022', end='1/31/2022').strftime('%m/%d')
# Random demo values in [0, 60), seeded so the output is reproducible.
np.random.seed(12345)
n_rows, n_cols = len(index_labels), len(column_labels)
data = np.random.randint(0, 60, size=(n_rows, n_cols))
df = pd.DataFrame(data, index=index_labels, columns=column_labels)
#heatmap function
def heatmap(df, ax, cbarlabel="", cmap="Greens", label_num_dec_place=0):
    """Draw the values of ``df`` as an image on ``ax`` with a colorbar.

    Parameters
    ----------
    df : table whose ``.values`` are shown with ``imshow``; its column
        labels become the x tick labels and its index the y tick labels.
    ax : Matplotlib axes to draw on.
    cbarlabel : text used as the colorbar's y label.
    cmap : colormap handed to ``imshow``.
    label_num_dec_place : accepted but not used by this function.

    Returns
    -------
    None
    """
    frame = df.copy()
    n_rows, n_cols = frame.shape

    # Image plus its colorbar
    image = ax.imshow(frame.values, cmap)
    colorbar = ax.figure.colorbar(
        image, ax=ax, fraction=0.05, extend='both', extendfrac=0.05
    )
    colorbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom", fontsize=14)

    # One tick per column / row, labelled from the frame itself
    ax.set_xticks(np.arange(n_cols), labels=frame.columns, fontsize=12)
    ax.set_yticks(np.arange(n_rows), labels=list(frame.index), fontsize=12)

    # Move the x ticks and their labels to the top edge
    ax.tick_params(
        axis='x', top=True, bottom=False, labeltop=True, labelbottom=False
    )

    ax.spines[:].set_visible(False)
    # NOTE(review): `visible` receives the string "False", which is truthy;
    # the following grid(False) call is what actually switches the grid off.
    ax.grid(which="both", visible="False", color="white", linestyle='solid', linewidth=2)
    ax.grid(False)

    # Rotate tick labels
    plt.setp(ax.get_xticklabels(), rotation=-60, ha="right", rotation_mode=None)
    plt.setp(ax.get_yticklabels(), rotation=30)
# Draw the heatmap on a white, high-resolution figure and write it to disk.
fig, ax = plt.subplots(facecolor=(1, 1, 1), figsize=(20, 8), dpi=200)
heatmap(df, ax, cbarlabel="time (min)", cmap="Greens", label_num_dec_place=0)
save_options = {
    'bbox_inches': 'tight',
    'facecolor': fig.get_facecolor(),
    'transparent': True,
}
plt.savefig('time_heatmap.png', **save_options)
Output:
One way to do it is to use sampling. Choose how many samples you want to take in a given interval (the precision, for example 288 samples per day) and split each interval by that number of samples and count how many sessions are within this sample. The downside to this is that it can't be 100% precise and increasing the precision increases the time it takes to generate (for me, it takes several minutes to generate a second-precise image, though this level of precision makes little to no difference to the result).
Here is some code which can produce both a heatmap and a line graph
# Configuration options
precisionPerDay = 288    # number of samples the day is split into
timeTicksPerDay = 24     # number of labelled ticks on the time axis
timeTickRotation = 60
timeTickFontSize = 6
heatmap = True           # True: one-row heatmap; False: line graph
# Constants
hoursInDay = 24
secondsInHour = 3600
secondsInDay = hoursInDay*secondsInHour
xInterval = secondsInDay/precisionPerDay  # seconds covered by one sample
# NOTE: despite its name this is the number of samples between two ticks.
timeTickSecondInterval = precisionPerDay/timeTicksPerDay
timeTickHourInterval = hoursInDay/timeTicksPerDay  # not used below
# Calculating x-axis (time) ticks
xAxis = range(precisionPerDay)
timeTickLabels = []
timeTickLocations = []
for timeTick in range(timeTicksPerDay):
    timeTickLocations.append(int(timeTick*timeTickSecondInterval))
    hours = timeTick/timeTicksPerDay*hoursInDay
    hour = int(hours)
    minute = int((hours-hour)*60)
    timeTickLabels.append(f"{hour:02d}:{minute:02d}")
# Calculating y-axis (height): for every sample, count the sessions that
# overlap the sample's time range.
heights = []
for dayX in xAxis:
    rangeStart = dayX*xInterval
    rangeEnd = rangeStart+xInterval
    y = 0
    # The loop variables must not be called `sessions`: rebinding that name
    # replaces the dictionary with a list, so the next sample's
    # `sessions.values()` lookup would raise AttributeError.
    for daySessions in sessions.values():
        for entry in daySessions:
            # entry[0] is the session start, entry[1] its duration
            if entry[0] < rangeEnd and entry[0]+entry[1] > rangeStart:
                y += 1
    heights.append(y)
# Plotting data
if heatmap:
    plt.yticks([])
    plt.imshow([heights], aspect="auto")
else:
    plt.plot(xAxis,heights)
    plt.ylim(ymin=0)
plt.xlim(xmin=0,xmax=len(heights))
plt.xlabel("Time of day")
plt.ylabel("How often I've coded at that time")
plt.xticks(timeTickLocations,timeTickLabels,
           fontsize=timeTickFontSize,rotation=timeTickRotation)
plt.show()
And here are some sample results
Graph produced by same configuration options shown in above code
Same data but as a line graph with a lower precision (24 per day) and more time ticks (48)

How to prevent "nan" from labeling in locations with missing data in geopandas?

I'm having trouble working with missing data in my csv. I would like the output below to look the same, just without the nan labels, while still preserving the borders of the states with missing data.
Output with "nan" and decimal issue:
In addition, I'm having trouble with the decimal places. I do not want these to appear. Also, I'm only trying to do this for the specific column of data I am trying to plot. Below is the code I have tried to address both of these issues:
# Load the tabular data and the state shapes
csv = pd.read_csv(r'C:\Downloads\Data.csv')
sf = r'C:\Downloads\s_11au16\s_11au16.shp'
US = gpd.read_file(sf)
# Merge them on their shared column(s)
data = gpd.GeoDataFrame(csv.merge(US))
# Set projection
data = data.to_crs(epsg=6923)
#data = data[['NAME', 'soil_data']]
#data = data[data['soil_data'].notna()]
#data.soil_data = data.soil_data.astype(int)
# Set up basemap; rows without a value are drawn using `missing_kwds`
missing_style = {"color": "white", "edgecolor": "k", "label": "none"}
ax = data.plot(
    figsize=(12, 8),
    column="soil_data",
    cmap="Greens",
    edgecolor='black',
    linewidth=.5,
    vmin=0,
    vmax=100,
    missing_kwds=missing_style,
)
#ax.set_title("Topsoil Moisture: Adequate + Surplus %", fontsize=18, fontweight='bold')
ax.set_axis_off()
# Annotate every row with its value at the centroid of its geometry
label = data
halo = [pe.withStroke(linewidth=3, foreground="white")]
for _, row in label.iterrows():
    ax.annotate(
        text=row['soil_data'],
        xy=row.geometry.centroid.coords[0],
        color="black",
        ha='center',
        fontsize=14,
        path_effects=halo,
    )
I tried to use this block code below to solve the problem, but this did not work.
data = data[['NAME', 'soil_data']]
data = data[data['soil_data'].notna()]
data.soil_data = data.soil_data.astype(int)
Again, the two problems are 1) getting nan to not label, and 2) make labels whole numbers.
Difficult to be sure without a fully reproducible example, but I would bet for something like:
# Annotate only the rows that have a value, printed as a whole number
label = data.dropna(subset=['soil_data'])
halo = [pe.withStroke(linewidth=3, foreground="white")]
for _, row in label.iterrows():
    ax.annotate(
        text=int(row['soil_data']),
        xy=row.geometry.centroid.coords[0],
        color="black",
        ha='center',
        fontsize=14,
        path_effects=halo,
    )

How to manually set x-ticks in sns with python

How is x-ticks manually set in seaborn sns in python?
This might be a duplicate of: How to set x axis ticklabels in a seaborn plot, but the solution did not work for us.
We would like the x-ticks to start from 2020-01, but as data is only available from 2020-02, it doesn't automatically start the x-ticks on 2020-01. Although it will be an empty space, we would still like to include 2020-01. Following is the function.
def create_lineplot(dataframe):
    """Plot the 2020 ``temp_20`` / ``temp_60`` readings as a line chart.

    The frame must provide the columns ``dev_id``, ``temp_20``, ``temp_60``
    and ``datetime``. The chart is written to ``lineplot.png`` and then
    shown; nothing is returned.

    The x axis gets one ``%Y-%m`` tick per month and starts at 2020-01,
    even when the first reading is later than that.
    """
    years_fmt = mdates.DateFormatter('%Y-%m')

    # Keep only the relevant columns. The copy makes sure the datetime
    # conversion below does not write into a slice of the caller's frame.
    dataframe = dataframe[['dev_id', 'temp_20', 'temp_60', 'datetime']].copy()
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    soil = dataframe[dataframe['datetime'].dt.year == 2020]

    # A scalar var_name is accepted by every pandas version.
    mdf = pd.melt(soil, id_vars=['datetime', 'dev_id'], var_name='Temperature')

    g = sns.relplot(data=mdf, x='datetime', y='value', kind='line', hue='Temperature', height=5, aspect=3)
    g._legend.remove()

    # relplot is figure-level and creates its own figure, so tick settings
    # have to be applied to the grid's axes rather than to a separate figure.
    ax = g.ax
    # Widen the axis instead of relabelling ticks, so the data is not
    # stretched into the empty month.
    ax.set_xlim(left=pd.Timestamp('2020-01-01'))
    ax.xaxis.set_major_locator(mdates.MonthLocator())  # every month
    ax.xaxis.set_major_formatter(years_fmt)
    ax.xaxis.set_minor_locator(mdates.MonthLocator())

    plt.xticks(rotation='vertical')
    plt.tight_layout()
    plt.legend(loc='upper right')
    plt.savefig('lineplot.png')
    plt.show()
When we include following:
g.set_xticklabels(['2020-01','2020-02','2020-03','2020-04','2020-05','2020-06','2020-07','2020-08', '2020-09', '2020-10', '2020-11', '2020-12'])
between
g = sns.relplot(data=mdf, x='datetime', y='value', kind='line', hue='Temperature', height=5, aspect=3)
--- HERE ---
g._legend.remove()
then the tick is added as desired, but the values are stretched so it seems like there is data in 2020-01 as well.
Following is an example of the data:
Bonus
How can the ticks be aligned after adding a new one?

Box plot with divisor in Seaborn Python

I'm trying to replicate this boxplot with seaborn. I wish to have a division like in the image. I thought that I could create separate boxplots and combine them into a single image, but that isn't a great idea computationally: it means creating many images, merging them and then deleting them all.
I used Seaborn to put the value on the box in this way
this is my function:
def boxplot(df, name,prot,min,max):
    """Save a box plot of ``RMSE`` per ``Interval`` with labelled medians.

    ``df`` must provide the columns ``Interval`` and ``RMSE``. ``name``,
    ``min`` and ``max`` are used in the title; ``prot`` and ``name`` form
    the output file name ``output/<prot><name>.jpg``. Returns nothing.
    """
    fig = plt.figure(figsize=(100, 20))
    plt.title(name + f" RMSE from {min!s}h PSW to {max!s}h PWS")
    plt.ylabel("RMSE")
    plt.xlabel("")

    box_plot = sns.boxplot(x="Interval", y="RMSE", data=df, palette="Set1", showfliers=False)
    ax = box_plot.axes
    lines = ax.get_lines()

    # NOTE(review): assumes every box contributes 5 line artists and that the
    # median is the one at offset 4 - verify against the seaborn version used.
    lines_per_box = 5
    median_offset = 4
    label_style = dict(
        ha='center',
        va='center',
        fontweight='bold',
        size=70,
        color='white',
        bbox=dict(facecolor='#445A64'),
    )
    for position in ax.get_xticks():
        median_line = lines[median_offset + position * lines_per_box]
        y = round(median_line.get_ydata()[0], 3)
        # Write the rounded median on top of its own box
        ax.text(position, y, f'{y}', **label_style)

    box_plot.figure.tight_layout()
    plt.savefig(f"output/{prot!s}{name!s}.jpg")
    plt.close(fig)
I also added this code for each colour (clumsy, I know) to give the same colour to the same element in every group. For example, for the value "15" on the x-axis I set red, and so on...
# Recolour every 12th box: offset 0 becomes red, offset 1 becomes orange.
for offset, colour in ((0, 'red'), (1, 'orange')):
    for index in range(offset, len(box_plot.artists), 12):
        ax.artists[index].set_facecolor(colour)
I tried to use "hue" for the category in my dataset (adding a column with 15, 30, ... next to the various values), but with hue the boxes end up spaced very far apart, like this, and I really don't like it.
I tried to use "order" as well, but it didn't work.
This kind of plot is called "faceting": a plot that is repeated for different levels of a categorical variable. In seaborn, you can create a FacetGrid, or use catplot, to do this kind of thing. With a bit of tweaking, you get a result that's very similar to your desired output:
# Dummy data: one column of N normal samples per (PSW, Time) pair,
# centred on the Time value.
N = 100
psws = [3, 6, 12, 24, 36]
times = [15, 30, 45, 60]
column_index = pd.MultiIndex.from_product([psws, times], names=['PSW', 'Time'])
df = pd.DataFrame(columns=column_index)
for psw in psws:
    for t in times:
        df[(psw, t)] = np.random.normal(loc=t, size=(N,))
# Data needs to be in "long-form"
df = df.melt()
g = sns.catplot(kind='box', data=df, x='Time', y='value', col='PSW', height=4, aspect=0.5, palette='Greys')
g.fig.subplots_adjust(wspace=0)
# Remove the left spine and y ticks of every facet except the leftmost one
# and draw a dashed divider at the facet's left edge instead.
for facet_ax in g.axes.flatten()[1:]:
    facet_ax.spines['left'].set_visible(False)
    for tick in facet_ax.yaxis.get_major_ticks():
        tick.set_visible(False)
    left_edge = facet_ax.get_xlim()[0]
    facet_ax.axvline(left_edge, ls='--', color='k')

Changing x-axis labels to hours instead of the sample number

I currently have a dataset of 70,000 samples (sampled at 1Hz), and I am graphing it using MatPlotLib.
I am wondering how to change the x-axis labels to be in hours, instead of sample #.
The code that I am using today is as follows:
# Load the tab-separated readings and give the three columns short names.
test = pd.read_csv("test.txt", sep='\t')
test.columns = ['TS', 'ppb', 'ppm']
test.head()
# The first couple minutes were with an empty container
# Then the apple was inserted into the container.
fig5 = plt.figure()
ax1 = fig5.add_subplot(111)
ax1.scatter(test.index, test['ppm'])
ax1.set_ylabel('(ppm)', color='b')
ax1.set_xlabel('Sampling Time', color='k')
# Second y axis sharing the same x axis, used for the ppb series
ax2 = ax1.twinx()
ax2.scatter(test.index, test['ppb'], color = 'c')
ax2.set_ylabel('(ppb)', color='c')
# show must be called; a bare `plt.show` only names the function and
# displays nothing when run as a script.
plt.show()
My data looks as follows:
If the data is sampled at 1Hz, that means that every 3600 samples is one hour. So create a new column like:
test['hours'] = (test.index - test.index[0])/3600.0

Categories