Get smooth line plot by filling missing values - python

I have multiple DataFrames (up to 30), which all contain timestamps with associated values. The timestamps in the DataFrames do not necessarily overlap, and the recorded values can only stay the same or increase. A DataFrame may look like this:
time coverage
0 0.000000 32.111748
1 0.875050 32.482579
2 1.850576 32.784133
3 3.693440 34.205134
...
I uploaded a couple of csv files with data here 1, 2, 3, 4.
So what I am trying to do is to plot the increase of the mean and median coverage values over time for all recordings, as follows:
# data is a list of dataframes
keys = ["Run " + str(i) for i in range(len(data))]
glued = pd.concat(data, keys=keys).reset_index(level=0).rename(columns={'level_0': 'Run'})
glued["roundtime"] = glued["time"] / 60
glued["roundtime"] = glued["roundtime"].round(0)  # round to whole minutes
f, (ax1, ax2) = plt.subplots(2)
my_dpi = 96
stepsize = 5
start = 0
end = 60
ax1.set_title("Mean")
ax2.set_title("Median")
f.set_size_inches(1980 / my_dpi, 1080 / my_dpi)
ax1 = sns.lineplot(x="roundtime", y="coverage", ci="sd", estimator="mean", data=glued, ax=ax1)
ax1.set(xlabel="Time", ylabel="Coverage in percent")
ax1.xaxis.set_ticks(np.arange(start, end, stepsize))
ax1.set_xlim(0, 70)
ax2 = sns.lineplot(x="roundtime", y="coverage", ci="sd", estimator='median', data=glued, ax=ax2)
ax2.set(xlabel="Time", ylabel="Coverage in percent")
ax2.xaxis.set_ticks(np.arange(start, end, stepsize))
ax2.set_xlim(0, 70)
plt.show()
The result looks like this.
However, the curve should never decrease, since the "coverage" values themselves can never decrease. The reason, I suspect, is that at certain points in time only some DataFrames have recordings, and those happen to have lower values, so the mean/median dips.
I tried to fix this by aligning the indices of all the DataFrames and filling missing values with previous recordings, before doing any of the previous code. Like this:
# create a common index
index = None
for df in data:
    df.set_index("time", inplace=True, drop=False)
    if index is not None:
        index = index.union(df.index)
    else:
        index = df.index
# reindex all dataframes and fill missing values
new_data = []
for df in data:
    print(df)
    new_df = df.reindex(index, fill_value=np.NaN)
    new_df = new_df.fillna(method="ffill")
    new_data.append(new_df)
data = new_data
The result, however, does not change much and still decreases at certain times. It looks like this:
Is this approach wrong or am I simply missing something?
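One possible culprit I noticed while writing this up (unverified): set_index("time", drop=False) keeps time as a column, so reindex introduces NaNs in that column too, and the ffill then fills them with stale timestamps; roundtime would then be computed from those stale values rather than from the new index. A sketch of the fill step with time rebuilt from the index:
# reindex each run, forward-fill coverage, and rebuild "time" from the new index
new_data = []
for df in data:
    new_df = df.reindex(index)                 # missing rows appear as NaN
    new_df["coverage"] = new_df["coverage"].ffill()
    new_df["time"] = new_df.index              # actual timestamps, not forward-filled ones
    new_data.append(new_df)
data = new_data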

Related

Create a function that will accept a DataFrame as input and return pie-charts for all the appropriate Categorical features

I can create one pie chart using the 'Churn' column to group the data; however, I am not sure how to create a function that will accept a DataFrame as input and return pie charts for all the appropriate categorical features, showing the percentage distribution in each pie chart.
As the DataFrame, I am using "Telco-Customer-Churn.csv":
f, axes = plt.subplots(1, 2, figsize=(17, 7))
df_churn['Churn'].value_counts().plot.pie(autopct='%1.1f%%', ax=axes[0])
sns.countplot(x='Churn', data=df_churn, ax=axes[1])
axes[0].set_title('Categorical Variable Pie Chart')
plt.show()
I did something like this, though I am not sure it is right:
#%% PlotMultiplePie
# Input: df = pandas DataFrame, categorical_features = list of features,
#        dropna = boolean, whether to count NaN as its own category
# Output: shows multiple px.pie() figures
def PlotMultiplePie(df_churn, categorical_features=None, dropna=False):
    # features with more than `threshold` unique values tend to make unreadable pie charts
    threshold = 40
    # if the user did not set categorical_features, infer them from the dtypes
    if categorical_features is None:
        categorical_features = df_churn.select_dtypes(['object', 'category']).columns.to_list()
        print(categorical_features)
    # loop through the list of categorical features
    for cat_feature in categorical_features:
        num_unique = df_churn[cat_feature].nunique(dropna=dropna)
        num_missing = df_churn[cat_feature].isna().sum()
        # print a pie chart and info if the number of unique values is below the threshold
        if num_unique <= threshold:
            print('Pie Chart for: ', cat_feature)
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
            counts = df_churn[cat_feature].value_counts(dropna=dropna)
            fig = px.pie(counts, values=cat_feature, names=counts.index,
                         title=cat_feature, template='ggplot2')
            fig.show()
        else:
            print('Pie Chart for ', cat_feature, ' is unavailable due to the high number of unique values')
            print('Number of Unique Values: ', num_unique)
            print('Number of Missing Values: ', num_missing)
        print('\n')
This worked for me. I defined a function to plot pie charts for all categorical variables in a dataframe.
# Function to plot pie charts for all categorical variables in the dataframe
def pie_charts_for_CategoricalVar(df_pie, m):
    '''Takes in a dataframe (df_pie) and plots pie charts for all categorical
    columns. m = number of columns required in the subplot grid.'''
    # get all the column names in the dataframe
    a = []
    for i in df_pie:
        a.append(i)
    # isolate the categorical variable names from a into b
    b = []
    for i in a:
        if df_pie[i].dtype.name == 'category':
            b.append(i)
    plt.figure(figsize=(15, 12))
    plt.subplots_adjust(hspace=0.2)
    plt.suptitle("Pie-Charts for Categorical Variables in the dataframe", fontsize=18, y=0.95)
    # number of columns, as passed into the function
    ncols = m
    # calculate the number of rows needed
    nrows = len(b) // ncols + (len(b) % ncols > 0)
    # loop through 'b' and keep track of the index
    for n, i in enumerate(b):
        # add a new subplot iteratively using nrows and ncols
        ax = plt.subplot(nrows, ncols, n + 1)
        # plot the size of each group of column 'i' on the new subplot axis
        df_pie.groupby(i).size().plot(kind='pie', autopct='%.2f%%', ax=ax)
        ax.set_title(i.upper())
        ax.set_xlabel("")
        ax.set_ylabel("")
    plt.show()

# calling the function to plot pie charts for the categorical variables
pie_charts_for_CategoricalVar(df, 5)  # dataframe, no. of cols in the grid
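As a side note, the two bookkeeping loops above can be collapsed into a single select_dtypes call (a sketch; equivalent as long as the categorical columns really carry the 'category' dtype):
# columns with the pandas 'category' dtype, as a plain list
b = df_pie.select_dtypes(include='category').columns.to_list()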

Grouped Column Operations in Python using Pandas

I have a data frame consisting of a .csv import that contains n trials. Trials are arranged by column with a header (wavelength1 for trial 1, wavelength2 for trial 2, etc.). We're tracking the absorption of a solution over time during a chemical reaction. You can see a screenshot of the Excel file in the link. Trials are grouped into threes (with grams of sugar being the independent variable and the absorbance in nm being the dependent variable). For each trial:
I need to determine what the maximum and minimum values are. This can of course be done using max() and min() but when we are sampling every 0.25 seconds, the data can be noisy, meaning that I have to smooth it out. I have already built a function to do that. We're also probably just going to sample every one second as it's much smoother anyway.
Each group of three trials needs to be plotted on the same graph for comparison. n number of trials will create n/3 graphs.
I'm coming from an intermediate background in MATLAB. This is not something I was ever able to figure out there, either.
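For illustration, smoothing of the kind mentioned above can be as simple as a rolling mean (a sketch; the window width is a judgment call):
import pandas as pd

data = pd.read_csv('data.csv')
# centered 5-point rolling mean over one trial's absorbance readings
smoothed = data['wavelength1'].rolling(window=5, center=True).mean()
print(smoothed.min(), smoothed.max())  # extremes of the smoothed trace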
What have I done so far?
I have attempted to make a list out of the header for each trial, and then use a for loop to move through the data using the df.column_name command:
data = pd.read_csv('data.csv')
col_name = data.columns.values
print(col_name)
for i in col_name:
    print(data.col_name[i])
The code works up to the 4th line, where it returns the error AttributeError: 'DataFrame' object has no attribute 'col_name'. This is where I would like to make a Series (or whatever the right structure is called here) with all of the values from the wavelength1 trial, to plot/manipulate/etc. It's worth noting that I have gotten the multiple plots and multiple lines to work manually, but I want to automate it, as that's of course the point of coding. Here's one out of four graphs of the 'manual' version:
import pandas as pd
import matplotlib.pyplot as plt
#import matplotlib as matplotlib
data = pd.read_csv('data.csv')
plt.close("all")
n_rows = 2
n_columns = 2
#initialize figure
figure_size = (30,15)
font_size = 13
f, ([plt1, plt2], [plt3, plt4]) = plt.subplots(n_rows,n_columns, figsize = figure_size)
#plot first three runs
x=data.time1
y=data.wavelength1
plt1.plot(x,y, label='Trial 1')
x=data.time2
y=data.wavelength2
plt1.plot(x,y,label='Trial 2')
plt1.set_title('0.3g Glucose', fontweight="bold", size=font_size)
x=data.time3
y=data.wavelength3
plt1.plot(x,y,label='Trial 3')
plt1.set_ylabel('Wavelength (nm)', fontsize = font_size)
plt1.set_xlabel('Time (s)', fontsize = font_size)
plt1.legend(fontsize=font_size)
My first thought was just to do:
for i in range(0, num_col):
    plot(time, data.wavelength(i))
But this does not work. I'm sure it's something quite simple but it is escaping me.
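A minimal sketch of what would work (assuming matching time{i}/wavelength{i} columns, as in the manual version) is bracket indexing with string column names built in a loop:
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('data.csv')
fig, ax = plt.subplots()
for i in range(1, 4):  # trials 1-3, i.e. the first group
    # data['wavelength1'] works where data.wavelength(1) does not
    ax.plot(data[f'time{i}'], data[f'wavelength{i}'], label=f'Trial {i}')
ax.legend()
plt.show()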
Example data:
https://ufile.io/ac226vma
Thanks in advance!
Analysis
I need to determine what the maximum and minimum values are.
Since you want the extreme values within each trial, and each trial is represented by one column, you can use DataFrame.min() to get the smallest value in each column and DataFrame.max() for the largest. If you want to know the index of an extreme value, you can throw in idxmin() / idxmax() too.
df = pd.read_csv("data.csv")
# Get max and min values
print("ANALYSIS OF MIN AND MAX VALUES")
analysis_df = pd.DataFrame()
analysis_df["min"] = df.min()
analysis_df["min_idx"] = df.idxmin()
analysis_df["max"] = df.max()
analysis_df["max_idx"] = df.idxmax()
print(analysis_df)
produces:
ANALYSIS OF MIN AND MAX VALUES
min min_idx max max_idx
wavelength1 801.0 120 888.0 4
wavelength2 809.0 85 888.0 1
wavelength3 728.0 96 837.0 1
wavelength4 762.0 114 864.0 3
wavelength5 785.0 115 878.0 2
wavelength6 747.0 118 866.0 1
wavelength7 748.0 119 851.0 3
wavelength8 776.0 113 880.0 0
wavelength9 812.0 112 900.0 0
wavelength10 770.0 110 863.0 1
wavelength11 759.0 100 858.0 0
wavelength12 787.0 91 876.0 0
wavelength13 756.0 66 862.0 2
wavelength14 809.0 70 877.0 1
wavelength15 828.0 62 866.0 0
Plotting
Each group of three trials needs to be plotted on the same graph for comparison. n number of trials will create n/3 graphs.
This is easier if you break it up into a few smaller subproblems.
First, you want to take a list of all of your columns and break them up into groups of three. I copied the code to do this from here.
def grouper(n, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return itertools.zip_longest(fillvalue=fillvalue, *args)
Now, once we have a group of three column names, we need to get the values within the dataframe associated with those columns. Also, since your datafile contains unequal numbers of observations per trial, we need to get rid of the NaN's at the end of the file.
def get_trials(df, column_group_names):
    """Get columns from dataframe, dropping missing values."""
    column_group = df[list(column_group_names)]
    column_group = column_group.dropna(how='all')
    return column_group
Now, let's combine those two functions:
col_iterator = grouper(3, df.columns)
[...]
for column_group_names in col_iterator:
    column_group = get_trials(df, column_group_names)
    [...]
This will let us loop over the columns in groups of three, and plot them individually. Since we've filtered it down to the data we're interested in, we can use DataFrame.plot to plot it to the matplotlib plot.
Next, we need to loop over the subplots. This is a little annoying to do while also looping over groups, so I like to define an iterator.
def subplot_axes_iterator(n_rows, n_columns):
    for i in range(n_rows):
        for j in range(n_columns):
            yield i, j
Example of it in use:
>>> list(subplot_axes_iterator(2, 2))
[(0, 0), (0, 1), (1, 0), (1, 1)]
Now, combine those pieces:
# Plot data
n_rows = 2
n_columns = 3
figure_size = (15, 10)
font_size = 13
fig, axes = plt.subplots(n_rows, n_columns, figsize=figure_size)
col_iterator = grouper(3, df.columns)
axes_iterator = subplot_axes_iterator(n_rows, n_columns)
plot_names = [
    "Group 1",
    "Group 2",
    "Group 3",
    "Group 4",
    "Group 5",
]
for column_group_names, axes_position, plot_name in \
        zip(col_iterator, axes_iterator, plot_names):
    print(f"plotting {column_group_names} at {axes_position}")
    column_group = get_trials(df, column_group_names)
    column_group.plot(ax=axes[axes_position])
    axes[axes_position].set_title(plot_name, fontweight="bold", size=font_size)
    axes[axes_position].set_xlabel("Time (s)", fontsize=font_size)
    axes[axes_position].set_ylabel("Wavelength (nm)", fontsize=font_size)
plt.tight_layout()
plt.show()
(By the way, you said that you want 4 graphs, but the dataset posted has fifteen trials, so I made 5 graphs.)
Final script
(Included for easy copy/paste.)
import pandas as pd
import matplotlib.pyplot as plt
import itertools
def grouper(n, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return itertools.zip_longest(fillvalue=fillvalue, *args)

def get_trials(df, column_group_names):
    """Get columns from dataframe, dropping missing values."""
    column_group = df[list(column_group_names)]
    column_group = column_group.dropna(how='all')
    return column_group

def subplot_axes_iterator(n_rows, n_columns):
    for i in range(n_rows):
        for j in range(n_columns):
            yield i, j
df = pd.read_csv("data.csv")
# Get max and min values
print("ANALYSIS OF MIN AND MAX VALUES")
analysis_df = pd.DataFrame()
analysis_df["min"] = df.min()
analysis_df["min_idx"] = df.idxmin()
analysis_df["max"] = df.max()
analysis_df["max_idx"] = df.idxmax()
print(analysis_df)
# Plot data
n_rows = 2
n_columns = 3
figure_size = (15, 10)
font_size = 13
fig, axes = plt.subplots(n_rows, n_columns, figsize=figure_size)
col_iterator = grouper(3, df.columns)
axes_iterator = subplot_axes_iterator(n_rows, n_columns)
plot_names = [
    "Group 1",
    "Group 2",
    "Group 3",
    "Group 4",
    "Group 5",
]
for column_group_names, axes_position, plot_name in \
        zip(col_iterator, axes_iterator, plot_names):
    print(f"plotting {column_group_names} at {axes_position}")
    column_group = get_trials(df, column_group_names)
    column_group.plot(ax=axes[axes_position])
    axes[axes_position].set_title(plot_name, fontweight="bold", size=font_size)
    axes[axes_position].set_xlabel("Time (s)", fontsize=font_size)
    axes[axes_position].set_ylabel("Wavelength (nm)", fontsize=font_size)
plt.tight_layout()
plt.show()

How would I offset a pandas column of data by different amounts?

I am plotting some columns within a pandas dataframe using matplotlib. I have a strategy for plotting whereby I zero each variable to its initial value and then offset each chosen variable by a set amount. For example, this is my current plotting method:
fig, ax = plt.subplots()
# data is in a dataframe called inputData; TimeIndex holds the name of its time column
timeseries_plots = ['var1', 'var3', 'var8']
offsetFactor = 20
for ii, var in enumerate(timeseries_plots):
    # reference value: the first non-null entry of this variable
    offsetRef = inputData[var].loc[~inputData[var].isnull()].iloc[0]
    ax.plot(inputData[TimeIndex],
            offsetFactor * (len(timeseries_plots) - ii - 1) + inputData[var] - offsetRef,
            label=var, markersize=1, marker='None', linestyle='solid')
plt.show()
This produces something like this (with some matplotlib finessing):
As you can see, it removes the offsetRef (which in this case is the initial value of the variable), and then adds a constant offsetFactor (equal to 20 in this case) to each variable. The result is lines which start vertically offset by 20.
However, this can be a problem when the values start to drift over time, and one variable might cross another. What I'd like to do is reset the vertical offset - such as by changing the offsetRef beyond a certain date.
I have tried to do this in the following way. I start by initialising an array equal in size to the variable, and then fill it with the offsetRef recalculated at the resetDates. I've included comments marked #PSEUDOCODE where I'm roughly writing what I want to do; sorry in advance for them being pretty rough. Thank you in advance!
fig, ax = plt.subplots()
inputData = pd.DataFrame(np.random.randint(100, size=(100, 5)),
                         columns=['timestamp', 'var2', 'var3', 'var4', 'var5'])
inputData['timestamp'] = pd.date_range('2020-may-01', '2020-aug-08')
timeseries_plots = ['var2', 'var3', 'var4']
offsetFactor = 20
resetDates = ['2020-jun-23', '2020-jul-05']
for ii, var in enumerate(timeseries_plots):
    offsetRef = np.zeros(inputData[var].size)
    for tt, ttdate in enumerate(resetDates):
        if tt == 0:
            # PSEUDOCODE: offsetRef[inputData['timestamp'] < resetDates[tt]] = inputData[var].loc[~inputData[var].isnull()].iloc[0]
            # PSEUDOCODE: offsetRef[inputData['timestamp'] >= resetDates[tt]] = inputData[var].loc[~inputData[var].isnull()].iloc[ttdate]
            pass
        else:
            # PSEUDOCODE: offsetRef[inputData['timestamp'] >= resetDates[tt]] = inputData[var].loc[~inputData[var].isnull()].iloc[ttdate]
            pass
    ax.plot(inputData['timestamp'],
            offsetFactor * (len(timeseries_plots) - ii - 1) + inputData[var] - offsetRef,
            label=var, markersize=1, marker='None', linestyle='solid')
plt.show()
This is the current solution that I'll stick here so that it might be useful to others:
# set up df
inputData = pd.DataFrame(np.random.randint(100, size=(100, 5)),
                         columns=['timestamp', 'var2', 'var3', 'var4', 'var5'])
inputData['timestamp'] = pd.date_range('2020-may-01', '2020-aug-08')
inputData['var2'] = np.arange(0, 100, 1)
inputData.loc[:2, 'var2'] = 49
inputData['var4'] = np.arange(0, 200, 2)
inputData.loc[:2, 'var2'] = np.nan
# set constants and settings
dispFactor = 20
timeseries_plots = ['var2', 'var4']
resetDates = ['2020-05-05', '2020-05-20', '2020-08-04']
offsetFactor = dispFactor
# begin
fig, ax = plt.subplots()
for ii, var in enumerate(timeseries_plots):
    offsetRef = np.zeros(inputData[var].size)
    for tt, ttdate in enumerate(resetDates):
        if tt == 0:
            if inputData[var].loc[inputData['timestamp'] == ttdate].isna().bool():  # if the value at this date is NaN
                print('a', inputData[var].loc[~inputData[var].isnull()].iloc[0],
                      inputData[var].bfill().loc[inputData['timestamp'] == ttdate])
                offsetRef[inputData['timestamp'] < ttdate] = inputData[var].loc[~inputData[var].isnull()].iloc[0]
                offsetRef[inputData['timestamp'] >= ttdate] = inputData[var].bfill().loc[inputData['timestamp'] == ttdate]
            else:
                print('b', inputData[var].loc[~inputData[var].isnull()].iloc[0],
                      inputData[var].loc[inputData['timestamp'] == ttdate])
                offsetRef[inputData['timestamp'] < ttdate] = inputData[var].loc[~inputData[var].isnull()].iloc[0]
                offsetRef[inputData['timestamp'] >= ttdate] = inputData[var].loc[inputData['timestamp'] == ttdate]
        else:
            if inputData[var].loc[inputData['timestamp'] == ttdate].isna().bool():  # if the value at this date is NaN
                print('c')
                offsetRef[inputData['timestamp'] >= resetDates[tt]] = inputData[var].bfill().loc[inputData['timestamp'] == ttdate]
            else:
                print('d', inputData[var].loc[inputData['timestamp'] == ttdate])
                offsetRef[inputData['timestamp'] >= resetDates[tt]] = inputData[var].loc[inputData['timestamp'] == ttdate]
    print(offsetRef)
    ax.plot(inputData['timestamp'],
            offsetFactor * (len(timeseries_plots) - ii - 1) + inputData[var] - offsetRef)
plt.show()
This 'resets' the offset to 20 at the chosen resetDates to produce the following figure:
I possibly don't need the if-logic to catch NaN data (and could just rely on .bfill() working in either case), but this way feels safer. I will edit as I improve the solution.
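A cleaner variant I may switch to (a sketch; it assumes every reset date actually appears in the timestamp column) replaces the per-date branching with a segment lookup:
import numpy as np
import pandas as pd

def stepwise_offset_ref(series, timestamps, reset_dates):
    """Reference value per row: the series value (backfilled over NaNs) at the
    most recent reset date, or the first non-null value before any reset."""
    filled = series.bfill()
    # value to subtract within each segment: first valid value, then the value at each reset date
    seg_values = [series.dropna().iloc[0]] + [
        filled[timestamps == d].iloc[0] for d in pd.to_datetime(reset_dates)
    ]
    # segment index for every timestamp (0 = before the first reset date)
    seg_idx = np.searchsorted(pd.to_datetime(reset_dates).values,
                              timestamps.to_numpy(), side='right')
    return np.asarray(seg_values)[seg_idx]

# usage with the toy data above:
# offsetRef = stepwise_offset_ref(inputData['var2'], inputData['timestamp'], resetDates)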

How to plot data frame columns with exactly two unique values?

I am trying to loop through a Pandas data frame and produce a bar chart only for columns that contain exactly two unique values. I envision the final bar chart to contain the two unique values on the X axis, and the Y axis to show the number of rows.
I've been able to produce a Series off my data frame (df_clean) which shows me the number of unique values per column:
col_values = df_clean.apply(lambda x: len(x.unique()))
But I am completely lost on how to:
loop through my df_clean to plot only the columns with two unique values
produce multiple graphs in one figure (I think matplotlib subplot would help?)
In the same code, I have been able to successfully loop through my df_clean and plot all the int and float type columns. I am struggling with how to modify this working code for the above issue.
i = 1
c_num_cols = len(df_clean.select_dtypes(["int64", "float64"]).columns)
for column in df_clean.select_dtypes(["int64", "float64"]).columns:
    plt.subplot(c_num_cols, (c_num_cols % 2) + 1, i)
    plt.subplots_adjust(hspace=0.5)
    df_clean[column].plot(kind='hist', figsize=[15, c_num_cols * 4], title=column)
    i += 1
Try using Series.nunique and Series.value_counts:
binary_cols = df.nunique()[lambda x: x == 2].index
for i, col in enumerate(binary_cols):
    plt.subplot(len(binary_cols), (len(binary_cols) % 2) + 1, i+1)
    plt.subplots_adjust(hspace=0.5)
    df[col].value_counts().plot(kind='bar')
Example
# Setup
df = pd.DataFrame({'col1': list('aaaaaaabbbbbbbb'),
                   'col2': list('aaabbbcccdddeee'),
                   'col3': [1] * 9 + [3] * 6})

binary_cols = df.nunique()[lambda x: x == 2].index
for i, col in enumerate(binary_cols):
    plt.subplot(len(binary_cols), (len(binary_cols) % 2) + 1, i+1)
    plt.subplots_adjust(hspace=0.5)
    df[col].value_counts().plot(kind='bar')
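With this setup, only col1 and col3 have exactly two unique values, so binary_cols can be sanity-checked before plotting:
print(binary_cols)  # Index(['col1', 'col3'], dtype='object')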

Averaging several time-series together with confidence interval (with test code)

Sounds very complicated but a simple plot will make it easy to understand:
I have three curves of cumulative sum of some values over time, which are the blue lines.
I want to average (or somehow combine in a statistically correct way) the three curves into one smooth curve and add confidence interval.
I tried one simple solution: combining all the data into one curve, averaging it with the "rolling" function in pandas, and getting the standard deviation for it. I plotted those as the purple curve with the confidence interval around it.
The problem with my real data, as illustrated in the plot above, is that the curve isn't smooth at all; there are also sharp jumps in the confidence interval, which isn't a good representation of the 3 separate curves, as there are no jumps in them.
Is there a better way to represent the 3 different curves in one smooth curve with a nice confidence interval?
I supply a test code, tested on python 3.5.1 with numpy and pandas (don't change the seed in order to get the same curves).
There are some constraints - increasing the number of points for the "rolling" function isn't a solution for me, because some of my data is too short for that.
Test code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
np.random.seed(seed=42)
## data generation - cumulative analysis over time
df1_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df1_values = pd.DataFrame(np.random.randint(0,10000,size=100), columns=['vals'])
df1_combined_sorted = pd.concat([df1_time, df1_values], axis = 1).sort_values(by=['time'])
df1_combined_sorted_cumulative = np.cumsum(df1_combined_sorted['vals'])
df2_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df2_values = pd.DataFrame(np.random.randint(1000,13000,size=100), columns=['vals'])
df2_combined_sorted = pd.concat([df2_time, df2_values], axis = 1).sort_values(by=['time'])
df2_combined_sorted_cumulative = np.cumsum(df2_combined_sorted['vals'])
df3_time = pd.DataFrame(np.random.uniform(0,1000,size=50), columns=['time'])
df3_values = pd.DataFrame(np.random.randint(0,4000,size=100), columns=['vals'])
df3_combined_sorted = pd.concat([df3_time, df3_values], axis = 1).sort_values(by=['time'])
df3_combined_sorted_cumulative = np.cumsum(df3_combined_sorted['vals'])
## combining the three curves
df_all_vals_cumulative = pd.concat([df1_combined_sorted_cumulative,
    df2_combined_sorted_cumulative, df3_combined_sorted_cumulative]).reset_index(drop=True)
df_all_time = pd.concat([df1_combined_sorted['time'],
df2_combined_sorted['time'], df3_combined_sorted['time']]).reset_index(drop=True)
df_all = pd.concat([df_all_time, df_all_vals_cumulative], axis = 1)
## creating confidence intervals
df_all_sorted = df_all.sort_values(by=['time'])
ma = df_all_sorted.rolling(10).mean()
mstd = df_all_sorted.rolling(10).std()
## plotting
plt.fill_between(df_all_sorted['time'], ma['vals'] - 2 * mstd['vals'],
                 ma['vals'] + 2 * mstd['vals'], color='b', alpha=0.2)
plt.plot(df_all_sorted['time'],ma['vals'], c='purple')
plt.plot(df1_combined_sorted['time'], df1_combined_sorted_cumulative, c='blue')
plt.plot(df2_combined_sorted['time'], df2_combined_sorted_cumulative, c='blue')
plt.plot(df3_combined_sorted['time'], df3_combined_sorted_cumulative, c='blue')
plt.show()
First of all, your sample code could be re-written to make better use of pandas. For example:
np.random.seed(seed=42)
## data generation - cumulative analysis over time
def get_data(max_val, max_time=1000):
    times = pd.DataFrame(np.random.uniform(0, max_time, size=50), columns=['time'])
    vals = pd.DataFrame(np.random.randint(0, max_val, size=100), columns=['vals'])
    df = pd.concat([times, vals], axis=1).sort_values(by=['time']).\
        reset_index().drop('index', axis=1)
    df['cumulative'] = df.vals.cumsum()
    return df
# generate the dataframes
df1,df2,df3 = (df for df in map(get_data, [10000, 13000, 4000]))
dfs = (df1, df2, df3)
# join
df_all = pd.concat(dfs, ignore_index=True).sort_values(by=['time'])
# render function
def render(window=10):
    # compute rolling means and confidence intervals
    mean_val = df_all.cumulative.rolling(window).mean()
    std_val = df_all.cumulative.rolling(window).std()
    min_val = mean_val - 2*std_val
    max_val = mean_val + 2*std_val
    plt.figure(figsize=(16, 9))
    for df in dfs:
        plt.plot(df.time, df.cumulative, c='blue')
    plt.plot(df_all.time, mean_val, c='r')
    plt.fill_between(df_all.time, min_val, max_val, color='blue', alpha=.2)
    plt.show()
The reason your curves aren't that smooth may be that your rolling window is not large enough. You can increase the window size to get smoother graphs. For example, render(20) gives:
while render(30) gives:
A better way, though, might be to interpolate each df['cumulative'] over the entire time window and compute the mean/confidence interval on these series. With that in mind, we can modify the code as follows:
np.random.seed(seed=42)
## data generation - cumulative analysis over time
def get_data(max_val, max_time=1000):
    times = pd.DataFrame(np.random.uniform(0, max_time, size=50), columns=['time'])
    vals = pd.DataFrame(np.random.randint(0, max_val, size=100), columns=['vals'])
    # note that we set time as the index of the returned data
    df = pd.concat([times, vals], axis=1).dropna().set_index('time').sort_index()
    df['cumulative'] = df.vals.cumsum()
    return df

df1, df2, df3 = (df for df in map(get_data, [10000, 13000, 4000]))
dfs = (df1, df2, df3)

# rename columns for later plotting
for i, df in zip(range(3), dfs):
    df.rename(columns={'cumulative': f'cumulative_{i}'}, inplace=True)
# concatenate the dataframes with common time index
df_all = pd.concat(dfs,sort=False).sort_index()
# interpolate each cumulative column linearly
df_all.interpolate(inplace=True)
# plot graphs
mean_val = df_all.iloc[:,1:].mean(axis=1)
std_val = df_all.iloc[:,1:].std(axis=1)
min_val = mean_val - 2*std_val
max_val = mean_val + 2*std_val
fig, ax = plt.subplots(1,1,figsize=(16,9))
df_all.iloc[:,1:4].plot(ax=ax)
plt.plot(df_all.index, mean_val, c='purple')
plt.fill_between(df_all.index, min_val, max_val, color='blue', alpha=.2)
plt.show()
and we get:
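One refinement worth considering: with only three curves, mean ± 2·std is a fairly rough band. If a proper confidence interval for the mean is wanted (the "statistically correct" combination the question asks about), a Student-t interval is a common choice; a sketch on top of the interpolated df_all (assumes scipy is available):
import numpy as np
import scipy.stats as st

n = 3  # number of curves being averaged
mean_val = df_all.iloc[:, 1:].mean(axis=1)
sem_val = df_all.iloc[:, 1:].std(axis=1) / np.sqrt(n)
# two-sided 95% Student-t interval with n-1 degrees of freedom
t_crit = st.t.ppf(0.975, df=n - 1)
min_val = mean_val - t_crit * sem_val
max_val = mean_val + t_crit * sem_val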
