I have multiple datasources that report a true/false value at specific timestamps like this:
{1338: [
(1377259958, False),
(1378703557, True)],
1343: [
(1377259911, True),
(1377812511, False),
(1377814321, True)],
1354: [
(1377260040, False),
(1377296033, True),
(1377382446, False),
(1377566041, True),
(1377582236, False),
(1377638031, True),
(1377641637, False),
(1377652434, True),
(1377814443, False),
(1377987234, True),
(1378073645, False),
(1378160039, True),
(1378246440, False),
(1378257238, True),
(1378341839, False),
(1378421045, True),
(1378514636, False),
(1378613637, True)],
1431: [
(1377260039, False),
(1377729842, True),
(1377731646, False),
(1378703641, True)]
}
Now I would like to plot this data in one graph, with each data source on the y axis and time on the x axis. For every data source, the time periods between state changes (from True to False and back) should be coloured.
At the moment I have this:
In reality there are more timestamps and many more data sources, but the procedure is always the same.
I'm using matplotlib with the following script:
def timelines(y, xstart, xstop, ok):
    """Draw one horizontal timeline segment with short ticks at both ends.

    Parameters
    ----------
    y : int
        Row (data-source index) on the y axis.
    xstart, xstop : number
        Segment start/end positions on the x axis (epoch timestamps).
    ok : bool
        Segment state; True is drawn red ('r'), False green ('g').
    """
    # NOTE(review): 'r' for ok=True looks inverted, but the callers pass
    # `not ts[1]` for the leading/trailing segments to compensate —
    # confirm the intended colour mapping before changing it.
    color = 'r' if ok else 'g'
    print('Graph', y, xstart, xstop, color)
    plt.hlines(y, xstart, xstop, color, lw=3)
    # Short vertical ticks mark the segment boundaries.
    plt.vlines(xstart, y + 0.45, y - 0.45, color, lw=0.5)
    plt.vlines(xstop, y + 0.45, y - 0.45, color, lw=0.5)
# NOTE(review): this script was pasted with its indentation lost; the loop
# bodies below are shown flush-left but must be re-indented to run.
# It is Python 2 code (print statements, dict.iteritems()).
# --- Pass 1: find the overall time window across all probes -----------------
maxtime = 0
mintime = time()
for probe, tl in timeline.iteritems():
# Each tl is a list of (timestamp, state) tuples; itemgetter(0) keys on time.
newmax = max(tl, key=itemgetter(0))[0]
newmin = min(tl, key=itemgetter(0))[0]
if newmax > maxtime:
print "New maximum time: %s (Old: %s)" % (newmax, maxtime)
maxtime = newmax
if newmin < mintime:
print "New minimum time: %s (Old: %s)" % (newmin, mintime)
mintime = newmin
# --- Pass 2: draw one row of coloured segments per probe --------------------
maxprobe = 0
probelist = []
for probe, tl in timeline.iteritems():
print probe, tl
# maxprobe doubles as the y-axis row index for this probe.
maxprobe += 1
probelist.append(probe)
first = True
# Every row starts at the global minimum so rows share an x range.
startpoint = mintime
for ts in tl:
# Events at/before the window start carry no drawable segment.
if ts[0] <= mintime:
first = False
continue
print maxprobe, startpoint, ts[0], ts[1]
if first:
# Leading segment: state before the first event is the inverse
# of that event's new state.
first = False
timelines(maxprobe, int(startpoint), int(ts[0]), not ts[1])
else:
timelines(maxprobe, int(startpoint), int(ts[0]), ts[1])
startpoint=ts[0]
# Trailing segment from the last event to the global maximum time.
if startpoint < maxtime:
print maxprobe, 'End', startpoint, maxtime, ts[1]
timelines(maxprobe, int(startpoint), maxtime, not ts[1])
# --- Axis labels and output -------------------------------------------------
# Empty first label so row 1 lines up with the first probe name.
label = ['']
label += probelist
plt.ylim((0, maxprobe+1))
plt.yticks(np.arange(0,maxprobe+1), label)
fig = plt.gcf()
# Height scales with the number of probes; width is fixed.
fig.set_size_inches(18.5,0.15*maxprobe+1)
plt.savefig('timeline.png')
What I didn't figure out is how I would display a formatted date instead of the timestamp in the xaxis. Also, is there another way to scale the image instead of set_size_inches?
Related
# Build a [metric, status] summary table from the `output` dict, then render
# it as a ReportLab Table. Each section classifies one metric group as
# Green/Red; a bare `except` downgrades the row to 'NA' if anything fails.
# NOTE(review): the bare `except:` clauses swallow every error (including
# typos/KeyErrors) — narrowing them to KeyError/ValueError would be safer.
# NOTE(review): pasted with indentation lost; bodies below are flush-left.
summary = [['Metrics','Status']]
try:
# Response times: Green when the numeric prefix of 'value' is < 1000.
for i in output['responsetimes']:
if i['metric'] == 'ResponseTime':
k = i['value'].split(' ')
if int(k[0])<1000:
temp = ['Response Times','Green']
summary.append(temp)
else:
temp = ['Response Times','Red']
summary.append(temp)
except:
summary.append(['Response Times','NA'])
try:
# Runtime dumps: Green only when the shortdump frequency is exactly 0.
for i in output['runtimedumps']:
if i['metric'] == 'Shortdumps Frequency':
k = i['value'].split(' ')
if int(k[0])==0:
temp = ['Runtime Dumps','Green']
summary.append(temp)
else:
temp = ['Runtime Dumps','Red']
summary.append(temp)
except:
summary.append(['Runtime Dumps','NA'])
try:
# Buffer: collect per-entry verdicts; one bad hit ratio makes the row Red.
temp = []
for i in output['buffer']:
if (i['metric'] == 'HitRatio'):
k = i['value'].split(' ')
if int(k[0])>95:
temp.append('green')
else:
temp.append('red')
if 'red' in temp:
summary.append(['Buffer','Red'])
else:
summary.append(['Buffer','Green'])
except:
summary.append(['Buffer','NA'])
try:
# Update monitoring: Green when there are no UD1 work-process errors.
for i in output['updatemonitoring']:
if i['metric'] == 'ErrorsInWpUD1':
if int(i['value'])==0:
temp = ['Update Monitoring','Green']
summary.append(temp)
else:
temp = ['Update Monitoring','Red']
summary.append(temp)
except:
summary.append(['Update Monitoring','NA'])
try:
# Memory: Red when less than 5% of physical memory is free.
for i in output['memory']:
if i['metric'] == 'Physical':
total = int(i['value'].split(' ')[0])
if i['metric'] == 'Free (Value)':
free = int(i['value'].split(' ')[0])
if int((free*100)/total)<5:
summary.append(['Memory Utilization','Red'])
else:
summary.append(['Memory Utilization','Green'])
except:
# NOTE(review): every other section falls back to 'NA' on error, but this
# one appends 'Green' — confirm whether that optimistic default is intended.
summary.append(['Memory Utilization','Green'])
try:
# CPU: Red when utilisation exceeds 80.
for i in output['cpu']:
if i['metric'] == 'CPU_Utilization':
used = int(i['value'].split(' ')[0])
if used>80:
summary.append(['CPU Utilization','Red'])
else:
summary.append(['CPU Utilization','Green'])
except:
summary.append(['CPU Utilization','NA'])
try:
# File systems: one filesystem above 85% full makes the whole row Red.
temp = []
for i in output['fs']:
if int(i['perc'].split(' ')[0])>85:
temp.append('red')
else:
temp.append('green')
if 'red' in temp:
summary.append(['File System','Red'])
else:
summary.append(['File System','Green'])
except:
summary.append(['File System','NA'])
# Render the summary as a left-aligned ReportLab table: full grid, blue
# header row with white text.
t=Table(summary,hAlign='LEFT')
GRID_STYLE = TableStyle(
[
('GRID',(0,0),(-1,-1),0.5,colors.black),
('BACKGROUND', (0, 0), (-1, 0), '#2766A8'),
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
]
)
t.setStyle(GRID_STYLE)
# Append the table plus spacing to the document flowables.
Story.append(t)
Story.append(Spacer(1, 12))
I am creating a table using ReportLab.
You can see how it looks in the image below:
I want to highlight the cell based on their values.
For example, Response Times cell would be green if its value is green or red otherwise.
I'm new at this, and can use some guidance on how to achieve this.
Found the answer myself after some more searching stackoverflow questions.
Might help someone else..
In my case, list summary has the heading by default which is ['Metrics','Status']
and I'm appending the rest of the values based on validations like this,
# Quoted from the validation code above: each ResponseTime metric row is
# classified Green (numeric prefix < 1000) or Red and appended to `summary`.
# (Indentation was lost in the paste; bodies below are flush-left.)
for i in output['responsetimes']:
if i['metric'] == 'ResponseTime':
k = i['value'].split(' ')
if int(k[0])<1000:
temp = ['Response Times','Green']
summary.append(temp)
else:
temp = ['Response Times','Red']
summary.append(temp)
So in the end my summary list looks something like this,
[
['Metrics','Status'],
['Response','Green'],
['Metrics2','Red'],
['Metrics4','Green'],
['Metrics3','Red']
]
And now I just need to loop over this summary list and add styles to the already existing GRID_STYLE with the help of TableStyle class like this,
# Base table style: full grid, blue header row with white text.
GRID_STYLE = TableStyle(
[
('GRID',(0,0),(-1,-1),0.5,colors.black),
('BACKGROUND', (0, 0), (-1, 0), '#2766A8'),
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
]
)
# Walk every cell of `summary`; wherever the cell text is exactly "Red" or
# "Green", add a BACKGROUND command targeting that single (column, row) cell.
# (Indentation was lost in the paste; loop bodies below are flush-left.)
for row, values, in enumerate(summary):
for column, value in enumerate(values):
if value == "Red":
GRID_STYLE.add('BACKGROUND', (column, row), (column, row), colors.red)
if value == "Green":
GRID_STYLE.add('BACKGROUND', (column, row), (column, row), colors.green)
t.setStyle(GRID_STYLE)
And voila, now the table looks like this,
I have the following sample DataFrame:
# Sample input: one row per ID with a line-of-business tag and four
# boolean flags used by the %keep() selection logic below.
data = {'ID': ['14','15','16','18','19','20','21'],
'LOB': ["BNK", "BNK", "BNK", "BNK", "XYZ", "XYZ", "XYZ",],
'US_ALL': [False, False, True, True, True, False, True],
'US_50_States': [True, False, True, False, True, False, False],
'Primary': [False, True, True, False, True, False, True],
'Secondary': [True, False, True, False, False, True, True]}
I have the following defined function. My goal is to pass arguments based on the LOB column using conditional results.
def logic_for_bnk(frame=None):
    """Populate the SAS %keep() program-language columns for BNK rows.

    Builds 'Country_State_Logic' and 'Primary_Secondary_Logic' from the
    boolean flag columns via np.select, concatenates them into 'SAS',
    and replaces remaining missing values with "".

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        DataFrame to operate on. Defaults to the module-level ``df`` so
        the original call style ``logic_for_bnk()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        The mutated DataFrame (also mutated in place).
        BUG FIX: the original returned None, which is why
        ``np.where(..., logic_for_bnk(), ...)`` filled BNK rows with None.
    """
    data = df if frame is None else frame
    # Country and State flag logic: one %keep() macro per flag combination.
    country_state_flag_conditions = [
        data['US_ALL'] & data['US_50_States'],
        ~data['US_ALL'] & ~data['US_50_States'],
        data['US_ALL'] & ~data['US_50_States'],
        ~data['US_ALL'] & data['US_50_States'],
    ]
    country_state_flag_values = [
        """%keep(criteria="country = 'US' and states_50 = 1", desc="Keep only US and in 50 states customers");""",
        "",
        """%keep(criteria="country = 'US'",desc="Keep customers in the US");""",
        """%keep(criteria="states_50 = 1", desc="Keep customers in 50 states");"""
    ]
    data['Country_State_Logic'] = np.select(country_state_flag_conditions, country_state_flag_values, None)
    # Primary and Secondary ownership logic.
    primary_secondary_flag_conditions = [
        data['Primary'] & data['Secondary'],
        ~data['Primary'] & ~data['Secondary'],
        data['Primary'] & ~data['Secondary'],
        ~data['Primary'] & data['Secondary']
    ]
    primary_secondary_flag_values = [
        """%keep(criteria="acct_ownership = '1' or acct_ownership = '2'",desc="Keep primary and secondary ownership");""",
        """%keep(criteria="acct_ownership = '1' or acct_ownership = '2'",desc="Keep primary and secondary ownership");""",
        """%keep(criteria="acct_ownership = '1'",desc="Keep primary ownership");""",
        """%keep(criteria="acct_ownership = '2'",desc="Keep secondary ownership");"""
    ]
    data['Primary_Secondary_Logic'] = np.select(primary_secondary_flag_conditions, primary_secondary_flag_values, None)
    # Concatenate both columns into the final SAS program text.
    data['SAS'] = data['Country_State_Logic'].astype(str) + data['Primary_Secondary_Logic'].astype(str)
    # Replace any remaining missing values with an empty string.
    data.fillna("", inplace=True)
    return data
Following the function, I have the following which is where I am having issues. I'm trying to pass the logic_for_bnk() function into the following new column using np.where():
# NOTE(review): np.where evaluates logic_for_bnk() once, eagerly, for the
# whole frame; because the original function returns None, every BNK row
# receives None here — the symptom described below. Using the concatenated
# df['SAS'] column as the np.where choice avoids this.
df['SAS Program Language'] = np.where((df['LOB'] == "BNK"), logic_for_bnk(),
np.where(df['LOB'] == "XYZ", "Pending XYZ Logic",
0))
I want my output to have 3 columns: ID, LOB, and SAS Program Language so I'm then adding the following drop argument to remove excess columns in the DataFrame:
# Drop the intermediate flag/logic columns, keeping ID, LOB and the output.
df.drop(['US_ALL','US_50_States','Primary', 'Secondary','Country_State_Logic','Primary_Secondary_Logic'], axis = 1, inplace = True)
The issue here is that the resulting DataFrame contains 4 columns: ID LOB SAS SAS Program Language.
SAS is coming from the def logic_for_bnk() while SAS Program Language is coming from the new column I'm using along with np.where() arguments.
The SAS Program Language is passing None for BNK=LOB instead of the concatenated df['SAS'] and looks like this:
ID LOB SAS SAS Program Language
0 14 BNK %keep(criteria="states_50 = 1", desc="Keep cus... None
1 15 BNK %keep(criteria="acct_ownership = '1'",desc="Ke... None
2 16 BNK %keep(criteria="country = 'US' and states_50 =... None
3 18 BNK %keep(criteria="country = 'US'",desc="Keep cus... None
4 19 XYZ %keep(criteria="country = 'US' and states_50 =... Pending XYZ Logic
5 20 XYZ %keep(criteria="acct_ownership = '2'",desc="Ke... Pending XYZ Logic
6 21 XYZ %keep(criteria="country = 'US'",desc="Keep cus... Pending XYZ Logic
My goal is for the SAS Program Language column to have the concatenation defined in def logic_for_bnk() where LOB=BNK and have Pending XYZ Logic where LOB=XYZ.
Your function doesn't return anything! Add return df to the last line of your function. Other than that, it seems pd.DataFrame.apply is enough to create your desired output
# Per-row dispatch on LOB, joined back as a new column.
# NOTE(review): the lambda ignores the per-row value beyond the comparison —
# logic_for_bnk() operates on the whole module-level df each time it is
# called, so this runs a full-frame recomputation per BNK row; verify that
# is acceptable for the real data size.
sas_lang = df.LOB.apply(lambda x: logic_for_bnk() if x == 'BNK' else "Pending XYZ Logic")
sas_lang.name = 'SAS Program Language'
new_df = df.join(sas_lang)
Below is the starting code, dataframes and TA indicator. Using plotly to make all my graphs. The dataframes are 'df' and 'd15'. I do have others but will keep it simple for the help.
import yfinance as yf
import plotly.graph_objs as go
# Importing my data: 1 day of 5-minute bars and 3 days of 15-minute bars.
df = yf.download(tickers='EURUSD=X', period='1d', interval='5m')
d15 = yf.download(tickers='EURUSD=X', period='3d',interval='15m')
def Supertrend(df, atr_period, multiplier):
    """Compute the Supertrend indicator for an OHLC DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close' columns.
    atr_period : int
        ATR smoothing period (Wilder-style EWM with alpha=1/period).
    multiplier : float
        Band width as a multiple of the ATR.

    Returns
    -------
    pandas.DataFrame
        Columns 'Supertrend' (bool trend flags), 'Final Lowerband' and
        'Final Upperband' (the band opposite the current trend is NaN),
        indexed like ``df``.
    """
    high = df['High']
    low = df['Low']
    close = df['Close']

    # True range: the largest of (H-L), |H-prevC|, |prevC-L|.
    price_diffs = [high - low,
                   high - close.shift(),
                   close.shift() - low]
    true_range = pd.concat(price_diffs, axis=1).abs().max(axis=1)

    # Default ATR used by the Supertrend indicator (Wilder smoothing).
    atr = true_range.ewm(alpha=1 / atr_period, min_periods=atr_period).mean()

    # HL2 is simply the average of high and low prices.
    hl2 = (high + low) / 2
    final_upperband = hl2 + (multiplier * atr)
    final_lowerband = hl2 - (multiplier * atr)

    # Trend flags, initialised True.
    supertrend = [True] * len(df)
    for curr in range(1, len(df.index)):
        prev = curr - 1
        # BUG FIX: use positional (.iloc) access throughout — `close[curr]`
        # with an integer is label-based and breaks/deprecates on the
        # DatetimeIndex that yfinance returns.
        if close.iloc[curr] > final_upperband.iloc[prev]:
            supertrend[curr] = True           # close crossed above upper band
        elif close.iloc[curr] < final_lowerband.iloc[prev]:
            supertrend[curr] = False          # close crossed below lower band
        else:
            supertrend[curr] = supertrend[prev]   # trend continues
            # Ratchet: while the trend holds, bands may only tighten.
            if supertrend[curr] and final_lowerband.iloc[curr] < final_lowerband.iloc[prev]:
                final_lowerband.iloc[curr] = final_lowerband.iloc[prev]
            if not supertrend[curr] and final_upperband.iloc[curr] > final_upperband.iloc[prev]:
                final_upperband.iloc[curr] = final_upperband.iloc[prev]
        # Hide the band on the opposite side of the current trend.
        if supertrend[curr]:
            final_upperband.iloc[curr] = np.nan
        else:
            final_lowerband.iloc[curr] = np.nan

    return pd.DataFrame({
        'Supertrend': supertrend,
        'Final Lowerband': final_lowerband,
        'Final Upperband': final_upperband
    }, index=df.index)
# Supertrend parameters for the 5-minute chart.
atr_period = 10
atr_multiplier = 6.0
# Re-download the 5m data and attach the indicator columns to df.
df = yf.download(tickers='EURUSD=X', period='1d', interval='5m')
supertrend = Supertrend(df, atr_period, atr_multiplier)
df = df.join(supertrend)
#15 Minute Indicator
# NOTE(review): this is a verbatim redefinition of Supertrend above (it
# simply shadows the first definition) — consider deleting one copy.
def Supertrend(df, atr_period, multiplier):
    """Compute the Supertrend indicator for an OHLC DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'High', 'Low' and 'Close' columns.
    atr_period : int
        ATR smoothing period (Wilder-style EWM with alpha=1/period).
    multiplier : float
        Band width as a multiple of the ATR.

    Returns
    -------
    pandas.DataFrame
        Columns 'Supertrend' (bool trend flags), 'Final Lowerband' and
        'Final Upperband' (the band opposite the current trend is NaN),
        indexed like ``df``.
    """
    high = df['High']
    low = df['Low']
    close = df['Close']

    # True range: the largest of (H-L), |H-prevC|, |prevC-L|.
    price_diffs = [high - low,
                   high - close.shift(),
                   close.shift() - low]
    true_range = pd.concat(price_diffs, axis=1).abs().max(axis=1)

    # Default ATR used by the Supertrend indicator (Wilder smoothing).
    atr = true_range.ewm(alpha=1 / atr_period, min_periods=atr_period).mean()

    # HL2 is simply the average of high and low prices.
    hl2 = (high + low) / 2
    final_upperband = hl2 + (multiplier * atr)
    final_lowerband = hl2 - (multiplier * atr)

    # Trend flags, initialised True.
    supertrend = [True] * len(df)
    for curr in range(1, len(df.index)):
        prev = curr - 1
        # BUG FIX: positional (.iloc) access works for any index type,
        # including yfinance's DatetimeIndex.
        if close.iloc[curr] > final_upperband.iloc[prev]:
            supertrend[curr] = True           # close crossed above upper band
        elif close.iloc[curr] < final_lowerband.iloc[prev]:
            supertrend[curr] = False          # close crossed below lower band
        else:
            supertrend[curr] = supertrend[prev]   # trend continues
            # Ratchet: while the trend holds, bands may only tighten.
            if supertrend[curr] and final_lowerband.iloc[curr] < final_lowerband.iloc[prev]:
                final_lowerband.iloc[curr] = final_lowerband.iloc[prev]
            if not supertrend[curr] and final_upperband.iloc[curr] > final_upperband.iloc[prev]:
                final_upperband.iloc[curr] = final_upperband.iloc[prev]
        # Hide the band on the opposite side of the current trend.
        if supertrend[curr]:
            final_upperband.iloc[curr] = np.nan
        else:
            final_lowerband.iloc[curr] = np.nan

    return pd.DataFrame({
        'Supertrend': supertrend,
        'Final Lowerband': final_lowerband,
        'Final Upperband': final_upperband
    }, index=df.index)
# Supertrend parameters for the 15-minute chart.
atr_period = 10
atr_multiplier = 6.0
# BUG FIX: this "15 Minute Indicator" section previously re-downloaded the
# 5m data and joined the indicator onto `df` again, so `d15` never received
# the 'Final Lowerband'/'Final Upperband' columns that the 15-minute plot
# below reads (a KeyError at plot time). Apply the indicator to `d15`.
supertrend15 = Supertrend(d15, atr_period, atr_multiplier)
d15 = d15.join(supertrend15)
This next part is the plot which I think is where I need the help. I need to add 2 buttons and add these 2 charts to each other?
Button 1: 5m (Shows 05m TF Plot)
Button 2: 15m (Shows 15m TF Plot)
# 5-minute TF plot: candlesticks plus the Supertrend bands.
fig = go.Figure()
fig.add_trace(go.Candlestick(
    x=df.index,
    open=df['Open'],
    high=df['High'],
    low=df['Low'],
    close=df['Close'],
    increasing_line_color='#04b29b',
    decreasing_line_color='#ff2d5d',
    increasing_fillcolor='#04b29b',
    decreasing_fillcolor='#ff2d5d',
    name='EURUSD',
))
# Lower band is drawn while the trend is bullish, upper band while bearish.
fig.add_trace(go.Scatter(x=df.index, y=df['Final Lowerband'],
                         mode='lines', line=dict(color='#04b29b'),
                         name='Bullish'))
fig.add_trace(go.Scatter(x=df.index, y=df['Final Upperband'],
                         mode='lines', line=dict(color='#ff2d5d'),
                         name='Bearish'))
# Dark theme, no range slider, tight margins, no y tick labels.
fig.update_layout(
    xaxis_rangeslider_visible=False,
    plot_bgcolor='black',
    showlegend=False,
    margin=dict(l=10, r=10, t=10, b=10),
    paper_bgcolor='black',
    xaxis=dict(showgrid=False, zerolinecolor='white', color='white'),
    yaxis=dict(showticklabels=False, showgrid=False),
)
fig.update_xaxes(
    rangebreaks=[
        dict(bounds=["sat", "mon"]),  # hide weekends
        # BUG FIX: this comment previously wrapped onto its own source line
        # ("and New Year's"), which was a syntax error inside the list.
        dict(values=["2015-12-25", "2016-01-01"]),  # hide Christmas and New Year's
    ]
)
fig.show()
This is the 15 minute tf
15 Minute TF Plot
# 15-minute TF plot: same styling as the 5-minute chart, sourced from d15.
fig15 = go.Figure()
fig15.add_trace(go.Candlestick(
    x=d15.index,
    open=d15['Open'],
    high=d15['High'],
    low=d15['Low'],
    close=d15['Close'],
    increasing_line_color='#04b29b',
    decreasing_line_color='#ff2d5d',
    increasing_fillcolor='#04b29b',
    decreasing_fillcolor='#ff2d5d',
    name='EURUSD',
))
# Lower band is drawn while the trend is bullish, upper band while bearish.
fig15.add_trace(go.Scatter(x=d15.index, y=d15['Final Lowerband'],
                           mode='lines', line=dict(color='#04b29b'),
                           name='Bullish'))
fig15.add_trace(go.Scatter(x=d15.index, y=d15['Final Upperband'],
                           mode='lines', line=dict(color='#ff2d5d'),
                           name='Bearish'))
# Dark theme, no range slider, tight margins, no y tick labels.
fig15.update_layout(
    xaxis_rangeslider_visible=False,
    plot_bgcolor='black',
    showlegend=False,
    margin=dict(l=10, r=10, t=10, b=10),
    paper_bgcolor='black',
    xaxis=dict(showgrid=False, zerolinecolor='white', color='white'),
    yaxis=dict(showticklabels=False, showgrid=False),
)
fig15.update_xaxes(
    rangebreaks=[
        dict(bounds=["sat", "mon"]),  # hide weekends
        # BUG FIX: this comment previously wrapped onto its own source line
        # ("and New Year's"), which was a syntax error inside the list.
        dict(values=["2015-12-25", "2016-01-01"]),  # hide Christmas and New Year's
    ]
)
fig15.show()
You will get an answer more quickly if you post simplified code, or create a reproducible example with sample data and the graph code. To answer the main question — how to attach a button to each graph — you can use the stock price data from the official reference: draw three different stocks and give each button a show/hide control over the traces. For example, to show only AAPL, set the visibility of the other traces to False. That is simply the only setting needed.
import plotly.graph_objects as go
import plotly.express as px

# Demo: one button per trace, using the bundled stock-price sample data.
df = px.data.stocks()
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['date'], y=df['AAPL'], name='AAPL'))
fig.add_trace(go.Scatter(x=df['date'], y=df['GOOG'], name='GOOG'))
fig.add_trace(go.Scatter(x=df['date'], y=df['AMZN'], name='AMZN'))
# Each button's "visible" list has one entry per trace, in the order the
# traces were added; the "update" method applies both trace and layout args.
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="right",
            active=0,
            x=0.25,
            y=1.2,
            # Idiom fix: plain list literal instead of list([...]).
            buttons=[
                dict(label="All",
                     method="update",
                     args=[{"visible": [True, True, True]},
                           {"title": "All"}]),
                dict(label="AAPL",
                     method="update",
                     args=[{"visible": [True, False, False]},
                           {"title": "AAPL"}]),
                dict(label="GOOG",
                     method="update",
                     args=[{"visible": [False, True, False]},
                           {"title": "GOOG"}]),
                dict(label="AMZN",
                     method="update",
                     args=[{"visible": [False, False, True]},
                           {"title": "AMZN"}]),
            ],
        )
    ]
)
fig.show()
When I make a graph in plotly, my first view ("All") works great: it has the 5 categories and, on hover, they add up to 100% (first graph). The problems start with the second view. When I click the "20" button, the chart shows the 5 categories corresponding to the 20s age group, but it also includes some categories from the 30s age group (second graph). How can I set it up so that each button shows only the 5 categories for its corresponding age group?
Entire Code:
# Load the patient data, parsing the date columns, and drop rows missing
# the fields used below (age and infection_case).
p_info = pd.read_csv('PatientInfo.csv',parse_dates=['symptom_onset_date', 'confirmed_date', 'released_date', 'deceased_date'])
p_info = p_info.dropna(subset=['age'])
p_info = p_info.dropna(subset=['infection_case'])
# Group the raw infection_case values into coarse categories for analysis.
def group(x):
    """Map a raw ``infection_case`` string to a coarse infection-type label.

    Match order is preserved from the original chain: church-related
    markers first, then 'etc', then direct patient contact, then overseas
    inflow; anything else falls through to 'Group'.
    """
    # All church-related case names collapse to one category.
    church_markers = (
        "Shincheonji Church",
        "Onchun Church",
        "Dongan Church",
        "Geochang Church",
        "SMR Newly Planted Churches Group",
        "Pilgrimage to Israel",
        "River of Grace Community Church",
        "Biblical Language study meeting",
    )
    if any(marker in x for marker in church_markers):
        return "Church"
    if "etc" in x:
        return "etc"
    if "contact with patient" in x:
        return "Direct Contact with Patient"
    if "overseas inflow" in x:
        return "Overseas"
    return "Group"
# Apply the categoriser and bucket confirmed cases by ISO week.
p_info['infection_type'] = p_info['infection_case'].apply(group)
# NOTE(review): Series.dt.weekofyear is deprecated in modern pandas;
# dt.isocalendar().week is the replacement (note: its dtype is UInt32).
p_info['week'] = p_info['confirmed_date'].dt.weekofyear
# Count cases per (week, infection_type); absent combinations become 0.
type_by_time = p_info.groupby(['week', 'infection_type']).size().unstack().fillna(0)
# type_by_time = type_by_time.div(type_by_time.sum(axis=1), axis=0) * 100
type_by_time
# Build one weekly table per age bracket.
type_by_time_age = []
# df = PatientRoute.merge(PatientInfo, on='patient_id')
df = p_info
# NOTE(review): `age_group` is never used inside this loop, so every entry
# appended below is the identical, unfiltered table — the likely cause of
# the mixed-category traces described in the question. Presumably the
# intent was to filter (e.g. df[df['age'] == age_group + 's']) before
# grouping — confirm against the dataset's actual 'age' values.
# (Pasted with indentation lost; the loop body below is flush-left.)
for age_group in ['20','30','40','50','60','70','80']:
new_type_by_time = df.groupby(['week', 'infection_type']).size().unstack().fillna(0)
type_by_time_age.append(new_type_by_time)
colors = px.colors.qualitative.Light24
x = type_by_time.index.tolist()
categories = ['Church', 'Direct Contact with Patient', 'Group', 'Overseas', 'etc']

fig = go.Figure()
# Trace group 0: all ages combined (visible by default) — one stacked,
# percent-normalised trace per category.
for i, cat in enumerate(categories):
    fig.add_trace(go.Scatter(x=x, y=type_by_time[cat],
                             hoverinfo='x+y',
                             mode='lines',
                             line=dict(width=0.5, color=colors[i]),
                             name=cat,
                             stackgroup='one',
                             groupnorm='percent'))

# Trace groups 1..7: five traces per age bracket, hidden initially.
# (Loop variable renamed from `df` so it no longer clobbers the global df.)
for age, frame in enumerate(type_by_time_age):
    for i, cat in enumerate(categories):
        fig.add_trace(go.Scatter(x=frame.index.tolist(), y=frame[cat],
                                 hoverinfo='y',
                                 mode='lines',
                                 line=dict(width=0.5, color=colors[i]),
                                 name=cat,
                                 # stackgroup must be distinct per age group
                                 # and truthy — the bare integer 0 used
                                 # before is falsy and disabled stacking
                                 # for the first age bracket.
                                 stackgroup=f'age{age}',
                                 groupnorm='percent',
                                 visible=False))

fig.update_layout(
    title='Where do most people get infected?',
    showlegend=True,
    xaxis=dict(range=[4, 19], ticksuffix=' week'),
    yaxis=dict(type='linear', range=[1, 100], ticksuffix='%'))

# One button per view. There are 8 groups ('All' + 7 age brackets) of
# exactly len(categories) == 5 traces each.
# BUG FIX: the mask previously assumed 9 traces per group
# ([False]*i*9 + [True]*9 + ...), so each button also revealed traces
# belonging to the neighbouring age group — the reported symptom.
traces_per_group = len(categories)
group_names = ['All', '20', '30', '40', '50', '60', '70', '80']
menus = []
for i, name in enumerate(group_names):
    visible = [False] * (len(group_names) * traces_per_group)
    visible[i * traces_per_group:(i + 1) * traces_per_group] = [True] * traces_per_group
    menus.append(dict(
        label=name,
        method="update",
        args=[{"visible": visible},
              {"title": f"Where do most people get infected? (Age: {name})"}]))

fig.update_layout(
    updatemenus=[
        dict(type="buttons", direction="right", active=0,
             x=1, y=1.2, buttons=menus)
    ],
    xaxis_title="weeks",
    yaxis_title="% in group of people who get infected",
)
fig.show()
Here is the link to the kaggle dataset: https://www.kaggle.com/kimjihoo/coronavirusdataset
With this dataset:
start,end,rms,state,maxTemp,minTemp
2019-02-20T16:16:31.752Z,2019-02-20T17:33:34.750Z,4.588481,charge,35.0,32.0
2019-02-20T17:33:34.935Z,2019-02-20T18:34:49.737Z,5.770562,discharge,35.0,33.0
And this:
[{"EventDate":"2019-02-02T16:17:00.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:18:01.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:19:02.581Z","Value":"23"},
{"EventDate":"2019-02-02T16:20:03.679Z","Value":"23"},
{"EventDate":"2019-02-02T16:21:04.684Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:05.693Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:06.694Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:07.698Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:08.835Z","Value":"23"}]
# PySpark schema for the JSON temperature events.
# NOTE(review): Value arrives as a JSON string (e.g. "23"); it is cast to
# float elsewhere before aggregation, so FloatType here relies on that cast.
schema = StructType([
StructField('EventDate', TimestampType(), True),
StructField('Value', FloatType(), True)
])
I want to add max and min values of the json dataset as columns into the csv dataset.
I have tried:
# NOTE(review): this raises AnalysisException because the filter applied to
# `jsondata` references columns (start/end) that belong to `csvDf` — Spark
# cannot resolve columns across two unrelated DataFrames inside one plan.
# A range join (csvDf.join(jsondata, on a between-condition) followed by
# groupBy/agg) is the standard fix.
cyclesWithValues = csvDf\
.withColumn("max", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "max"}).head()["max(Value)"])\
.withColumn("min", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "min"}).head()["min(Value)"])
But I get this error:
AnalysisException: 'Resolved attribute(s) start#38271,end#38272 missing from EventDate#38283,Value#38286 in operator !Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272)).;;\n!Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272))\n+- Project [EventDate#38283, cast(Value#38280 as float) AS Value#38286]\n +- Project [to_timestamp(EventDate#38279, None) AS EventDate#38283, Value#38280]\n +- Relation[EventDate#38279,Value#38280] json\n'
I have a solution based on arrays, but it seems very slow, so I was hoping something like this would speed things up a bit.
Right now I am using this solution:
# Load the JSON temperature events and normalise the column types.
dfTemperature = spark.read.option("multiline", "true").json("path")
dfTemperatureCast = dfTemperature.withColumn("EventDate", to_timestamp(dfTemperature.EventDate)).withColumn("Value", dfTemperature.Value.cast('float'))
# For one CSV row, select the temperature events inside [start, end] and
# return the row extended with their max/min temperature.
# NOTE(review): each call launches two Spark jobs (one per agg); driving
# this row-at-a-time from a ThreadPool is why it is slow — a range join
# would push the work into Spark itself.
# (Pasted with indentation lost; the function body below is flush-left.)
def AddVAluesToDf(row):
temperatures = dfTemperatureCast.filter((col("EventDate") >= row["start"]) & (col("EventDate") <= row["end"]))
maxTemp = temperatures.agg({"value": "max"}).head()["max(value)"]
minTemp = temperatures.agg({"value": "min"}).head()["min(value)"]
return (row.start, row.end, row.rms, row.state, maxTemp, minTemp)
pool = ThreadPool(10)
withValues = pool.map(AddVAluesToDf, rmsDf)
# NOTE(review): the tuples above carry 6 fields (start, end, rms, state,
# maxTemp, minTemp) but this schema declares only 4 — createDataFrame will
# fail unless rms/state are added to the schema. Confirm the intended shape.
schema = StructType([
StructField('start', TimestampType(), True),
StructField('end', TimestampType(), True),
StructField('maxTemp', FloatType(), True),
StructField('minTemp', FloatType(), True)
])
cyclesDF = spark.createDataFrame(withValues, schema)