How to improve the speed of slicing an interactive dataframe - Python

I have a dataframe that is imported from a csv/excel file. The file contains simulation data in which variables are swept, and different versions of the file sweep different variables: sometimes a, c, e; other times b, c, d; etc.
Currently, when slicing the interactive dataframe idf (idf = df.interactive()), I manually change the code based on prior knowledge of which variables are swept, slicing on all the columns in one step with the "&" operator. That version takes 7.6 seconds in the example below. However, if I detect which variables are swept and slice the dataframe in a loop, the execution time increases a lot (382 seconds).
I wonder whether there is a syntax that enjoys both the fast speed and automatic detection of which variables are swept.
import itertools
import time

import pandas as pd
import numpy as np
import panel as pn
pn.extension('tabulator')
import hvplot.pandas
import holoviews as hv
list_alphabet = ['a','b','c','d','e']
list_year = [2000,2001,2002,2003,2004,2005]
list_country = ['US','BE','NL','CN']
list_color = ['white','blue','red','green','black']
list_fruit = ['apple','banana','pear','peach']
list_price = ['high','medium','low']
list_currency = ['USD','EUR','CNY','CAD']
list_transport = ['walking','biking','car','bus','train','tram','plane']
list_age = [10,20,30,40,50,60,70]
dict_lst = {'alphabet': list_alphabet, 'year': list_year, 'country': list_country, 'color': list_color, 'fruit': list_fruit,
            'price': list_price, 'currency': list_currency, 'transport': list_transport, 'age': list_age}
list_itertools = [dict_lst[x] for x in dict_lst.keys()]
df = pd.DataFrame(list(itertools.product(*list_itertools)), columns=[x for x in dict_lst.keys()])
df['value'] = df['year']+10*df['age']
if 'data' not in pn.state.cache.keys():
    pn.state.cache['data'] = df.copy()
else:
    df = pn.state.cache['data']
dict_selector = {}
# lst_possible_swept_variables is not defined in the post; assumed here to be the candidate swept columns
lst_possible_swept_variables = list(dict_lst.keys())
for selector_name in lst_possible_swept_variables:
    if selector_name in df.columns:
        selector = pn.widgets.Select(name=selector_name, options=dict_lst[selector_name])
        dict_selector[selector_name] = selector
idf = df.interactive()
# fast-execution but hard-coded
start_time = time.time()
df_pipeline = (
    idf[
        (idf.alphabet == dict_selector['alphabet']) &
        (idf.year == dict_selector['year']) &
        (idf.country == dict_selector['country']) &
        (idf.color == dict_selector['color']) &
        (idf.fruit == dict_selector['fruit']) &
        (idf.price == dict_selector['price']) &
        (idf.currency == dict_selector['currency']) &
        (idf.transport == dict_selector['transport']) &
        (idf.age == dict_selector['age'])
    ]
    .groupby(['alphabet', 'year', 'country', 'color', 'fruit', 'price', 'currency', 'transport', 'age'])['value']
    .mean().to_frame().reset_index().sort_values(by='age').reset_index(drop=True)
)
print("--- %s seconds ---" % (time.time() - start_time))
## --- 7.60701847076416 seconds ---
# self-adjusting code but executed much slower
start_time = time.time()
idf_slice = idf
for key_name in dict_selector.keys():
    idf_slice = idf_slice[idf_slice[key_name] == dict_selector[key_name]]
df_pipeline_slice = (
    idf_slice.groupby(list(dict_selector.keys()))['value']
    .mean().to_frame().reset_index().sort_values(by='age').reset_index(drop=True)
)
print("--- %s seconds ---" % (time.time() - start_time))
## --- 381.84749722480774 seconds ---
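One option worth trying (a sketch, not verified against hvplot's .interactive() internals): detect the swept columns from dict_selector as in the slow version, but fold the per-column comparisons into a single mask with functools.reduce and operator.and_, so the slice is still applied in one step exactly like the hard-coded version.
import functools
import operator

# Build the same chained "&" expression as the hand-written fast version,
# but over whichever selectors were detected automatically.
masks = [idf[name] == widget for name, widget in dict_selector.items()]
combined_mask = functools.reduce(operator.and_, masks)

df_pipeline_auto = (
    idf[combined_mask]
    .groupby(list(dict_selector.keys()))['value']
    .mean().to_frame().reset_index().sort_values(by='age').reset_index(drop=True)
)
Since the filter is a single expression, the resulting pipeline should match the hard-coded one; whether it also matches its speed depends on how hvplot handles the generated expression, so it is worth timing on the real data.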

Related

Export data to gsheet workbooks / worksheets after looping through a script 10 times

I have the following script I'm running to get data from Google's pagespeed insights tool via API:
from datetime import datetime
from urllib import request
import requests
import pandas as pd
import numpy as np
import re
from os import truncate
import xlsxwriter
import time
import pygsheets
import pickle
domain_strip = 'https://www.example.co.uk'
gc = pygsheets.authorize(service_file='myservicefile.json')
API = "myapikey"
strat = "mobile"
def RunCWV():
with open('example_urls_feb_23.txt') as pagespeedurls:
content = pagespeedurls.readlines()
content = [line.rstrip('\n') for line in content]
#Dataframes
dfCWV2 = pd.DataFrame({'Page':[],'Overall Performance Score':[],'FCP (seconds) CRUX':[],'FCP (seconds) Lab':[],'FID (seconds)':[],'Max Potential FID (seconds)':[],'LCP (seconds) CRUX':[],'LCP (seconds) Lab':[],'LCP Status':[],'CLS Score CRUX':[],'Page CLS Score Lab':[],'CLS Status':[],'Speed Index':[],'Uses Efficient Cache Policy?':[],'Landing Page':[]})
dfCLSPath2 = pd.DataFrame({'Page':[],'Path':[],'Selector':[],'Node Label':[],'Element CLS Score':[],'Landing Page':[],'large_uid':[]})
dfUnsizedImages2 = pd.DataFrame({'Page':[],'Image URL':[],'Landing Page':[],'unsized_uid':[]})
dfNCAnim2 = pd.DataFrame({'Page':[],'Animation':[],'Failure Reason':[],'Landing Page':[]})
dfLCP_Overview = pd.DataFrame({'Page':[],'Preload LCP Savings (seconds)':[],'Resize Images Savings (seconds)':[],'Text Compression Savings (seconds)':[],'Preload Key Requests Savings (seconds)':[],'Preconnect Savings (seconds)':[],'Unused CSS Savings (seconds)':[],'Unused JS Savings (seconds)':[],'Unminified CSS Savings (seconds)':[],'Unminified JS Savings (seconds)':[],'Efficiently Animated Content Savings':[],'Landing Page':[]})
dfLCPOb2 = pd.DataFrame({'Page':[],'LCP Tag':[],'LCP Tag Type':[],'LCP Image Preloaded?':[],'Wasted Seconds':[],'Action':[],'Landing Page':[]})
dfresize_img = pd.DataFrame({'Page':[],'Image URL':[],'Total Bytes':[],'Wasted Bytes':[],'Overall Savings (seconds)':[],'Action':[],'Landing Page':[]})
dfFontDisplay2 = pd.DataFrame({'Page':[],'Resource':[],'Font Display Utilised?':[],'Wasted Seconds':[],'Action':[],'Landing Page':[]})
dfTotalBW2 = pd.DataFrame({'Page':[],'Total Byte Weight of Page':[],'Large Network Payloads?':[],'Resource':[],'Total KB':[],'Landing Page':[]})
dfRelPreload2 = pd.DataFrame({'Page':[],'Resource':[],'Wasted Seconds':[],'Landing Page':[]})
dfRelPreconnect2 = pd.DataFrame({'Page':[],'Resource':[],'Wasted Ms':[],'Passed Audit':[],'Landing Page':[]})
dfTextCompression2 = pd.DataFrame({'Page':[],'Text Compression Optimal?':[],'Action':[],'Savings':[],'Landing Page':[]})
dfUnusedCSS2 = pd.DataFrame({'Page':[],'CSS File':[],'Unused CSS Savings KiB':[],'Unused CSS Savings (seconds)':[],'Wasted %':[],'Landing Page':[]})
dfUnusedJS2 = pd.DataFrame({'Page':[],'JS File':[],'Unused JS Savings (seconds)':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
dfUnminCSS2 = pd.DataFrame({'Page':[],'CSS File':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
dfUnminJS2 = pd.DataFrame({'Page':[],'JS File':[],'Total Bytes':[],'Wasted Bytes':[],'Wasted %':[],'Landing Page':[]})
dfCritRC2 = pd.DataFrame({'Page':[],'Resource':[],'Start Time':[],'End Time':[],'Total Time':[],'Transfer Size':[],'Landing Page':[]})
dfAnimContent2 = pd.DataFrame({'Page':[],'Efficient Animated Content?':[],'Resource':[],'Total Bytes':[],'Wasted Bytes':[],'Landing Page':[]})
dfSRT2 = pd.DataFrame({'Page':[],'Passed Audit?':[],'Server Response Time ms':[],'Server Response Time Savings':[],'Landing Page':[]})
dfRedirects2 = pd.DataFrame({'Page':[],'Redirects':[],'Wasted ms':[],'Landing Page':[]})
dfFID_Summary2 = pd.DataFrame({'Page':[],'FID (seconds)':[],'Total Blocking Time (seconds)':[],'FID Rating':[],'Total Tasks':[],'Total Task Time of Page (seconds)':[],'Tasks over 50ms':[],'Tasks over 100ms':[],'Tasks over 500ms':[],'3rd Party Total Wasted Seconds':[],'Bootup Time (seconds)':[],'Number of Dom Elements':[],'Mainthread work Total Seconds':[],'Duplicate JS Savings (Seconds)':[],'Legacy JS Savings (seconds)':[],'Landing Page':[]})
dflongTasks2 = pd.DataFrame({'Page':[],'Task':[],'Task Duration Seconds':[],'Total Tasks':[],'Total Task Time of Page (seconds)':[],'Tasks over 50ms':[],'Tasks over 100ms':[],'Tasks over 500ms':[],'Landing Page':[]})
dfthirdP2 = pd.DataFrame({'Page':[],'3rd Party Total wasted Seconds':[],'3rd Party Total Blocking Time (seconds)':[],'3rd Party Resource Name':[],'Landing Page':[]})
dfbootup2 = pd.DataFrame({'Page':[],'Page Bootup Time Score':[],'Resource':[],'Time spent Parsing / Compiling Ms':[]})
dfthread2 = pd.DataFrame({'Page':[],'Score':[],'Mainthread work total seconds':[],'Mainthread work Process Type':[],'Duration (Seconds)':[],'Landing Page':[]})
dfDOM2 = pd.DataFrame({'Page':[],'Dom Size Score':[],'DOM Stat':[],'DOM Value':[],'Landing Page':[],})
dfdupJS2 = pd.DataFrame({'Page':[],'Score':[],'Audit Status':[],'Duplicate JS Savings (seconds)':[], 'Landing Page':[]})
dflegacyJS2 = pd.DataFrame({'Page':[],'Audit Status':[],'Legacy JS Savings (seconds)':[],'JS File of Legacy Script':[],'Wasted Bytes':[],'Landing Page':[]})
#Run PSI
for line in content:
x = f'https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={line}&strategy={strat}&key={API}'
print(f'Running CWV Audit on {strat} from: {line} - Please Wait...')
r = requests.get(x)
data = r.json()
line_stripped = line
if domain_strip in line:
line_stripped = line_stripped.replace(domain_strip, '')
else:
pass
#CWV Overview
try:
op_score = data["lighthouseResult"]["categories"]["performance"]["score"] * 100
fcp_score_CRUX = data["loadingExperience"]["metrics"]["FIRST_CONTENTFUL_PAINT_MS"]["percentile"] / 1000
fcp_score_LAB = data["lighthouseResult"]["audits"]["first-contentful-paint"]["numericValue"] / 1000
fid_score = data["loadingExperience"]["metrics"]["FIRST_INPUT_DELAY_MS"]["percentile"] / 1000
Max_P_FID = data["lighthouseResult"]["audits"]["max-potential-fid"]["numericValue"] / 1000
lcp_score_CRUX_ms = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"]
lcp_score_CRUX = data["loadingExperience"]["metrics"]["LARGEST_CONTENTFUL_PAINT_MS"]["percentile"] / 1000
lcp_score_LAB = data["lighthouseResult"]["audits"]["first-contentful-paint"]["numericValue"] / 1000
cls_score_Sitewide = data["loadingExperience"]["metrics"]["CUMULATIVE_LAYOUT_SHIFT_SCORE"]["percentile"] / 100
cls_score_Page_mult = data["lighthouseResult"]["audits"]["cumulative-layout-shift"]["numericValue"] * 1000
cls_score_Page = data["lighthouseResult"]["audits"]["cumulative-layout-shift"]["numericValue"]
speed_index = data["lighthouseResult"]["audits"]["speed-index"]["numericValue"] / 1000
efficient_cache = data["lighthouseResult"]["audits"]["uses-long-cache-ttl"]["score"]
if efficient_cache == 1:
efficient_cache = "Yes"
else:
efficient_cache = "No"
lcp_status = lcp_score_CRUX_ms
if lcp_score_CRUX_ms <=2500:
lcp_status = "Good"
elif lcp_score_CRUX_ms in range (2501, 4000):
lcp_status = "Needs Improvement"
else:
lcp_status = "Poor"
cls_status = cls_score_Page_mult
if cls_score_Page_mult <=100:
cls_status = "Good"
elif cls_score_Page_mult in range (101,150):
cls_status = "Needs Improvement"
else:
cls_status = "Poor"
new_row = pd.DataFrame({'Page':line_stripped,'Overall Performance Score':op_score, 'FCP (seconds) CRUX':round(fcp_score_CRUX,4),'FCP (seconds) Lab':round(fcp_score_LAB,4), 'FID (seconds)':round(fid_score,4),
'Max Potential FID (seconds)':round(Max_P_FID,4), 'LCP (seconds) CRUX':round(lcp_score_CRUX,4),'LCP (seconds) Lab':round(lcp_score_LAB,4), 'LCP Status':lcp_status, 'CLS Score CRUX':round(cls_score_Sitewide,4),
'Page CLS Score Lab':round(cls_score_Page,4),'CLS Status':cls_status,'Speed Index':round(speed_index,4),'Uses Efficient Cache Policy?':efficient_cache, 'Landing Page':line_stripped}, index=[0])
dfCWV2 = pd.concat([dfCWV2, new_row], ignore_index=True) #, ignore_index=True
except KeyError:
print(f'<KeyError> CWV Summary One or more keys not found {line}.')
except TypeError:
print(f'TypeError on {line}.')
print ('CWV Summary')
print (dfCWV2)
#Export to GSheets line by line
sh = gc.open('CWV Overview AWP - example Feb 2023')
worksheet = sh.worksheet_by_title('CWV')
df_worksheet = worksheet.get_as_df()
result = pd.concat([df_worksheet, dfCWV2], ignore_index=True)
result=result.drop_duplicates(keep='last')
worksheet.set_dataframe(result, 'A1')
# #End test
#CLS
#Large Shifts
try:
for x in range (len(data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"])):
path = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["path"]
selector = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["selector"]
nodeLabel = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["node"]["nodeLabel"]
score = data["lighthouseResult"]["audits"]["layout-shift-elements"]["details"]["items"][x]["score"]
i = 1
new_row = pd.DataFrame({'Page':line_stripped, 'Path':path, 'Selector':selector, 'Node Label':nodeLabel,'Element CLS Score':round(score,4), 'Landing Page':line_stripped, 'large_uid':i}, index=[0])
dfCLSPath2 = pd.concat([dfCLSPath2, new_row], ignore_index=True)
except KeyError:
print(f'<KeyError> Layout Shift Elements - One or more keys not found {line}.')
except TypeError:
print(f'TypeError on {line}.')
print ('Large Shifts')
print (dfCLSPath2)
sh = gc.open('CLS Audit AWP - example Feb 2023')
worksheet = sh.worksheet_by_title('Large CLS Elements')
df_worksheet = worksheet.get_as_df()
result = pd.concat([df_worksheet, dfCLSPath2], ignore_index=True)
result=result.drop_duplicates(keep='last')
worksheet.set_dataframe(result, 'A1')
#Unsized Images
try:
for x in range (len(data["lighthouseResult"]["audits"]["unsized-images"]["details"]["items"])):
unsized_url = data["lighthouseResult"]["audits"]["unsized-images"]["details"]["items"][x]["url"]
i = 1
new_row = pd.DataFrame({'Page':line_stripped, 'Image URL':unsized_url, 'Landing Page':line_stripped, 'unsized_uid':i}, index=[0])
dfUnsizedImages2 = pd.concat([dfUnsizedImages2, new_row], ignore_index=True)
except KeyError:
print(f'<KeyError> Unsized Images One or more keys not found {line}.')
except TypeError:
print(f'TypeError on {line}.')
print ('Unsized Images')
print(dfUnsizedImages2)
sh = gc.open('CLS Audit AWP - example Feb 2023')
worksheet = sh.worksheet_by_title('Unsized Images')
df_worksheet = worksheet.get_as_df()
result = pd.concat([df_worksheet, dfUnsizedImages2], ignore_index=True)
result=result.drop_duplicates(keep='last')
worksheet.set_dataframe(result, 'A1')
I've only included the first few try blocks as the script is very long. Essentially I want to do the same as I have here, but rather than exporting the results from the dataframes after every URL has run, I want to export them, say, every 10 URLs (or more). I have around 4000 URLs in total and I need to capture the audit results for every one of them.
I used to have the script export to Google Sheets once at the end, after every loop had finished, but the script always crashed before it got through every URL I'm auditing, which is why I set it up as above to export line by line. That is extremely slow though, taking over two weeks to run through all the URLs in my text file, so I want to speed it up by exporting only every 10 URLs' worth of data at a time. That way, if the script crashes, I've lost at most the last 10 URLs.
I tried setting a counter on each of the export blocks:
results = []
results_to_export = []
for i in range(10):
    counter = 0
    while counter < 5000:
        print("Starting loop iteration")
        results.append(dfCWV2)
        counter += 1
    if counter % 10 == 0:
        print("Running after 10 loops")
        result = pd.concat(results, ignore_index=True)
        result = result.drop_duplicates(keep='last')
        # add results to export list
        results_to_export.append(result)
        if results_to_export:
            sh = gc.open('CWV Overview AWP - example Feb 2023')
            worksheet = sh.worksheet_by_title('CWV')
            combined_results = pd.concat(results_to_export, ignore_index=True)
            worksheet.set_dataframe(combined_results, 'A1')
            results_to_export.clear()
    results = []
But this just kept looping through the while loop without moving on to the next try block or raising any errors (I tried every version of unindenting the if statements too, but nothing worked).
Please help!
A shorter program would be more likely to get an expert answer
It may be a long time until you find somebody stumbling on to this page who is willing to read so much text, and who knows how to solve the problem. To improve your chances, it is best to trim your program to the absolute smallest size that allows the problem to manifest.
Is your if counter statement not indented enough?
Currently you have:
results = []
results_to_export = []
for i in range(10):
    counter = 0
    while counter < 5000:
        # your other code here
        print("Starting loop iteration")
        results.append(dfCWV2)
        counter += 1
    if counter % 10 == 0:
        print("Running after 10 loops")
But the if counter check, positioned where it is in the above code, will only be reached after the while loop has completed its 5000 iterations.
Did you mean this?
results = []
results_to_export = []
for i in range(10):
    counter = 0
    while counter < 5000:
        # your other code here
        print("Starting loop iteration")
        results.append(dfCWV2)
        counter += 1
        if counter % 10 == 0:
            print("Running after 10 loops")

Python multiprocessing multiple iterations

I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores. I want to iterate through a table of 12 million rows and, for each of these rows, iterate through several time steps doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass both the function's arguments and the iterator.
I would appreciate any ideas; I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u

# add one column per time
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # B
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list

def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output

if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of the pc table
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head)
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool

n_cpu = 10  # put in the number of threads of your CPU
with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)

pc2 = pd.DataFrame(out).reset_index(drop=True)
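A small follow-up on the imap_unordered approach (an assumption on my part, not something claimed above): with 12 million rows, per-row inter-process overhead can dominate, so passing a chunksize so that rows are shipped to the workers in batches may help.
# Same pipeline as above, but rows are sent to the workers in batches of 1000
# (chunksize is a standard argument of Pool.imap_unordered; the best value is
# data-dependent and the 1000 here is only a placeholder).
out = []
for row in p.imap_unordered(rowiteration, (r for _, r in pc.iterrows()), chunksize=1000):
    row.index = cols
    out.append(row)
pc2 = pd.DataFrame(out).reset_index(drop=True)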

How to implement dependant columns in hypothesis dataframes

I am using hypothesis dataframes to implement a dataframe in which start_time and end_time are two columns. Here is a chunk:
import hypothesis.strategies as st
import logging
import datetime
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes

current_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

datetime_st = st.integers(
    min_value=(current_time + datetime.timedelta(hours=4)).timestamp(),
    max_value=(current_time + datetime.timedelta(hours=20)).timestamp(),
)

df_columns = {
    # other fields omitted
    "start_time": {"elements": datetime_st, "unique": False},
    "end_time": {"elements": datetime_st, "unique": False},
}

test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=[column(key, **value) for key, value in df_columns.items()],
)

@given(df=test_dfs)
def test_hyothesis(df):
    logging.info(df)
    assert 1
I am not able to find a way to enforce that each end_time is greater than its corresponding start_time by at least some delta. I have tried composite, but I am not sure how to apply it to each row of the dataframe.
Is there a way to enforce the delta as a rule when initialising start_time and end_time?
Here's a way to generate a dataframe of two timestamp columns, where the second one is at least 3600 seconds (or some other amount of time) after the first one. I'm using flatmap for that.
import datetime
import logging

import hypothesis.strategies as st
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes, columns

current_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp()

MIN_DIFF_SECONDS = 3600

two_timestamps_with_diff = st.integers(
    min_value=current_time + 3600 * 4,
    max_value=current_time + 4600 * 20).flatmap(
        lambda n: st.tuples(
            st.integers(min_value=n, max_value=n),
            st.integers(min_value=n + MIN_DIFF_SECONDS, max_value=n + 3600 * 10)
        ))

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y - x)

test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=columns(["start_time", "end_time"], dtype=int),
    rows=two_timestamps_with_diff,
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d = res.end_time - res.start_time)

# a test with an assertion that validates this constraint
@given(df=test_dfs)
def test_hyothesis(df):
    logging.info(df)
    assert ((df.end_time - df.start_time) >= MIN_DIFF_SECONDS).all()

# run the test. It passes.
test_hyothesis()
If you'd like to add additional columns to the autogenerated dataframe, do the following (the new columns are 'A' and 'B' in this example):
import pandas as pd
from hypothesis.strategies import composite

@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    df = draw(test_dfs)

    class GetIndex(st.SearchStrategy[pd.core.indexes.range.RangeIndex]):
        def do_draw(self, _):
            return df.index

    more_col_strategy = data_frames([column('A', dtype=int),
                                     column('B', dtype=float)],
                                    index=GetIndex())
    more_cols = draw(more_col_strategy)
    return pd.concat([df, more_cols], axis=1)

test_df_with_additional_columns().example()

Getting "IndentationError: unexpected indent" in runHeartBreathRateKraskov.py code?

I am running the code below (runHeartBreathRateKraskov) and I am facing the error shown below.
I want to run this code to calculate transfer entropy in the runHeartBreathRateKraskov program. I am new to this and do not have much knowledge about transfer entropy and mutual information. I have also attached my dataset for information.
from jpype import *
^
IndentationError: unexpected indent
# Run e.g. python runHeartBreathRateKraskov.py 2 2 1,2,3,4,5,6,7,8,9,10
from jpype import *
import sys
import os
import random
import math
import string
import numpy
# Import our readFloatsFile utility in the above directory:
sys.path.append(os.path.relpath(".."))
import readFloatsFile
# Change location of jar to match yours:
#jarLocation = "../../../infodynamics.jar"
jarLocation = "/home/humair/Documents/Transfer Entropy/infodynamics-dist-1.5/infodynamics.jar"
# Start the JVM (add the "-Xmx" option with say 1024M if you get crashes due to not enough memory space)
startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=" + jarLocation)
# Read in the command line arguments and assign default if required.
# first argument in argv is the filename, so program arguments start from index 1.
if (len(sys.argv) < 2):
kHistory = 1;
else:
kHistory = int(sys.argv[1]);
if (len(sys.argv) < 3):
lHistory = 1;
else:
lHistory = int(sys.argv[2]);
if (len(sys.argv) < 4):
knns = [4];
else:
knnsStrings = sys.argv[3].split(",");
knns = [int(i) for i in knnsStrings]
if (len(sys.argv) < 5):
numSurrogates = 0;
else:
numSurrogates = int(sys.argv[4]);
# Read in the data
datafile = '/home/humair/Documents/Transfer Entropy/SFI-heartRate_breathVol_bloodOx.txt'
rawData = readFloatsFile.readFloatsFile(datafile)
# As numpy array:
data = numpy.array(rawData)
# Heart rate is first column, and we restrict to the samples that Schreiber mentions (2350:3550)
heart = data[2349:3550,0]; # Extracts what Matlab does with 2350:3550 argument there.
# Chest vol is second column
chestVol = data[2349:3550,1];
# bloodOx = data[2349:3550,2];
timeSteps = len(heart);
print("TE for heart rate <-> breath rate for Kraskov estimation with %d samples:" % timeSteps);
# Using a KSG estimator for TE is the least biased way to run this:
teCalcClass = JPackage("infodynamics.measures.continuous.kraskov").TransferEntropyCalculatorKraskov
teCalc = teCalcClass();
teHeartToBreath = [];
teBreathToHeart = [];
for knnIndex in range(len(knns)):
knn = knns[knnIndex];
# Compute a TE value for knn nearest neighbours
# Perform calculation for heart -> breath (lag 1)
teCalc.initialise(kHistory,1,lHistory,1,1);
teCalc.setProperty("k", str(knn));
teCalc.setObservations(JArray(JDouble, 1)(heart),
JArray(JDouble, 1)(chestVol));
teHeartToBreath.append( teCalc.computeAverageLocalOfObservations() );
if (numSurrogates > 0):
teHeartToBreathNullDist = teCalc.computeSignificance(numSurrogates);
teHeartToBreathNullMean = teHeartToBreathNullDist.getMeanOfDistribution();
teHeartToBreathNullStd = teHeartToBreathNullDist.getStdOfDistribution();
# Perform calculation for breath -> heart (lag 1)
teCalc.initialise(kHistory,1,lHistory,1,1);
teCalc.setProperty("k", str(knn));
teCalc.setObservations(JArray(JDouble, 1)(chestVol),
JArray(JDouble, 1)(heart));
teBreathToHeart.append( teCalc.computeAverageLocalOfObservations() );
if (numSurrogates > 0):
teBreathToHeartNullDist = teCalc.computeSignificance(numSurrogates);
teBreathToHeartNullMean = teBreathToHeartNullDist.getMeanOfDistribution();
teBreathToHeartNullStd = teBreathToHeartNullDist.getStdOfDistribution();
print("TE(k=%d,l=%d,knn=%d): h->b = %.3f" % (kHistory, lHistory, knn, teHeartToBreath[knnIndex])), # , for no newline
if (numSurrogates > 0):
print(" (null = %.3f +/- %.3f)" % (teHeartToBreathNullMean, teHeartToBreathNullStd)),
print(", b->h = %.3f nats" % teBreathToHeart[knnIndex]),
if (numSurrogates > 0):
print("(null = %.3f +/- %.3f)" % (teBreathToHeartNullMean, teBreathToHeartNullStd)),
print
# Exercise: plot the results
Dataset is:
The first column is heart rate, the second is chest volume, and the third is blood oxygen concentration.
76.53 8320 7771
76.53 8117 7774
76.15 7620 7788
75.39 6413 7787
75.51 7518 7767
76.67 1247 7773
78.55 -3525 7784
79.96 2388 7764
79.71 8296 7775
78.30 7190 7784
77.02 6024 7777
76.62 5825 7784
76.53 5154 7809
76.65 7464 7805
76.95 5345 7806
78.46 -993 7813
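Regarding the traceback itself (my reading of the error, not something stated in the question): an IndentationError: unexpected indent on the very first statement usually means that line starts with stray whitespace in the .py file. A minimal illustration of the failure and the fix:
# Broken: a top-level statement indented with spaces or tabs fails to compile;
# a file whose first line is "    from jpype import *" raises
# "IndentationError: unexpected indent" before anything runs.
#
# Fixed: top-level statements must start in column 0 (and the file should not
# mix tabs and spaces).
from jpype import *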

Stack overflow error due to long lineage in a for loop (on DataFrame)

I have an iterative algorithm (PySpark) in which I update part of my Spark DataFrame. I do it via a for loop, and at each iteration my job becomes more expensive and has a longer lineage. At iteration i, I have the lineage of iteration i-1 plus some steps (the lineage gets longer and longer).
I have tried many options to break the lineage but none of them work. Here is my source code; I work on a JupyterLab VM.
def chronologically_compute(myDataFrame, number_of_compute, spark_session):
    # UDFs
    find_law_to_apply_udf = udf(find_law_to_apply, IntegerType())
    compute_loss_udf = udf(compute_loss, FloatType())

    TIMING = []
    #myDataFrame = myDataFrame.repartition(1000)
    spark_session.sparkContext.setCheckpointDir("myDirectory")
    #myDataFrame.explain(True)
    #myDataFrame.checkpoint()

    for i in range(1, number_of_compute + 1):
        debutRank = time.time()
        print("Iteration", i)

        myDataFrame = myDataFrame.withColumn("column1",
            when(myDataFrame.rank == i, find_law_to_apply_udf("updatedComputed")
            ).otherwise(myDataFrame.column1))

        myDataFrame = myDataFrame.withColumn("SelectedValue",
            when(myDataFrame.rank == i, myDataFrame["column2"].getItem(col("column1") - 1)
            ).otherwise(myDataFrame.SelectedValue))

        myDataFrame = myDataFrame.withColumn("computed",
            when(myDataFrame.rank == i, compute_loss_udf("SelectedValue", "Time")
            ).otherwise(myDataFrame.computed))

        window = Window.partitionBy('ID')
        myDataFrame = myDataFrame.withColumn('computedSum', sum("computed").over(window))

        myDataFrame = myDataFrame.withColumn('updatedComputed',
            when(myDataFrame.rank == i, myDataFrame.computedSum + myDataFrame.updatedComputed
            ).otherwise(myDataFrame.updatedComputed))

        myDataFrame = myDataFrame.withColumn('updatedComputed',
            when(myDataFrame.rank == i + 1, myDataFrame.computedSum + myDataFrame.updatedComputed
            ).otherwise(myDataFrame.updatedComputed))

        if i % 10 == 0:
            d = time.time()
            myDataFrame.checkpoint()
            print(myDataFrame.count())
            #myDataFrame.persist(StorageLevel.DISK_ONLY_2)
            duree_lineage = time.time() - d
            print("Lineage took {0}".format(duree_lineage))
            TIMING.append(duree_lineage)

        duree = time.time() - debutRank
        print("Modif took {0}".format(duree))

    print("Iteration time sum", np.sum(TIMING))
    print("Iteration time avg", np.mean(TIMING))
    return myDataFrame

def main(spark_session):
    try:
        spark_jobs(spark_session)
    except Exception as ex:
        print(traceback.format_exc())
        raise

if __name__ == "__main__":
    SPARK_SESSION = SparkSession \
        .builder \
        .appName("AppName") \
        .enableHiveSupport() \
        .config('spark.executor.memory', '2g') \
        .config('spark.driver.memory', '2g') \
        .config('spark.driver.maxResultsSize', '2g') \
        .config("spark.logLineage", "true") \
        .config("spark.executor.extraJavaOptions", "-Xss32M") \
        .getOrCreate()

    main(SPARK_SESSION)
    SPARK_SESSION.stop()
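One detail that may explain why the checkpoints above have no visible effect (based on the PySpark API, not confirmed in the question): DataFrame.checkpoint() does not truncate the lineage of the existing variable in place; it returns a new, checkpointed DataFrame that has to be assigned back. A sketch of the checkpoint step inside the loop under that assumption:
# Inside the for loop: reassign the checkpointed DataFrame so the following
# iterations build on the truncated lineage rather than on the full plan.
if i % 10 == 0:
    d = time.time()
    myDataFrame = myDataFrame.checkpoint()  # eager by default: materializes and cuts the lineage
    print(myDataFrame.count())              # optional, kept from the original for inspection
    duree_lineage = time.time() - d
    print("Lineage took {0}".format(duree_lineage))
    TIMING.append(duree_lineage)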
