How to implement dependent columns in hypothesis dataframes - python

I am using hypothesis dataframes to generate a dataframe in which start_time and end_time are two of the columns. Here is a chunk:
import hypothesis.strategies as st
import logging
import datetime
from hypothesis import given
from hypothesis.extra.pandas import column, data_frames, range_indexes

current_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

datetime_st = st.integers(
    min_value=int((current_time + datetime.timedelta(hours=4)).timestamp()),
    max_value=int((current_time + datetime.timedelta(hours=20)).timestamp()),
)

df_columns = {
    # other fields omitted
    "start_time": {"elements": datetime_st, "unique": False},
    "end_time": {"elements": datetime_st, "unique": False},
}

test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=[column(key, **value) for key, value in df_columns.items()],
)

@given(df=test_dfs)
def test_hypothesis(df):
    logging.info(df)
    assert 1
I am not able to find a solution to assert that each end_time should be greater than its corresponding start_time by at least some delta. I have tried composite, but I am not sure how to apply it to each row of the dataframe.
Is there a way to enforce the delta as a rule when initialising start_time and end_time?

Here's a way to generate a dataframe with two timestamp columns where the second one is at least 3600 seconds (or any other amount of time) after the first one. I'm using flatmap for that.
import datetime
import logging

import hypothesis.strategies as st
from hypothesis import given
from hypothesis.extra.pandas import column, columns, data_frames, range_indexes

current_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp()

MIN_DIFF_SECONDS = 3600

two_timestamps_with_diff = st.integers(
    min_value=int(current_time) + 3600 * 4,
    max_value=int(current_time) + 4600 * 20,
).flatmap(
    lambda n: st.tuples(
        st.integers(min_value=n, max_value=n),
        st.integers(min_value=n + MIN_DIFF_SECONDS, max_value=n + 3600 * 10),
    )
)

# sample code to examine the results of this strategy
# for _ in range(10):
#     x, y = two_timestamps_with_diff.example()
#     print(x, y, y - x)

test_dfs = data_frames(
    index=range_indexes(min_size=20, max_size=100),
    columns=columns(["start_time", "end_time"], dtype=int),
    rows=two_timestamps_with_diff,
)

# sample code to examine the results of this strategy
# res = test_dfs.example()
# res.assign(d=res.end_time - res.start_time)

# a test with an assertion that validates this constraint
@given(df=test_dfs)
def test_hypothesis(df):
    logging.info(df)
    assert ((df.end_time - df.start_time) >= MIN_DIFF_SECONDS).all()

# run the test. It passes.
test_hypothesis()
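For completeness, the same per-row constraint can also be expressed with composite, which the question mentions trying. This is only a sketch that assumes the current_time and MIN_DIFF_SECONDS defined above:

@st.composite
def start_end_row(draw):
    # draw a start timestamp, then an end timestamp at least MIN_DIFF_SECONDS later
    start = draw(st.integers(min_value=int(current_time) + 3600 * 4,
                             max_value=int(current_time) + 3600 * 20))
    end = draw(st.integers(min_value=start + MIN_DIFF_SECONDS,
                           max_value=start + 3600 * 10))
    return (start, end)

# can be passed to data_frames in place of two_timestamps_with_diff, e.g. rows=start_end_row()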
If you'd like to add additional columns to the autogenerated dataframe, do the following (the new columns are 'A' and 'B' in this example):
import pandas as pd
from hypothesis.strategies import composite

@composite
def test_df_with_additional_columns(draw, elements=test_dfs):
    df = draw(test_dfs)

    class GetIndex(st.SearchStrategy[pd.core.indexes.range.RangeIndex]):
        def do_draw(self, _):
            return df.index

    more_col_strategy = data_frames(
        [column('A', dtype=int), column('B', dtype=float)],
        index=GetIndex(),
    )
    more_cols = draw(more_col_strategy)
    return pd.concat([df, more_cols], axis=1)

test_df_with_additional_columns().example()
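The combined strategy is then used in a test the same way as test_dfs; for example (a small sketch, with a hypothetical test name):

@given(df=test_df_with_additional_columns())
def test_with_extra_columns(df):
    # the original start/end constraint still holds for the autogenerated columns
    assert ((df.end_time - df.start_time) >= MIN_DIFF_SECONDS).all()
    assert {'A', 'B'}.issubset(df.columns)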

Related

Same SQLAlchemy ORM query written in different ways and read into a pandas dataframe: question about optimizations

I've timed the queries defined below and compared the results. The results are equal, but the v2 query is slightly slower, and I think both queries should be much faster than what I'm getting.
def get_measurements(session, start_time: int, end_time: int):
    query = session.query(Measurement, Sensor.sensor_node_id, Room.name).filter(
        Measurement.timestamp >= start_time,
        Measurement.timestamp <= end_time,
        Measurement.sensor_id == Sensor.id,
        Sensor.room_id == Room.id
    )
    return query

def get_measurements_v2(session, start_time: int, end_time: int):
    query = session.query(Measurement).filter(
        Measurement.timestamp >= start_time,
        Measurement.timestamp <= end_time
    ).options(
        joinedload('sensor').load_only('sensor_node_id').options(
            joinedload('room').load_only('name'))
    )
    return query
Below is the code for comparisons:
start_time = datetime.now() - timedelta(days=360)
start_time = int(
    pd.Timestamp(
        datetime(start_time.year, start_time.month, start_time.day, 0, 0, 0)
    ).timestamp()
)
end_time = datetime.now()
end_time = int(
    pd.Timestamp(
        datetime(end_time.year, end_time.month, end_time.day, 23, 59, 59)
    ).timestamp()
)
start_v1 = timeit.default_timer()
query = get_measurements(session, start_time, end_time)
query_df = pd.read_sql(query.statement, query.session.bind)
stop_v1 = timeit.default_timer()
print(f"Query time for V1: {stop_v1 - start_v1}")
start_v2 = timeit.default_timer()
query_v2 = get_measurements_v2(session, start_time, end_time)
query_df_v2 = pd.read_sql(query_v2.statement, query_v2.session.bind)
stop_v2 = timeit.default_timer()
print(f"Query time for V2: {stop_v2 - start_v2}")
# drop id columns
query_df_v2.drop(columns=['id', 'id_1'], inplace=True)
# Rearrange columns to ['timestamp', 'temperature', 'humidity', 'ambient_light', 'air_quality','co2_index', 'pressure', 'iaq_accuracy', 'battery_level', 'sensor_id','created_at', 'sensor_node_id', 'name']
query_df_v2 = query_df_v2[['timestamp', 'temperature', 'humidity', 'ambient_light', 'air_quality','co2_index', 'pressure', 'iaq_accuracy', 'battery_level', 'sensor_id','created_at', 'sensor_node_id', 'name']]
#Are the results equal
print(query_df.equals(query_df_v2))
print(f"length of dataframes {len(query_df)} and {len(query_df_v2)}")
The results:
Query time for V1: 8.65921110000636
Query time for V2: 8.950237099998049
True
length of dataframes 1502636 and 1502636
As you can see, the query returns a dataframe with about 1.5 million rows (which isn't that much). The measurement table has indexes on timestamp and sensor_id, and the sensor table has an index on sensor_node_id. For my purposes this is too slow. I've also tried using TimescaleDB with hypertables, which I understood should improve performance, but it didn't (probably a fault on my side).

xml with pandas: sum values by condition

recent_cases is supposed to sum the new COVID cases in the last 10 days for a given location.
Somehow my code prints None; I can't find the problem.
import json
import pandas as pd
import plotly.express as ex
from datetime import *
from datetime import timedelta

class Covid:
    dt = timedelta(days=1)
    ten_days = timedelta(days=10)
    covid_data = pd.read_excel("owid-covid-data.xlsx", usecols="C:F,H,I")

    def recent_cases(self, cntry):
        today = datetime.today()
        temp = today - self.ten_days  # 10 days before today
        sum_of_cases = 0
        for ind in self.covid_data.index:
            if temp <= today:
                if (self.covid_data["date"][ind] == temp) and (self.covid_data['location'][ind] == cntry):
                    # if 'date' is temp and 'location' is the location input, sum new cases
                    sum_of_cases = sum_of_cases + int(self.covid_data["new_cases"][ind])
                    temp = temp + self.dt  # move to the next day
            else:  # if temp passed today, all past ten days cases are summed
                break

if __name__ == '__main__':
    c = Covid()
    print(c.recent_cases('Italy'))

How can I print the results from this script? I can't get any results in my IDE

I am trying to see the results for this script in Spyder, but I can't get it to print and I'm not sure how. I tried print(options), print(opt), print(exps), but nothing seems to be working. I don't get any "errors" either... I just get the normal In[number]: runfile(my path)
import pandas as pd
import yfinance as yf
import datetime

def options_chain(symbol):
    tk = yf.Ticker(symbol)
    # Expiration dates
    exps = tk.options

    # Get options for each expiration
    options = pd.DataFrame()
    for e in exps:
        opt = tk.option_chain(e)
        opt = pd.DataFrame().append(opt.calls).append(opt.puts)
        opt['expirationDate'] = e
        options = options.append(opt, ignore_index=True)

    # Bizarre error in yfinance that gives the wrong expiration date
    # Add 1 day to get the correct expiration date
    options['expirationDate'] = pd.to_datetime(options['expirationDate']) + datetime.timedelta(days=1)
    options['dte'] = (options['expirationDate'] - datetime.datetime.today()).dt.days / 365

    # Boolean column if the option is a CALL
    options['CALL'] = options['contractSymbol'].str[4:].apply(
        lambda x: "C" in x)

    options[['bid', 'ask', 'strike']] = options[['bid', 'ask', 'strike']].apply(pd.to_numeric)
    options['mark'] = (options['bid'] + options['ask']) / 2  # Calculate the midpoint of the bid-ask

    # Drop unnecessary and meaningless columns
    options = options.drop(columns=['contractSize', 'currency', 'change', 'percentChange', 'lastTradeDate', 'lastPrice'])

    return options

print(options)

Python/Sqlalchemy/Sqlite - How to add datetime field and integer seconds (timedelta) in where condition?

I have a sqlalchemy/sqlite table:
class MyTable(Base):
    __tablename__ = 'mytable'
    ...
    field_dt = Column(DateTime)
    field_int = Column(Integer, default=0)
Now I would like to construct the where condition in which I want to check whether field_dt + field_int (seconds) <= utc_now.
Something like: select(MyTable).where(?).
Without sqlalchemy/sqlite I would construct the condition like this:
import datetime as dt

utc_now = dt.datetime(2022, 3, 2, 1, 0, 10)
field_dt = dt.datetime(2022, 3, 1, 1, 0, 5)
field_int = 60

print(f" utc_now = {utc_now.isoformat()}")
print(f" field_dt = {field_dt.isoformat()}")
print(f"field_int = {field_int}")

if field_dt + dt.timedelta(seconds=field_int) < utc_now:
    print('it is less than utc_now')
Output:
utc_now = 2022-03-02T01:00:10
field_dt = 2022-03-01T01:00:05
field_int = 60
it is less than utc_now
How can I do the same with sqlalchemy/sqlite?
SQLite 3.38.0 implements a unixepoch function that can convert a datetime to a Unix timestamp, so in theory we could do
import sqlalchemy as sa

# Untested
q = sa.select(MyTable).where(
    (sa.func.unixepoch(MyTable.field_dt) + MyTable.field_int)
    < sa.func.unixepoch(dt.datetime.utcnow())
)
However, 3.38.0 was released on 2022-02-22, so at the time of writing it may not be widely distributed.
If unixepoch is not available we can use SQLite's datetime function to construct a new datetime. The SQL would look something like this:
select datetime(field_dt, '+' || cast(field_int as text) || ' seconds') as dt
from mytable
where dt < datetime('now');
The SQLAlchemy equivalent is:
q = sa.select(MyTable).where(
    sa.func.datetime(
        MyTable.field_dt,
        '+' + sa.cast(MyTable.field_int, sa.String) + ' seconds',
    )
    < dt.datetime.utcnow()
)
If field_dt is indexed, consider moving the modifier to the RHS of the inequality:
q = sa.select(MyTable).where(
    MyTable.field_dt
    < sa.func.datetime(
        dt.datetime.utcnow(),
        '-' + sa.cast(MyTable.field_int, sa.String) + ' seconds',
    )
)
It may be worth considering storing the datetime as a Unix timestamp to simplify the query.
The SQLite date functions documentation is here.
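As a sketch of that last suggestion (the field_ts column name is hypothetical, and the rest of the model is assumed from the question), the comparison becomes plain integer arithmetic:

import datetime as dt
import sqlalchemy as sa

class MyTable(Base):
    __tablename__ = 'mytable'
    ...
    field_ts = Column(Integer)   # seconds since the Unix epoch instead of a DateTime
    field_int = Column(Integer, default=0)

# current UTC time as a Unix timestamp
utc_now_ts = int(dt.datetime.now(dt.timezone.utc).timestamp())
q = sa.select(MyTable).where(MyTable.field_ts + MyTable.field_int < utc_now_ts)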

Python multiprocessing multiple iterations

I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores. I want to iterate through a table of 12 million rows, and for each of these rows iterate through several time steps, doing a calculation (executing a function) at each step.
This line I would like to split up that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any idea. I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u

# add one column per time step
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # B
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list

def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output

if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of table pc
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head())
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool

n_cpu = 10  # set this to the number of CPU threads you want to use
with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope, so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)

pc2 = pd.DataFrame(out)
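One caveat worth adding (not part of the original answer): on platforms that start worker processes with spawn (e.g. Windows), the Pool creation and the imap_unordered loop need to run under the if __name__ == '__main__': guard that the question already uses, roughly like this:

if __name__ == '__main__':
    pc, u = read_data()
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns

    # create the pool only in the main process, after the data is loaded
    with mp.Pool(processes=mp.cpu_count()) as p:
        out = []
        for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
            row.index = cols
            out.append(row)

    pc2 = pd.DataFrame(out)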
