How to efficiently map data between time series in Python

I'm trying to create an efficient function for re-sampling time-series data.
Assumption: Both sets of time-series data have the same start and end time. (I do this in a separate step.)
Resample function (inefficient)
import numpy as np
def resample(desired_time_sequence, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, len(desired_time_sequence)).round().astype(int)
    downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
    return downsampled_array
Speed testing
import timeit
def test_speed(): resample([1,2,3], [.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6])
print(timeit.timeit(test_speed, number=100000))
# 1.5003695999998854
Interested to hear any suggestions.

Replacing
downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
with
downsampled_array = data_sequence[downsampling_indices]
provided a 7x speedup on my test data.
Code used to measure the speedup:
import timeit
f1 = """
def resample(output_len, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, output_len).round().astype(int)
    downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
    return downsampled_array
resample(output_len, data_sequence)
"""
f2 = """
def resample_fast(output_len, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, output_len).round().astype(int)
    downsampled_array = data_sequence[downsampling_indices]
    return downsampled_array
resample_fast(output_len, data_sequence)
"""
setup="""
import numpy as np
data_sequence = np.random.randn(10000)
output_len = 752
"""
print(timeit.timeit(f1, setup, number=1000))
print(timeit.timeit(f2, setup, number=1000))
# prints:
# 0.30194038699846715
# 0.041797632933594286
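One caveat worth adding (not part of the original answer): the fast version relies on NumPy fancy indexing, so data_sequence has to be a NumPy array. The example call in the question passes plain Python lists, which would raise a TypeError when indexed with an index array. A small sketch that converts the input first:
import numpy as np
def resample_fast(desired_time_sequence, data_sequence):
    # np.asarray is a no-op for arrays and converts lists, so both input types work
    data_sequence = np.asarray(data_sequence)
    downsampling_indices = np.linspace(0, len(data_sequence)-1, len(desired_time_sequence)).round().astype(int)
    return data_sequence[downsampling_indices]
print(resample_fast([1, 2, 3], [.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6]))
# e.g. [0.5 3.5 6. ]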

Related

Numba function fails when imported from another script, but works well when compiled manually

Here is a reproducible example. The Numba function "is_in_set_pnb" doesn't work when it is imported from the "functions.py" script. However, it works just fine when it is defined in the same script.
functions.py script
import numpy as np
import numba as nb
import pandas as pd
# corrFilter function
def corrFilter(df, threshold):
    corr_matrix = df.corr().abs()
    flat_matrix = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
                   .stack()
                   .sort_values(ascending=False))
    flat_matrix = pd.DataFrame(flat_matrix).reset_index()
    flat_matrix.columns = ["var1", "var2", "correlation"]
    flat_matrix = flat_matrix.reindex(flat_matrix.correlation.sort_values(ascending=False).index).reset_index().drop(["index"], axis=1)
    filtered_matrix = flat_matrix[flat_matrix.correlation > threshold]
    pairs_to_remove = filtered_matrix[["var1", "var2"]].to_numpy()
    return pairs_to_remove
# Numba function / numpy.isin() improved version
@nb.jit(parallel=True)
def is_in_set_pnb(a, b):
    shape = a.shape
    a = a.ravel()
    n = len(a)
    result = np.full(n, False)
    set_b = set(b)
    for i in nb.prange(n):
        if a[i] in set_b:
            result[i] = True
    return result.reshape(shape)
main.py script
from sklearn import datasets
import pandas as pd
import numpy as np
import itertools
from functions import *
# Loading data
iris = datasets.load_iris()
data = iris.data
data = pd.DataFrame(data)
data.columns = ["var1", "var2", "var3", "var4"]
# Get pairs with a correlation higher than 0.5
remove_pairs = corrFilter(data, 0.5)
remove_pairs_check = np.sum(remove_pairs, axis=1)
# Remove triplets with pairs with a higher correlation than 0.5
triplets = [list(k) for k in itertools.combinations(data.columns, 3)]
array_triplets = np.vstack(triplets).astype(object)
n, d = array_triplets.shape
pair1 = np.sum(array_triplets[:,[0,1]], axis=1).reshape(n, 1)
pair2 = np.sum(array_triplets[:,[1,2]], axis=1).reshape(n, 1)
pair3 = np.sum(array_triplets[:,[0,2]], axis=1).reshape(n, 1)
array_triplets_check = np.concatenate((pair1, pair2, pair3), axis=1)
# Check if each pair is in the remove_pairs_check list
array_triplets_check_v1 = np.in1d(array_triplets_check, remove_pairs_check).reshape(n, d)
# Using numba function from "functions.py" script
# This fails!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
However, if we define the numba function within the main.py script:
import numba as nb
@nb.jit(parallel=True)
def is_in_set_pnb(a, b):
    shape = a.shape
    a = a.ravel()
    n = len(a)
    result = np.full(n, False)
    set_b = set(b)
    for i in nb.prange(n):
        if a[i] in set_b:
            result[i] = True
    return result.reshape(shape)
# This works properly!!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
What am I missing? I am using:
numba 0.50.1 py38h47e9c7a_0

How to find the best fit in Python (Banister impulse model)

I have this formula that is used to predict athletic performance based on daily stress.
It is based on five constants unique to each person. I'm trying to find these constants from the daily stress values and the performance tests that have been done. I'm new to programming and I don't know where to start.
See the formula:
Performance(t) = k1*Fitness(t) - k2*Fatigue(t) + P0, where Fitness(t) = Fitness(t-1)*exp(-1/t1) + stress(t) (yesterday's fitness with decay, plus today's stress) and Fatigue(t) = Fatigue(t-1)*exp(-1/t2) + stress(t).
This is a sample of the data: data
Thank you.
import pandas as pd
import numpy as np
import math
from scipy import optimize
data = pd.read_csv('data_mod1.csv')
TSS = data['stress'].fillna(0)
arr = np.array(TSS)
#data = data.dropna()
a = [arr[0]]
b = [arr[0]]
x = arr[1:]
def Banister(x, t1, t2, k1, k2, c):
    for v in x:
        a.append(a[-1]*np.exp(-1/t1) + v)
        b.append(b[-1]*np.exp(-1/t2) + v)
    data['fit'] = pd.Series(a)
    data['fat'] = pd.Series(b)
    data['perf'] = ((data['fit']*k1) - (data['fat']*k2)) + c
    return data['perf']
from scipy.optimize import curve_fit
fit = curve_fit(Banister, arr, data[data.index], p0=[20, 10, 1, 2, 50])
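Since the linked sample data isn't reproduced here, below is only a minimal, self-contained sketch (not the asker's data) of how the model above could be wired into scipy.optimize.curve_fit: the model function takes the stress series as the independent variable and the five constants (t1, t2, k1, k2, P0) as free parameters, and returns the predicted performance for each day. The synthetic stress/performance arrays are stand-ins for the CSV columns.
import numpy as np
from scipy.optimize import curve_fit
def banister(stress, t1, t2, k1, k2, p0):
    # Accumulate fitness and fatigue recursively; each decays by exp(-1/tau) per day
    fitness = np.zeros(len(stress))
    fatigue = np.zeros(len(stress))
    fitness[0] = fatigue[0] = stress[0]
    for i in range(1, len(stress)):
        fitness[i] = fitness[i-1]*np.exp(-1/t1) + stress[i]
        fatigue[i] = fatigue[i-1]*np.exp(-1/t2) + stress[i]
    return k1*fitness - k2*fatigue + p0
# Stand-in data: daily stress plus a noisy "measured" performance generated from known constants
stress = np.random.rand(200) * 100
performance = banister(stress, 25, 10, 1.0, 1.8, 50) + np.random.randn(200)
params, _ = curve_fit(banister, stress, performance, p0=[20, 10, 1, 2, 50])
print(params)  # fitted t1, t2, k1, k2, P0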

Creating a vector of values based off a test using a for loop

This feels like it should be a simple problem, but I am new to Python; in R I would use a foreach loop, which gives me an option to combine the results.
I have tried a for loop that lets me print out all the values I need, but I want them collected into a vector of values that I can use later.
from scipy.stats import gamma
import scipy.stats as stats
import numpy as np
import random
data2 = np.random.gamma(1,2, size = 500)
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean/ sd)**2
b = (sd**2)/ mean
for i in range(1, 100):
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    print(s)
So I am able to print all the values I want, but I want them gathered together in a vector of values. I have tried to append and make lists, but I am not able to get them together.
from scipy.stats import gamma
import scipy.stats as stats
import numpy as np
import random
# data2 generated as in the question, so this snippet runs on its own
data2 = np.random.gamma(1, 2, size=500)
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean/ sd)**2
b = (sd**2)/ mean
#initialize empty list
result = []
for i in range(100):
    # removed (1,100); you only need range(100) for 100 elements
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    # append calculation to list
    result.append(s)
    print(s)
print(result)
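As a side note (not part of the original answer), the same collection can be written as a list comprehension and converted to a NumPy array at the end, which gives the kind of vector R users usually expect for further computation; a small self-contained sketch:
import numpy as np
import random
import scipy.stats as stats
data2 = np.random.gamma(1, 2, size=500)
gammT = np.log(data2 + 1)
a = (np.mean(gammT) / np.std(gammT))**2
b = np.std(gammT)**2 / np.mean(gammT)
# Collect the 100 Anderson-Darling significance levels directly into a NumPy array
result = np.array([
    stats.anderson_ksamp([random.sample(list(gammT), 500),
                          np.random.gamma(a, b, size=500)])[2]
    for _ in range(100)
])
print(result.shape)  # (100,)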

Bootstrapping: Is there a faster way?

I'm trying to compute the bootstrap distribution of the total of an array, and I'm wondering whether this can be improved in terms of speed.
from numpy import array, sum
from numpy.random import choice
def bootstrap(observed_array: array, number_of_bootstraps: int = 10000) -> array:
    number_of_elements = len(observed_array)
    bootstrap_estimates = []
    for _ in range(number_of_bootstraps):
        indices = choice(number_of_elements, size=number_of_elements, replace=True)
        bootstrap_sample = observed_array[indices]
        bootstrap_estimate = bootstrap_sample.sum()
        bootstrap_estimates.append(bootstrap_estimate)
    return array(bootstrap_estimates)
Thanks for any suggestions here.
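A sketch of one possible speed-up, assuming the goal is exactly the total-of-resample statistic shown above: draw every resampling index in a single call and reduce along an axis, so the Python-level loop disappears. This trades memory (a number_of_bootstraps x n index matrix) for speed.
import numpy as np
def bootstrap_vectorized(observed_array: np.ndarray, number_of_bootstraps: int = 10000) -> np.ndarray:
    n = len(observed_array)
    rng = np.random.default_rng()
    # One row of resample indices per bootstrap replicate, drawn in a single call
    indices = rng.integers(0, n, size=(number_of_bootstraps, n))
    # Fancy indexing yields a (number_of_bootstraps, n) matrix; sum each row
    return observed_array[indices].sum(axis=1)
observed = np.random.randn(1000)
estimates = bootstrap_vectorized(observed)
print(estimates.mean(), estimates.std())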

Vectorized sampling of multiple binomial random variables

I would like to sample a few hundred binomially distributed random variables, each with a different n and p (using the argument names as defined in the numpy.random.binomial docs). I'll be doing this repeatedly, so I'd like to vectorize the code if possible. Here's an example:
import numpy as np
# Made up parameters
N_random_variables = 500
n_vals = np.random.random_integers(100, 200, N_random_variables)
p_vals = np.random.random_sample(N_random_variables)
# Can this portion be vectorized?
results = np.empty(N_random_variables)
for i in xrange(N_random_variables):
    results[i] = np.random.binomial(n_vals[i], p_vals[i])
In the special case that n and p are the same for each random variable, I can do:
import numpy as np
# Made up parameters
N_random_variables = 500
n_val = 150
p_val = 0.5
# Vectorized code
results = np.random.binomial(n_val, p_val, N_random_variables)
Can this be generalized to the case when n and p take different values for each random variable?
Here you go,
import numpy as np
# Made up parameters
N_random_variables = 500
n_vals = np.random.random_integers(100, 200, N_random_variables)
p_vals = np.random.random_sample(N_random_variables)
# Can this portion be vectorized? Yes
results = np.random.binomial(n_vals, p_vals)
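For reference, on newer NumPy versions the same element-wise broadcasting is available on the Generator API, which also replaces the long-deprecated np.random.random_integers; a short sketch:
import numpy as np
rng = np.random.default_rng()
N_random_variables = 500
n_vals = rng.integers(100, 201, N_random_variables)  # 100..200 inclusive, like random_integers
p_vals = rng.random(N_random_variables)
results = rng.binomial(n_vals, p_vals)               # broadcasts element-wise over n_vals and p_vals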
