I found this useful article on polyfit, which works pretty well:
http://www.emilkhatib.com/analyzing-trends-in-data-with-pandas/
import numpy as np
coefficients, residuals, _, _, _ = np.polyfit(range(len(selected.index)),selected,1,full=True)
mse = residuals[0]/(len(selected.index))
nrmse = np.sqrt(mse)/(selected.max() - selected.min())
print('Slope ' + str(coefficients[0]))
print('NRMSE: ' + str(nrmse))
Now I would like to use this on a rolling basis.
def test(input_list, i):
    if sum(~np.isnan(x) for x in input_list) < 2:
        return np.NaN
    print(input_list)
    coefficients, residuals, _, _, _ = np.polyfit(range(len(input_list)), input_list, 1, full=True)
    mse = residuals[0] / len(input_list)
    nrmse = np.sqrt(mse) / (input_list.max() - input_list.min())
    print('Slope ' + str(coefficients[0]))
    print('NRMSE: ' + str(nrmse))
    a = coefficients[0] * i + coefficients[1]
    return a
df['pred'] = df['abs'].rolling(window=2, min_periods=1, center=False).apply(lambda x: test(x, base1.index))
But I can't get it working. Instead of correct results I get:
IndexError: index 0 is out of bounds for axis 0 with size 0
Anybody got an idea? Thanks! e.
****EDIT1****
Sorry, I missed posting a concrete example...
I managed to get the function working by transforming the numpy array into a DataFrame, but somehow residuals is empty:
import quandl
import MySQLdb
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
def test(input_list, i):
    if sum(~np.isnan(x) for x in input_list) < 2:
        return np.NaN
    abc = pd.DataFrame(input_list)
    coefficients, residuals, _, _, _ = np.polyfit(range(len(abc)), abc[0], 1, full=True)
    # residuals is empty... why?
    a = coefficients[0] * len(abc) + coefficients[1]
    return a
df = quandl.get("WIKI/GOOGL")
df = df.loc[:, ['High', 'Low', 'Close']]  # .ix is deprecated; use .loc
#reset index for calc
#base1['DateTime'] = base1.index
#base1.index = range(len(base1))
df['close_pred'] = df['Close'].rolling(window=15, min_periods=2, center=False).apply(lambda x: test(x, 0))
print(df.head(30).to_string())
Residuals are empty just for the first window; see the slightly modified code and output below. With only two points, a first-degree fit is exact, so np.polyfit returns an empty residuals array.
def test(data):
    if sum(~np.isnan(x) for x in data) < 2:
        return np.NaN
    df = pd.DataFrame(data)
    coefficients, residuals, _, _, _ = np.polyfit(range(len(data)), df[0], 1, full=True)
    #if residuals.size == 0:
    #    residuals = [0]
    print(coefficients[-2], residuals, data)
    return coefficients[-2]
and the call with its output:
df_xx['pred'] = df_xx[0].rolling(window=5, min_periods=2, center=False).apply(lambda y: test(y))
0.9999999999999998 [] [0. 1.]
1.0 [4.29279946e-34] [0. 1. 2.]
1.0000000000000002 [3.62112419e-33] [0. 1. 2. 3.]
0.9999999999999999 [8.77574736e-31] [0. 1. 2. 3. 4.]
0.9999999999999999 [1.25461096e-30] [1. 2. 3. 4. 5.]
0.9999999999999999 [2.93468782e-30] [2. 3. 4. 5. 6.]
0.9999999999999997 [1.38665176e-30] [3. 4. 5. 6. 7.]
0.9999999999999997 [2.18347839e-30] [4. 5. 6. 7. 8.]
0.9999999999999999 [6.21693422e-30] [5. 6. 7. 8. 9.]
1.0 [1.07025673e-29] [ 6. 7. 8. 9. 10.]
1.0000000000000002 [1.4374879e-29] [ 7. 8. 9. 10. 11.]
0.9999999999999997 [1.14542951e-29] [ 8. 9. 10. 11. 12.]
1.0000000000000004 [9.73226454e-30] [ 9. 10. 11. 12. 13.]
0.9999999999999997 [1.99069506e-29] [10. 11. 12. 13. 14.]
0.9999999999999997 [1.09437894e-29] [11. 12. 13. 14. 15.]
1.0 [3.60983058e-29] [12. 13. 14. 15. 16.]
1.0000000000000002 [1.90967258e-29] [13. 14. 15. 16. 17.]
1.0000000000000002 [3.13030715e-29] [14. 15. 16. 17. 18.]
1.0 [1.25806434e-29] [15. 16. 17. 18. 19.]
The simple fix below handles it:

if residuals.size == 0:
    residuals = [0]
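Putting it all together, a minimal sketch of the rolling slope fit with the empty-residuals guard (the input series here is just a toy ramp, not the asker's data):

import numpy as np
import pandas as pd

def rolling_slope(window):
    # need at least two non-NaN points for a first-degree fit
    if np.count_nonzero(~np.isnan(window)) < 2:
        return np.nan
    coefficients, residuals, _, _, _ = np.polyfit(
        range(len(window)), window, 1, full=True)
    if residuals.size == 0:  # exact fit, e.g. a 2-point window
        residuals = [0]
    return coefficients[0]

s = pd.Series([0., 1., 2., 3., 4., 5.])
print(s.rolling(window=5, min_periods=2).apply(rolling_slope, raw=True))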
I was trying to understand how the fast_knn function of the impyute library works, so I tried to execute it line by line. Here it is:
import numpy as np
from scipy.spatial import KDTree

def shepards(distances, power=2):
    return to_percentage(1 / np.power(distances, power))

def to_percentage(vec):
    return vec / np.sum(vec)

data_temp = np.arange(25).reshape((5, 5)).astype(float)
data_temp[0][2] = np.nan
k = 4
eps = 0
p = 2
distance_upper_bound = np.inf
leafsize = 10
idw_fn = shepards
init_impute_fn = mean  # mean-imputation function; not defined in this snippet (impyute uses its own mean imputer)

nan_xy = np.argwhere(np.isnan(data_temp))
data_temp_c = init_impute_fn(data_temp)
kdtree = KDTree(data_temp_c, leafsize=leafsize)
for x_i, y_i in nan_xy:
    distances, indices = kdtree.query(data_temp_c[x_i], k=k+1, eps=eps,
                                      p=p, distance_upper_bound=distance_upper_bound)
    # Will always return itself in the first index. Delete it.
    distances, indices = distances[1:], indices[1:]
    # Add small constant to distances to avoid division by 0
    distances += 1e-3
    weights = idw_fn(distances)
    # Assign missing value the weighted average of `k` nearest neighbours
    data_temp[x_i][y_i] = np.dot(weights, [data_temp_c[ind][y_i] for ind in indices])
data_temp
This outputs:
array([[ 0. , 1. , 10.06569379, 3. , 4. ],
[ 5. , 6. , 7. , 8. , 9. ],
[10. , 11. , 12. , 13. , 14. ],
[15. , 16. , 17. , 18. , 19. ],
[20. , 21. , 22. , 23. , 24. ]])
whereas the function itself gives a different output. The code:

from impyute import fast_knn
import numpy as np

data_temp = np.arange(25).reshape((5, 5)).astype(float)
data_temp[0][2] = np.nan
fast_knn(data_temp, k=4)

and the output:
array([[ 0. , 1. , 16.78451885, 3. , 4. ],
[ 5. , 6. , 7. , 8. , 9. ],
[10. , 11. , 12. , 13. , 14. ],
[15. , 16. , 17. , 18. , 19. ],
[20. , 21. , 22. , 23. , 24. ]])
There seem to be discrepancies between the GitHub repository code and the installed library source code (the repository has not been updated). The following is the library source code:
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, **kwargs):
    null_xy = find_null(data)
    data_c = mean(data)
    kdtree = KDTree(data_c, leafsize=leafsize)
    for x_i, y_i in null_xy:
        distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
                                          p=p, distance_upper_bound=distance_upper_bound)
        # Will always return itself in the first index. Delete it.
        distances, indices = distances[1:], indices[1:]
        weights = distances/np.sum(distances)
        # Assign missing value the weighted average of `k` nearest neighbours
        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
    return data
The weights are computed in a different manner (not using the shepards function). Hence, the difference in outputs.
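A quick numeric check of the two weighting schemes on the same made-up distances shows why the imputed cell differs:

import numpy as np

distances = np.array([1.0, 2.0, 3.0, 4.0])  # made-up neighbour distances

# master branch: inverse-distance (Shepard) weights; nearer neighbours count more
idw = (1 / distances**2) / np.sum(1 / distances**2)

# release/0.0.8: weights proportional to distance; farther neighbours count more
prop = distances / np.sum(distances)

print(np.round(idw, 2))  # [0.7  0.18 0.08 0.04]
print(prop)              # [0.1 0.2 0.3 0.4]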
Maybe you used the code on the current master branch of impyute, but the impyute package version you installed is probably v0.0.8 (the most recent release), whose code is on the release/0.0.8 branch.
The difference in the definition of fast_knn is below.
On the current master branch:
# Will always return itself in the first index. Delete it.
distances, indices = distances[1:], indices[1:]
# Add small constant to distances to avoid division by 0
distances += 1e-3
weights = idw_fn(distances)
On release/0.0.8 branch:
# Will always return itself in the first index. Delete it.
distances, indices = distances[1:], indices[1:]
weights = distances/np.sum(distances)
If you use the code on the release/0.0.8 branch, you will get the same result as with the installed impyute package.
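To confirm which code you are actually running, you can check the installed version and where the module was imported from; a quick sketch (reading __version__ is an assumption, but most released packages expose it, and pip show impyute works as a fallback):

import impyute

print(impyute.__version__)  # which release the installed code corresponds to
print(impyute.__file__)     # where the imported source actually lives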
I have a 2D matrix of numbers and I want to randomly change a fraction (e.g. 0.2) of the non-zero members to zero, and then randomly choose the same number of entries among all the zeros and give them random values. Is there any straightforward way to do that?
for example:
The original matrix is : x = [[1,2,3],[4,0,7],[2,10,0]]
After first step (2 randomly selected numbers change to zero): x = [[1,0,0],[4,0,7],[2,10,0]]
After second step (2 randomly selected zeros change to random numbers): x = [[1,0,5],[4,7,7],[2,10,0]]
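To make the two steps concrete, here is a minimal sketch using np.flatnonzero and np.random.choice (the fraction and fill values are arbitrary choices):

import numpy as np

x = np.array([[1, 2, 3], [4, 0, 7], [2, 10, 0]], dtype=float)
frac = 0.2

# Step 1: zero out a fraction of the non-zero entries
nonzero = np.flatnonzero(x)  # flat indices of non-zero members
n_pick = max(1, int(frac * nonzero.size))
x.flat[np.random.choice(nonzero, n_pick, replace=False)] = 0

# Step 2: give the same number of randomly chosen zeros a random value
zeros = np.flatnonzero(x == 0)  # flat indices of zero members
x.flat[np.random.choice(zeros, n_pick, replace=False)] = np.random.randint(1, 11, n_pick)

print(x)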
One method:
arr = np.ones((5, 5)) # Your matrix
print("Before Replacement")
print(arr)
# Number of elements to replace
num_replaced = 3
# Random (x, y) coordinates
indices_x = np.random.randint(0, arr.shape[0], num_replaced)
indices_y = np.random.randint(0, arr.shape[1], num_replaced)
arr[indices_x, indices_y] = 0
print("After replacement")
print(arr)
Sample Output:
Before Replacement
[[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]]
After replacement
[[0. 1. 1. 1. 1.]
[1. 0. 1. 1. 1.]
[1. 1. 1. 1. 1.]
[1. 0. 1. 1. 1.]
[1. 1. 1. 1. 1.]]
EDIT
You can use np.random.choice instead of np.random.randint as follows:
indices_x = np.random.choice(range(arr.shape[0]), num_replaced, replace=REPLACE)
indices_y = np.random.choice(range(arr.shape[1]), num_replaced, replace=REPLACE)
Here, you can easily switch between sampling with or without replacement.
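For example, a small sketch with a fixed seed: replace=False guarantees the sampled indices along each axis are distinct, so no coordinate pair repeats:

import numpy as np

arr = np.ones((5, 5))
num_replaced = 3
np.random.seed(0)  # fixed seed just to make the example reproducible
indices_x = np.random.choice(range(arr.shape[0]), num_replaced, replace=False)
indices_y = np.random.choice(range(arr.shape[1]), num_replaced, replace=False)
arr[indices_x, indices_y] = 0
print(arr)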
I would try to create a simple function for this, so you can pass in the desired number of replacements.
import pandas as pd
import random
def random_converter(dataframe, k, isZero=True, input_data='random_value'):
    # Copy df
    dataframe_local = dataframe.copy()
    if input_data == 'random_value':
        input_data = random.randint(0, 10)
    ki = 0
    while ki < k:
        row_selected = dataframe_local.sample(1).T
        # VERIFY CONDITION
        if isZero:
            attributes = row_selected[row_selected.iloc[:, 0] == 0]
        else:
            attributes = row_selected[row_selected.iloc[:, 0] != 0]
        # No zero in the row
        if attributes.size == 0:
            continue
        column_index = attributes.index
        row_index = attributes.columns
        dataframe_local.iloc[row_index, column_index] = input_data
        ki += 1  # was `ki += 0`, which never terminates
    return dataframe_local
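A hypothetical usage sketch with the function above (the DataFrame and counts are made up; note that an iteration replaces every matching cell in the sampled row, so the count is approximate):

df = pd.DataFrame([[1, 2, 3], [4, 0, 7], [2, 10, 0]])

# Step 1: replace 2 randomly chosen non-zero entries with 0
step1 = random_converter(df, 2, isZero=False, input_data=0)
# Step 2: replace 2 randomly chosen zeros with a random value
step2 = random_converter(step1, 2, isZero=True)
print(step2)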
I'm using scipy.integrate's solve_ivp method to solve an ivp, and I want to be able to evaluate a function at the time steps that I give for the integration, but I don't know how to do it.
I could go back through each of the elements after the integration, but that would take a ridiculous amount of time on top of the time the IVP solve already takes, so I would much rather calculate them at the same time as the integration itself.
import scipy.integrate
import numpy

class Foo:
    def __init__(self):
        self.foo_vector_1 = numpy.zeros(3)
        self.foo_vector_2 = numpy.zeros(3)
        self.foo_vector_3 = numpy.zeros(3)

foo = Foo()

d_vector_1 = lambda foo: ...  # placeholder: gets the derivative of foo_vector_1
d_vector_2 = lambda foo: ...  # placeholder: gets the derivative of foo_vector_2

def get_foo_vector_3_value(foo):
    return ...  # placeholder: returns the ACTUAL VALUE of foo_vector_3, NOT its derivative

def dy(t, y):
    foo.foo_vector_1 = numpy.array((y[0], y[1], y[2]))
    foo.foo_vector_2 = numpy.array((y[3], y[4], y[5]))
    return numpy.array((d_vector_1(foo), d_vector_2(foo))).flatten().tolist()

foo.foo_vector_1 = numpy.array((1, 2, 3))
foo.foo_vector_2 = numpy.array((4, 5, 6))
y0 = numpy.array((foo.foo_vector_1, foo.foo_vector_2)).flatten().tolist()
# note: t_eval values must lie inside t_span for solve_ivp
sol = scipy.integrate.solve_ivp(dy, (0, 10), y0, t_eval=numpy.arange(0, 1000, 1))
foo_vectors_1 = numpy.column_stack((sol.y[0], sol.y[1], sol.y[2]))
foo_vectors_2 = numpy.column_stack((sol.y[3], sol.y[4], sol.y[5]))
foo_vectors_3 = ????????
Ideally, I would be able to get the value of foo_vectors_3 without having to reset foo in a loop over the whole lists of foo vectors, because for me that would actually take a significant amount of computation time.
I think the friction here comes from avoiding the 1D numpy ndarray as the base object for the computation. You can mentally apportion the 1D array into your two separate foo attributes; then the computation of foo_vectors_3 is trivial compared to the ODE integration. You could also add helper functions to map between the 1D ndarray that solve_ivp uses and your foo vectors.
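For example, a small sketch of such mapping helpers (the names are made up):

import numpy as np

def pack(v1, v2):
    # Foo attributes -> flat state vector for solve_ivp
    return np.concatenate((v1, v2))

def unpack(y):
    # flat state vector -> the two Foo attributes
    return y[0:3], y[3:6]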
In [65]: import scipy.integrate
    ...: import numpy as np
    ...:
    ...: def d_vec1(t, y):
    ...:     # put in your function here instead of just returning 1
    ...:     return 1 * np.ones_like(y)
    ...:
    ...: def d_vec2(t, y):
    ...:     # put in your function here instead of just returning 2
    ...:     return 2 * np.ones_like(y)
    ...:
    ...: def eval_foo3(t, y):
    ...:     return y[0:3, :] + y[3:, :]  # use your own function instead
    ...:
    ...: def dy(t, y):
    ...:     return np.array((d_vec1(t, y[0:3]), d_vec2(t, y[3:]))).flatten()
    ...:
    ...: v1 = np.array([1, 2, 3])
    ...: v2 = np.array([4, 5, 6])
    ...: y0 = np.array((v1, v2)).flatten()
    ...: t_eval = np.linspace(0, 10, 11)
    ...: sol = scipy.integrate.solve_ivp(dy, (0, 10), y0, t_eval=t_eval)
    ...:
    ...: foo3 = eval_foo3(sol.t, sol.y)
    ...: print(sol.y[0:3])
    ...: print(sol.y[3:])
    ...: print(foo3)
[[ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11.]
[ 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12.]
[ 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13.]]
[[ 4. 6. 8. 10. 12. 14. 16. 18. 20. 22. 24.]
[ 5. 7. 9. 11. 13. 15. 17. 19. 21. 23. 25.]
[ 6. 8. 10. 12. 14. 16. 18. 20. 22. 24. 26.]]
[[ 5. 8. 11. 14. 17. 20. 23. 26. 29. 32. 35.]
[ 7. 10. 13. 16. 19. 22. 25. 28. 31. 34. 37.]
[ 9. 12. 15. 18. 21. 24. 27. 30. 33. 36. 39.]]
I am given this matrix and am trying to write a function to build it for any size n. I am told the height of the matrix is n, but I am not sure about the width.
Below is my code and output, is this correct? I am slightly confused by the notation of the matrix itself.
def buildMatrix(n, a):
    matrix = np.zeros([n, n], dtype=float)
    x_diag, y_diag = np.diag_indices_from(matrix)
    for (x, y) in zip(x_diag, y_diag):
        if x > (n / 2):
            matrix[x][y] = -2*a
        elif x == (n / 2):
            matrix[x][y] = -(1 + a)
        else:
            matrix[x][y] = -2
        if x != n - 1:
            matrix[x + 1][y] = a if x >= (n / 2) else 1
            matrix[x][y + 1] = a if x >= (n / 2) else 1
    return matrix
Output with buildMatrix(5, 2)
[[-2. 1. 0. 0. 0.]
[ 1. -2. 1. 0. 0.]
[ 0. 1. -3. 2. 0.]
[ 0. 0. 2. -4. 2.]
[ 0. 0. 0. 2. -4.]]
Can anyone help me out?
To answer your first question, the matrix has to have a width of n in order for the matrix-vector product to be compatible.
The picture of the matrix is ambiguous on where the switch from -2 to -(1+a) to -2a occurs. In your code, you check x == (n / 2) to set the switch. This is fine in python2 but will cause problems in python3, since n / 2 returns a float there (2.5 for n = 5). The safer check is x == n // 2, since n // 2 returns an integer in python2 as well as python3.
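A quick illustration of the difference:

n = 5
print(n / 2)   # 2.5 in python3 (true division; never equals an integer row index)
print(n // 2)  # 2 in both python2 and python3 (floor division)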
For generality, I'm going to assume that the switch happens at row m. The matrix can be built more easily using slicing and the np.diag command.
def buildmat(n, m, a):
    diag = np.zeros(n)
    offdiag = np.zeros(n-1)
    offdiag[0:m] = 1
    offdiag[m:n-1] = a
    diag[0:m] = -2
    diag[m] = -(1+a)
    diag[m+1:n] = -2*a
    matrix = np.diag(diag) + np.diag(offdiag, 1) + np.diag(offdiag, -1)
    return matrix
Running
buildmat(5, 2, 3)
produces
[[-2. 1. 0. 0. 0.]
[ 1. -2. 1. 0. 0.]
[ 0. 1. -3. 2. 0.]
[ 0. 0. 2. -4. 2.]
[ 0. 0. 0. 2. -4.]]
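As an aside, for large n the same tridiagonal structure can be built without a dense intermediate using scipy.sparse.diags; a sketch, assuming SciPy is available:

import numpy as np
from scipy.sparse import diags

def buildmat_sparse(n, m, a):
    diag = np.zeros(n)
    offdiag = np.zeros(n - 1)
    offdiag[:m] = 1
    offdiag[m:] = a
    diag[:m] = -2
    diag[m] = -(1 + a)
    diag[m+1:] = -2 * a
    # three bands: the main diagonal plus the first super- and sub-diagonals
    return diags([offdiag, diag, offdiag], [-1, 0, 1])

print(buildmat_sparse(5, 2, 3).toarray())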
'car3.csv' file download link
import csv
num = open('car3.csv')
nums = csv.reader(num)
nums_list = []
for i in nums:
    nums_list.append(i)
import numpy as np
nums_arr = np.array(nums_list, dtype = np.float32)
print(nums_arr)
print(np.std(nums_arr, axis=0))
The result is this.
[[ 1. 1. 2.]
[ 1. 1. 2.]
[ 1. 1. 2.]
...,
[ 0. 0. 5.]
[ 0. 0. 5.]
[ 0. 0. 5.]]
[ 0.5 0.5 1.11803401]
There are lots of spaces that I didn't expect. How can I handle these?
That is not a spacing problem; it is just how numpy prints arrays. All you need to do is save the output of the standard deviation; then you can access each value by indexing:
std_arr = np.std(nums_arr, axis=0) # array which holds std of each column
# now, you can access them by indexing:
print(std_arr[0]) # output here is 0.5
print(std_arr[1]) # output here is 0.5
print(std_arr[2]) # output here is 1.118034
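If the goal is just a tidier printout of the whole array, numpy's print options control precision and formatting; a small sketch:

import numpy as np

std_arr = np.array([0.5, 0.5, 1.118034])
np.set_printoptions(precision=3, suppress=True)  # limit digits, avoid scientific notation
print(std_arr)  # [0.5   0.5   1.118]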