multi-variable standard deviation with override for correlation - python

So I have a multi-variable framework where I have 3 variables with different weights and an array of variations such as:
import numpy as np
weights = np.array([-2.61540125, -0.2480875, -0.2737325])
var = np.array([[0.00660683, -0.03470032, -0.02153846],
[0.00458204, -0.02614379, -0.02830189],
[-0.00098619, -0.00671141, 0.0032362],
[0.00175217, -0.02591793, -0.01217039],
[0.00077738, 0.00886918, 0.00821355],
[0.00077677, -0.02197802, 0.00100000]])
I can easily calculate the standard deviation with np.dot:
cov = np.cov(var.T)
standard_deviation = np.sqrt(weights.T.dot(cov).dot(weights))
standard_deviation
Out:
0.0044526680008974574
But I would like to override the correlation matrix, assume a correlation of 1 between all variables, and find the standard deviation under that assumption. Is there a simple matrix operation in numpy that does this? I can do it with a loop, but I feel that is not efficient.
the result should be :
0.015335270585229297
I did this to arrive to it:
def stdev(weights, corr, stdev):
    """Return the standard deviation of a weighted sum of correlated variables.

    Computes ``sqrt(sum_ij w_i * s_i * w_j * s_j * rho_ij)``.  Only the upper
    triangle of ``corr`` (diagonal included) is read: for a pair ``(i, j)``
    with ``i > j`` the coefficient is taken from ``corr[j, i]``.

    Args:
        weights: Sequence of per-variable weights.
        corr: Correlation matrix indexable as ``corr[i, j]``.
        stdev: Per-variable standard deviations, indexable by position and at
            least as long as ``weights``.  (The parameter name shadows this
            function inside its own body; it is kept for caller
            compatibility.)

    Returns:
        The square root of the accumulated weighted variance.
    """
    n = len(weights)
    total = 0.0
    for i in range(n):
        for j in range(n):
            # Look the pair up in the upper triangle regardless of its order.
            rho = corr[min(i, j), max(i, j)]
            total += weights[i] * stdev[i] * weights[j] * stdev[j] * rho
    return total ** 0.5
# Correlation of 1 between every pair of variables: one row and one column
# per variable (not the shape of the observation array).
cor_one = np.ones((len(weights), len(weights)))
# Column-wise sample standard deviations.  Stored under a different name so
# the ``stdev`` function is not overwritten before it is called.
stdevs = var.std(axis=0, ddof=1)
stdev(weights, cor_one, stdevs)

Related

Gaussian fit for Python with two different function

I fit my data with a sum of three Gaussians, but sometimes my curve contains only two Gaussians. In that case the fit cannot find the remaining parameters and raises an error. Is there a way to make the curve_fit call adapt to whether the data contains two or three Gaussians?
In my main function I have this code:
FitGWPS = mainCurveFitGWPS(global_ws, period, All_Max_GWPS, DoupleDip)
and my code for fit is :
import numpy as np
from scipy.optimize import curve_fit
#Functions-----------------------------------------
#Gaussian function
def _1gaus(X,C,X_mean,sigma):
return C*np.exp(-(X-X_mean)**2/(2*sigma**2))
def _3gaus(x, amp1,cen1,sigma1, amp2,cen2,sigma2, amp3,cen3,sigma3):
return amp1*np.exp(-(x-cen1)**2/(2*sigma1**2)) +\
amp2*np.exp(-(x-cen2)**2/(2*sigma2**2)) + amp3*np.exp(-(x-
cen3)**2/(2*sigma3**2))
def ParamFit(Gws, P, Max, popt_Firstgauss):
    """Build initial-guess lists for a multi-Gaussian fit, one entry per peak.

    For each index ``m`` in ``Max`` the guesses are:

    * amplitude: ``0.8 * Gws[m]``
    * centre: ``P[m]``
    * width: ``0.3`` for the first peak, ``0.3 + 2 * popt_Firstgauss[2]``
      for every later peak

    Args:
        Gws: Signal values, indexable by the entries of ``Max``.
        P: Abscissa values, indexable by the entries of ``Max``.
        Max: Indices of the peaks; a list or an array.
        popt_Firstgauss: Parameters of an earlier fit; only element ``[2]``
            is read, and only when there is more than one peak.

    Returns:
        A tuple ``(Amp, wid, cen)`` of lists.  Note the order: widths come
        before centres.
    """
    Amp = []
    cen = []
    wid = []
    # Iterate over the peak indices directly; the earlier ``len(Max-1)`` only
    # worked for arrays (where it equals ``len(Max)``) and raised on lists.
    for j, peak_index in enumerate(Max):
        Amp.append(0.8 * Gws[peak_index])  # Amplitude
        cen.append(P[peak_index])  # Centre
        if j == 0:
            wid.append(0.3)  # Width of the first peak
        else:
            wid.append(0.3 + popt_Firstgauss[2] * 2.)
    return Amp, wid, cen
def _sum_of_gaussians(x, *params):
    """Evaluate a sum of Gaussians given flat ``(amp, cen, sigma)`` triples."""
    total = np.zeros_like(x, dtype=float)
    for amp, cen, sigma in zip(params[0::3], params[1::3], params[2::3]):
        total = total + amp * np.exp(-(x - cen) ** 2 / (2 * sigma ** 2))
    return total


def mainCurveFitGWPS(global_ws_in, period_in, All_Max_GWPS, DoupleDip):
    """Fit a sum of Gaussians to a spectrum, one Gaussian per supplied maximum.

    A single Gaussian is fitted first, starting from the weighted mean and
    spread of the data.  Its fitted width then seeds the width guess of every
    peak after the first.  The final model has as many Gaussians as there are
    entries in ``All_Max_GWPS``, so curves with two maxima are handled the
    same way as curves with three.

    Args:
        global_ws_in: Signal values (array-like of numbers).
        period_in: Abscissa values, same length as ``global_ws_in``.
        All_Max_GWPS: Indices of the maxima to seed the Gaussians with.
        DoupleDip: Not used by this function; kept for caller compatibility.

    Returns:
        The fitted multi-Gaussian curve evaluated at ``period_in``.

    Raises:
        IndexError: If ``All_Max_GWPS`` is empty.
        RuntimeError: Propagated from ``curve_fit`` when a fit does not
            converge.
    """
    ws = np.asarray(global_ws_in, dtype=float)
    period = np.asarray(period_in, dtype=float)
    peaks = np.atleast_1d(np.asarray(All_Max_GWPS, dtype=int))
    if peaks.size == 0:
        raise IndexError("All_Max_GWPS must contain at least one maximum")

    # Moment-based starting point for the single-Gaussian fit.
    mean = np.sum(period * ws) / np.sum(ws)
    sigma = np.sqrt(np.sum(ws * (period - mean) ** 2) / np.sum(ws))
    Cst = 1 / (2 * np.pi * sigma)

    # This fit must run BEFORE the initial guesses are built: the width guess
    # of every peak after the first reads popt_gauss[2].
    popt_gauss, _ = curve_fit(
        _sum_of_gaussians, period, ws, p0=[Cst, mean, sigma]
    )

    # Flat [amp, cen, wid, amp, cen, wid, ...] guess, one triple per maximum.
    p0 = []
    for j, peak_index in enumerate(peaks):
        width = 0.3 if j == 0 else 0.3 + popt_gauss[2] * 2.
        p0.extend([0.8 * ws[peak_index], period[peak_index], width])

    # The length of p0 tells curve_fit how many parameters the variadic
    # model takes.
    popt_multi, _ = curve_fit(
        _sum_of_gaussians, period, ws, p0=p0, maxfev=5000
    )
    return _sum_of_gaussians(period, *popt_multi)
for example picture :
and

How do I calculate standard deviation in python without using numpy?

I'm trying to calculate standard deviation in python without the use of numpy or any external library except for math. I want to get better at writing algorithms and am just doing this as a bit of "homework" as I improve my python skills. My goal is to translate this formula into python but am not getting the correct result.
I'm using an array of speeds where speeds = [86,87,88,86,87,85,86]
When I run:
std_dev = numpy.std(speeds)
print(std_dev)
I get: 0.903507902905. But I don't want to rely on numpy. So...
My implementation is as follows:
import math
speeds = [86,87,88,86,87,85,86]
def get_mean(array):
sum = 0
for i in array:
sum = sum + i
mean = sum/len(array)
return mean
def get_std_dev(array):
# get mu
mean = get_mean(array)
# (x[i] - mu)**2
for i in array:
array = (i - mean) ** 2
return array
sum_sqr_diff = 0
# get sigma
for i in array:
sum_sqr_diff = sum_sqr_diff + i
return sum_sqr_diff
# get mean of squared differences
variance = 1/len(array)
mean_sqr_diff = (variance * sum_sqr_diff)
std_dev = math.sqrt(mean_sqr_diff)
return std_dev
std_dev = get_std_dev(speeds)
print(std_dev)
Now when I run:
std_dev = get_std_dev(speeds)
print(std_dev)
I get: [0] but I am expecting 0.903507902905
What am I missing here?
The problem in your code is the reuse of array and return in the middle of the loop
def get_std_dev(array):
# get mu
mean = get_mean(array) <-- this is 86.4
# (x[i] - mu)**2
for i in array:
array = (i - mean) ** 2 <-- this is almost 0
return array <-- this is the value returned
Now let us look at the algorithm you are using. Note that there are two std deviation formulas that are commonly used. There are various arguments as to which one is correct.
sqrt(sum((x - mean)^2) / n)
or
sqrt(sum((x - mean)^2) / (n -1))
For big values of n, the first formula is used since the -1 is insignificant. The first formula can be reduced to
sqrt(sum(x^2) /n - mean^2)
So how would you do this in python?
def std_dev1(array):
    """Return the population standard deviation via ``sqrt(E[x^2] - mean^2)``.

    Args:
        array: Non-empty sequence of numbers.

    Returns:
        The population (divide-by-n) standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``array`` is empty.
    """
    n = len(array)
    mean = sum(array) / n
    sumsq = sum(v * v for v in array)
    # For nearly constant data, rounding can push the difference slightly
    # below zero; a negative float ** 0.5 would give a complex number.
    variance = max(sumsq / n - mean * mean, 0.0)
    return variance ** 0.5
speeds = [86,87,88,86,87,85,86]
# Calculate the mean of the values in your list
mean_speeds = sum(speeds) / len(speeds)
# Calculate the variance of the values in your list
# This is 1/N * sum((x - mean(X))^2)
var_speeds = sum((x - mean_speeds) ** 2 for x in speeds) / len(speeds)
# Take the square root of variance to get standard deviation
sd_speeds = var_speeds ** 0.5
>>> sd_speeds
0.9035079029052513
There are some problems in the code; one of them is the return statement inside the for loop. You can try this:
def get_mean(array):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(array) / len(array)


def get_std_dev(array):
    """Return the population (divide-by-n) standard deviation of ``array``.

    Requires the ``math`` module to be imported in the surrounding module.

    Raises:
        ZeroDivisionError: If ``array`` is empty.
    """
    n = len(array)
    mean = get_mean(array)
    # Sum the squared deviations directly; no temporary list is needed.
    sum_of_squares = sum((item - mean) ** 2 for item in array)
    return math.sqrt(sum_of_squares / n)
If you don't want to use numpy, that's OK; give the statistics package from the Python standard library a try:
import statistics
st_dev = statistics.pstdev(speeds)
print(st_dev)
Or, if you still want a custom solution, I recommend the following approach, which uses a generator expression instead of your more complex, buggy code:
import math
mean = sum(speeds) / len(speeds)
var = sum((l-mean)**2 for l in speeds) / len(speeds)
st_dev = math.sqrt(var)
print(st_dev)
This. You need to get rid of return inside for loops.
def get_std_dev(array):
    """Return the population (divide-by-n) standard deviation of ``array``.

    Relies on the ``get_mean`` helper and the ``math`` module being available
    in the surrounding module.
    """
    # get mu
    mean = get_mean(array)
    # get sigma: accumulate the squared deviations inside the loop, with no
    # early ``return``, so that every element contributes
    sum_sqr_diff = 0
    for i in array:
        sum_sqr_diff = sum_sqr_diff + (i - mean) ** 2
    # get mean of squared differences (this is the variance)
    mean_sqr_diff = sum_sqr_diff / len(array)
    std_dev = math.sqrt(mean_sqr_diff)
    return std_dev

Python Numpy Cov confusion

I am trying to calculate the covariance matrix of two vectors a and b, for which I am using numpy's cov implementation R = np.cov(a,b). I got a little confused when I noticed that np.cov(a,b)[0,0] != np.var(a); however, I was able to find out that this has to do with biased vs. unbiased estimators and is controlled by ddof.
However, that isn't the end of it. Why is R[0,1] != R[0,0]**0.5 * R[1,1]**0.5. Following my understanding and the definition of the covariance matrix on wikipedia https://en.wikipedia.org/wiki/Covariance_matrix
R[0,1] = R[1,0] = std(a) * std(b)
R[0,1] = R[1,0] = var(a)**0.5 * var(b)**0.5
R[0,1] = R[1,0] = R[0,0]**0.5 * R[1,1]**0.5
Where am i mistaken?
import numpy as np
rng = np.random.default_rng(seed=43)
a = rng.random((1,3))
b = rng.random((1,3))
R = np.cov(a,b,ddof=1)
print(R)
print('var a: ' + str(np.var(a,ddof=1)))
print('var b: ' + str(np.var(b,ddof=1)))
print('cov a,b: ' + str(np.var(a,ddof=1)**0.5*np.var(b,ddof=1)**0.5))
print('cov a,b: ' + str(R[0,0]**0.5*R[1,1]**0.5))
print('cov a,b: ' + str(np.std(a,ddof=1)*np.std(b,ddof=1)))
I apologize in advance for any spelling or Stack Overflow etiquette mistakes on my part. Any help is appreciated.
I'm not sure where the formula var(a)**0.5 * var(b)**0.5 is coming from, but it's not the formula I've seen for cross-covariance. I've seen this as the expected value of the products of x - mean_of_x and y - mean_of_y.
In a loop style (for clarity) this might look like:
a_mean = np.mean(a)
b_mean = np.mean(b)
s = 0
n = len(a[0])
for i, _ in enumerate(a[0]):
s += (a[0][i] - a_mean) * (b[0][i] - b_mean)
s / (n-1)
# 0.09175517729176114
In Numpy you can also do:
a_mean = np.mean(a)
b_mean = np.mean(b)
# Matrix product of the centred row vectors, divided by n - 1.
# The operator must be ``@``; a ``#`` here would comment out the product.
(a - a_mean) @ (b - b_mean).T / (n-1)
# array([[0.09175518]])
This corresponds to the values you get in the corners.
If you want to divide by n rather than n-1, you can pass in the bias arg to cov()
np.cov(a, b, bias=True)
# array([[0.08562558, 0.06117012],
[0.06117012, 0.06361328]])
The corners here are the results you will get with the above code by dividing the results by 3 (n) rather than 2 (n-1)

How to calculate average values of list elements in python?

I'd like to calculate average of each values in a list. To do so, I wrote a function which gets list as parameter and calculate the average and returns the list of average again.
Here is the signal:
import random

# Background noise: 1000 samples drawn uniformly from [0, 10].
random_data = [10 * random.uniform(0, 1) for _ in range(1000)]
# 50 peaks drawn uniformly from [0, 100], padded with zeros to 1000 samples
# and shuffled so the peaks land at random positions.
random_peak = [100 * random.uniform(0, 1) for _ in range(50)] + [0] * 950
random.shuffle(random_peak)
# The signal is the element-wise sum of the peaks and the noise.
signal = [peak + noise for peak, noise in zip(random_peak, random_data)]
And now, I'd like to calculate m as following.
'''
m1 = 1/(number of signal) * x1
m2 = 1/(number of signal) * (x1+x2)
m3 = 1/(number of signal) * (x1+x2+x3)
...
'''
I wrote a following function to calculate m. How would I change the function to return list of m s?
def mean_values(s):
for i in range(len(s)):
m[i] = 1/len(s)*s[i]
return m[i]
mean_values(signal)
#mean_values(np.array(signal)
Use m as a float instead of a list; it makes more sense.
to get mean
s = 1 / n * Σxi
you can use this to get new mean from a previous one
s' = s + (x1 - s) / n1
where s is the latest mean, x1 is the new value, and n1 is the new length.
However, numpy has a prebuilt function, np.mean(), which does this and also accepts Python lists.

Calculating Matthews correlation coefficient for a matrix takes too long

I would like to calculate the Matthews correlation coefficient (MCC) for two matrices A and B: looping over the columns of A, I calculate the MCC between that column and each of the 2000 rows of matrix B, then take the index of the maximum. The code is:
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef as mcc
A = pd.read_csv('A.csv', squeeze=True)
B = pd.read_csv('B.csv', squeeze=True)
ind = {}
for col in A:
ind[col] = np.argmax(list(mcc(B.iloc[i], A[col]) for i in range(2000)))
print(ind[col])
My problem is that it takes a really long time (one second for each column). I saw almost the same code in R running much faster (about 5 seconds in total). How can this be? Can I improve my Python code?
R Code:
A <- as.matrix(read.csv(file='A.csv'))
B <- t(as.matrix(read.csv(file='B.csv', check.names = FALSE)))
library('mccr')
C <- rep(NA, ncol(A))
for (query in 1:ncol(A)) {
mcc <- sapply(1:ncol(B), function(i)
mccr(A[, query], B[, i]))
C[query] <- which.max(mcc)
}
Maybe try this using numpy and dot products in python
def compute_mcc(true_labels, pred_labels):
    """Compute Matthews correlation coefficients for all pairs of columns.

    Both inputs hold binary (0/1) labels.  Each column is one label vector
    and the rows are the shared observations; the confusion counts are
    obtained by matrix products over the row axis.

    :param true_labels: 2D integer array (features x samples)
    :param pred_labels: 2D integer array (features x samples)
    :return: mcc (samples1 x samples2); entry ``[i, j]`` compares column
        ``i`` of ``true_labels`` with column ``j`` of ``pred_labels``.
        Pairs whose denominator is zero are reported as 0.
    """
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)

    # Indicator matrices in float64: the counts stay exact (they are far
    # below 2**53) and the four-factor product in the denominator cannot
    # overflow the way fixed-width integers do for large inputs.
    pred_pos = (pred_labels == 1).astype(np.float64)
    pred_neg = (pred_labels == 0).astype(np.float64)
    true_pos = (true_labels == 1).astype(np.float64)
    true_neg = (true_labels == 0).astype(np.float64)

    # Confusion counts for every (true column, pred column) pair.
    TP = true_pos.T @ pred_pos
    TN = true_neg.T @ pred_neg
    FP = true_neg.T @ pred_pos
    FN = true_pos.T @ pred_neg

    numerator = (TP * TN) - (FP * FN)
    denom = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    # Avoid dividing by 0: the numerator is 0 there as well, giving an MCC of 0.
    denom = np.where(denom == 0, 1.0, denom)
    return numerator / denom

Categories