Mean average precision - python

I am calculating mean average precision for the top-k retrieved objects. Here is my code. At this stage, I am computing R@K (recall at the top-k results).
The code reads two lists from CSV files, takes a sample from one list, computes the Euclidean distance to all samples from the other list, sorts them, and finally takes the top-k objects to check whether the query object appears among the retrieved samples.
import csv
from scipy.spatial import distance

def parse_features_from_csv(csv_file):
    feat_lst = []
    id_lst = []
    row_lst = []
    with open(csv_file) as fr:
        reader = csv.reader(fr, delimiter=',')
        for row in reader:
            s_feat = [float(i) for i in row[:-1]]  # feature vector
            identifier = row[-1]                   # object id in the last column
            feat_lst.append(s_feat)
            id_lst.append(identifier)
            row_lst.append(row[-1])
    return feat_lst, id_lst, row_lst

def compute_distances(et_item, feat_lst, id_lst):
    dist_list = []
    for id_img_item, img_item in enumerate(feat_lst):
        dist = distance.euclidean(img_item, et_item)
        dist_list.append((id_lst[id_img_item], dist))
    return dist_list

def main():
    top_k = 10
    feat_file = "list_1.csv"
    test_file = "list_2.csv"
    et_feat_lst, et_id_list, _ = parse_features_from_csv(test_file)
    feat_list, id_list, row_lst_et = parse_features_from_csv(feat_file)
    print(len(feat_list))
    print(len(et_feat_lst))
    correct = 0
    for id_et_item, et_item in enumerate(et_feat_lst):
        distances = compute_distances(et_item, feat_list, row_lst_et)
        sort_dst = sorted(distances, key=lambda x: x[1])
        eucl_dist = sort_dst[:top_k]  # top-k nearest neighbours
        gt = et_id_list[id_et_item]
        for tar, _dist in eucl_dist:
            if gt == tar:
                correct += 1
                break
    # id_et_item + 1 is the total number of queries (enumerate is zero-based)
    print("correct", str(correct) + '/' + str(id_et_item + 1))

if __name__ == '__main__':
    main()
Can someone tell me how I can use the sklearn.metrics.average_precision_score
function to compute mean average precision for the top-k retrieved objects?
I am confused about (y_true, y_scores). I would appreciate it if someone could explain these two parameters of the function.
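For what it's worth, a minimal sketch of how (y_true, y_scores) map onto this retrieval setup; the labels and distances below are made up purely for illustration:

import numpy as np
from sklearn.metrics import average_precision_score

# For one query, after sorting by distance and taking the top k:
#   y_true  - 1 where the retrieved id matches the ground truth, else 0
#   y_score - a relevance score per item; the negated distance works,
#             because average_precision_score treats higher scores as better
y_true = np.array([1, 0, 1, 0, 0])
y_score = -np.array([0.2, 0.3, 0.5, 0.7, 0.9])  # negated sorted distances

ap = average_precision_score(y_true, y_score)
# Mean average precision is then the mean of ap over all queries.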

Related

Building a weighted histogram using two binary files

I have two binary files that I need to iterate through simultaneously so that the value yielded in one file corresponds correctly (same location) to the value yielded in the other. I'm sorting values into histogram bins and the value from one file corresponds to the weight of the value from the other file.
I tried the following syntax:
import math  # needed for math.ceil / math.sqrt below
import struct
import numpy as np
import matplotlib.pyplot as plt

low = np.inf
high = -np.inf

struct_fmt = 'f'
struct_len = struct.calcsize(struct_fmt)
struct_unpack = struct.Struct(struct_fmt).unpack_from

file = "/projects/current/real-core-snaps/core4_256_velx_0009.bin"
file2 = "/projects/current/real-core-snaps/core4_256_dens_0009.bin"

def read_chunks(f, length):
    while True:
        data = f.read(length)
        if not data:
            break
        yield data

# First pass: find the value range and the number of samples
loop = 0
with open(file, "rb") as f:
    for chunk in read_chunks(f, struct_len):
        x = struct_unpack(chunk)
        low = np.minimum(x, low)
        high = np.maximum(x, high)
        loop += 1
nbins = math.ceil(math.sqrt(loop))

bin_edges = np.linspace(low, high, nbins + 1)
total = np.zeros(nbins, np.int64)

# Second pass: histogram file1's values weighted by file2's values
f = open(file, "rb")
f2 = open(file2, "rb")
for chunk1, chunk2 in zip(read_chunks(f, struct_len), read_chunks(f2, struct_len)):
    subtotal, e = np.histogram(struct_unpack(chunk1), bins=bin_edges,
                               weights=struct_unpack(chunk2))
    total = np.add(total, subtotal, out=total, casting="unsafe")

plt.hist(bin_edges[:-1], bins=bin_edges, weights=total)
plt.savefig('hist-veldens.svg')
but the histogram produced is ridiculous (the plot was attached to the original post). What am I doing wrong?
The data files are located at https://drive.google.com/file/d/1fhia2CGzl_aRX9Q9Ng61W-4XJGQe1OCV/view?usp=sharing and https://drive.google.com/file/d/1CrhQjyG2axSFgK9LGytELbxjy3Ndon1S/view?usp=sharing.
The mistake is that total = np.zeros(nbins, np.int64) assigns an integer type to each element of the array total. Given that subtotal holds weighted (floating-point) sums rather than integer counts, total should also be of a float type.
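In code, the one-line fix, keeping everything else as posted:

total = np.zeros(nbins, np.float64)  # accumulate float weights, not integers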

How to find the best line fit in Python (Banister impulse model)

I have this formula that is used to predict athletic performance based on daily stress.
It is based on five constants unique to each person. I'm trying to find these from daily stress and the performance testing that has been done. I'm new to programming and I don't know where to start.
The formula (written out from the description) is:
Performance(t) = k1 * Fitness(t) - k2 * Fatigue(t) + p0
Fitness(t) = Fitness(t-1) * exp(-1/t1) + stress(t)
Fatigue(t) = Fatigue(t-1) * exp(-1/t2) + stress(t)
(A sample of the data was attached to the original post.)
thank you
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit

data = pd.read_csv('data_mod1.csv')
TSS = data['stress'].fillna(0)
arr = np.array(TSS)

a = [arr[0]]
b = [arr[0]]
x = arr[1:]

def Banister(x, t1, t2, k1, k2, c):
    for v in x:
        a.append(a[-1] * np.exp(-1/t1) + v)
        b.append(b[-1] * np.exp(-1/t2) + v)
    data['fit'] = pd.Series(a)
    data['fat'] = pd.Series(b)
    data['perf'] = ((data['fit'] * k1) - (data['fat'] * k2)) + c
    return data['perf']

fit = curve_fit(Banister, arr, data[data.index], p0=[20, 10, 1, 2, 50])
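Since this question is left without an answer above, here is a sketch of one way the model could be restructured so curve_fit can drive it. The 'performance' column name and the starting values are assumptions, not from the post:

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

data = pd.read_csv('data_mod1.csv')            # file name from the post
stress = data['stress'].fillna(0).to_numpy()
perf_meas = data['performance'].to_numpy()     # hypothetical column name

def banister(stress, t1, t2, k1, k2, p0):
    # Rebuild fitness/fatigue on every call: curve_fit evaluates the model
    # many times, so appending to module-level lists (as in the original
    # code) would mix values from different iterations together.
    fit = np.zeros_like(stress, dtype=float)
    fat = np.zeros_like(stress, dtype=float)
    fit[0] = fat[0] = stress[0]
    for i in range(1, len(stress)):
        fit[i] = fit[i-1] * np.exp(-1.0 / t1) + stress[i]
        fat[i] = fat[i-1] * np.exp(-1.0 / t2) + stress[i]
    return k1 * fit - k2 * fat + p0

params, cov = curve_fit(banister, stress, perf_meas, p0=[20, 10, 1, 2, 50])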

Creating a vector of values based on a test using a for loop

This feels like it should be a simple problem, but I am new to Python; in R I would use a foreach loop, which gives me an option to combine the results.
I have tried a for loop that lets me print out all the values I need, but I want them collected into a vector of values that I can use later.
import scipy.stats as stats
import numpy as np
import random

data2 = np.random.gamma(1, 2, size=500)
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean / sd)**2
b = (sd**2) / mean

for i in range(1, 100):
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    print(s)
So I am able to print all the values I want, but I want them gathered together in a vector of values. I have tried to append and make lists but am not able to get them together.
import scipy.stats as stats
import numpy as np
import random

data2 = np.random.gamma(1, 2, size=500)  # data2 as defined in the question
gammT = np.log(data2 + 1)
mean = np.mean(gammT)
sd = np.std(gammT)
a = (mean / sd)**2
b = (sd**2) / mean

# initialize empty list
result = []
for i in range(100):
    # removed (1,100): you only need range(100) for 100 elements
    gammT = random.sample(list(gammT), 500)
    gamm = np.random.gamma(a, b, size=len(gammT))
    s = stats.anderson_ksamp([gammT, gamm])
    s = s[2]
    # append calculation to list
    result.append(s)
    print(s)
print(result)
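If the collected values are needed as a NumPy vector rather than a Python list (an assumption about the intended use), the list converts directly:

result = np.array(result)  # vector of the 100 significance levels from anderson_ksamp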

Python Code - SVD with numpy

I would like to get some help with my Python code. I am a novice at Python.
At a high level: I read a (.png) file from the command line, put it into an array, compute the SVD, keep the top rank components given on the command line, multiply back against the original array, and then finally write out the file and the array.
My issue: the generated file is distorted and does not look like the real picture I intended to generate.
My question: I have put in the snippet of code I am using; can you please point out what I am doing incorrectly?
import sys
import os
import numpy
import numpy.linalg
import scipy.misc

def getOutputPngName(path, rank):
    filename, ext = os.path.splitext(path)
    return filename + '.' + str(rank) + '.png'

def getOutputNpyName(path, rank):
    filename, ext = os.path.splitext(path)
    return filename + '.' + str(rank) + '.npy'

if len(sys.argv) < 3:
    sys.exit('usage: task1.py <PNG inputFile> <rank>')

inputfile = sys.argv[1]
rank = int(sys.argv[2])
outputpng = getOutputPngName(inputfile, rank)
outputnpy = getOutputNpyName(inputfile, rank)

# Import pic.png into array img as command parameter
img = scipy.misc.imread(inputfile)

# Perform SVD on img and obtain individual matrices
P, D, Q = numpy.linalg.svd(img, full_matrices=False)

# Compute overall SVD matrix based on individual matrices
svd_decomp = numpy.dot(numpy.dot(P, numpy.diag(D)), Q)

# Keep top entries in svd_decomp
initial = svd_decomp.argsort()
temp = numpy.array(initial)
svd_final = numpy.argpartition(temp, -rank)[-rank:]

# Multiply to obtain the best rank-k approximation of the original array
img = numpy.transpose(img)
final = numpy.dot(svd_final, img)

# Save the approximated array as a binary array file (1) and as a PNG file (2)
numpy.save(outputnpy, final)
scipy.misc.imsave(outputpng, final)
The biggest issue is the svd_decomp.argsort(). Called without arguments, argsort() flattens the whole matrix and sorts it like that; that is not what you want to do.
In fact, you don't need to do any sorting at all, because linalg's svd() function does it for you. From the documentation:
The singular values for every matrix, sorted in descending order.
So you just have to do the following
import sys
import os
import numpy
import numpy.linalg
import scipy.misc

def getOutputPngName(path, rank):
    filename, ext = os.path.splitext(path)
    return filename + '.' + str(rank) + '.png'

def getOutputNpyName(path, rank):
    filename, ext = os.path.splitext(path)
    return filename + '.' + str(rank) + '.npy'

if len(sys.argv) < 3:
    sys.exit('usage: task1.py <PNG inputFile> <rank>')

inputfile = sys.argv[1]
rank = int(sys.argv[2])
outputpng = getOutputPngName(inputfile, rank)
outputnpy = getOutputNpyName(inputfile, rank)

# Import pic.png into an array as a command parameter
img = scipy.misc.imread(inputfile)

# Perform SVD on the image and obtain individual matrices
P, D, Q = numpy.linalg.svd(img, full_matrices=True)

# Select the top "rank" singular values
svd_decomp = numpy.matrix(P[:, :rank]) * numpy.diag(D[:rank]) * numpy.matrix(Q[:rank, :])

# Save the output
numpy.save(outputnpy, svd_decomp)
scipy.misc.imsave(outputpng, svd_decomp)
Notice that all we do is select "rank" singular values, no need to sort.
Example outputs (images attached in the original answer): the base image, and the rank = 1 and rank = 10 reconstructions.
No need to sort. Just accumulate your matrix up to the given rank:
svd_decomp = numpy.zeros((len(P), len(Q)))
for i in range(rank):
    svd_decomp += D[i] * numpy.outer(P.T[i], Q[i])
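A side note, not from the original answers: scipy.misc.imread and scipy.misc.imsave were removed from newer SciPy releases, so on a current install the image I/O can be done with imageio instead (assuming it is available):

import imageio

img = imageio.imread(inputfile)  # replaces scipy.misc.imread
# ... SVD reconstruction as above ...
# clipping/casting to uint8 avoids dtype conversion surprises when writing
imageio.imwrite(outputpng, numpy.clip(svd_decomp, 0, 255).astype(numpy.uint8))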

Python program uses too much memory

Function coinT() tests whether two time series are stationary using the ADF test and the Hurst exponent. The time series are stored in 1511x6 CSV files, but for testing only a vector of the 5th column is returned by the function stock(). There are 50 files in total. The program seems to use too much memory, as it makes the PC crash after running for ~30 seconds. It works fine on 15 files, but crashes on larger sets (>50).
Can somebody please help me find out what's using so much memory? I've tried splitting the computations into multiple functions and deleting the objects, but it didn't help much.
import os
import csv
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as ts
from numpy import log, polyfit, sqrt, std, subtract
from pandas.stats.api import ols  # note: pandas.stats was removed in later pandas versions

src = 'C:/Users/PC/Desktop/Magistr/Ibpython/testing/'
filenames = next(os.walk(src))[2]  # load all stock file names into array
cointegratedPairs = []

def hurst(ts):
    """Returns the Hurst exponent of the time series vector ts
    H<0.5 - the time series is mean reverting
    H=0.5 - the time series is a geometric Brownian motion
    H>0.5 - the time series is trending"""
    # Create the range of lag values
    lags = range(2, 100)
    # Calculate the array of the variances of the lagged differences
    tau = [sqrt(std(subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    # Use a linear fit to estimate the Hurst exponent
    poly = polyfit(log(lags), log(tau), 1)
    del lags
    del tau
    # Return the Hurst exponent from the polyfit output
    return poly[0] * 2.0

# Convert file into an array
def stock(filename):
    # read file into array and get its length
    delimiter = ","
    with open(src + filename, 'r') as dest_f:
        data_iter = csv.reader(dest_f, delimiter=delimiter, quotechar='"')
        data = [data for data in data_iter]
    data_array = np.asarray(data)[:, 5]
    return data_array
    # unreachable: these statements come after the return
    del data
    del data_array

# Check if two time series are cointegrated
def coinTest(itemX, itemY):
    indVar = map(float, stock(itemX)[0:1000])  # 2009.05.22 - 2013.05.14
    depVar = map(float, stock(itemY)[0:1000])  # 2009.05.22 - 2013.05.14
    # Calculate optimal hedge ratio "beta"
    df = pd.DataFrame()
    df[itemX] = indVar
    df[itemY] = depVar
    res = ols(y=df[itemY], x=df[itemX])
    beta_hr = res.beta.x
    alpha = res.beta.intercept
    df["res"] = df[itemY] - beta_hr * df[itemX] - alpha
    # Calculate the CADF test on the residuals
    cadf = ts.adfuller(df["res"])
    # Reject the null hypothesis at the 1% confidence level
    if cadf[4]['1%'] > cadf[0]:
        # Hurst exponent test if residuals are mean reverting
        if hurst(df["res"]) < 0.4:
            cointegratedPairs.append((itemY, itemX))
    del indVar
    del depVar
    del df[itemX]
    del df[itemY]
    del df["res"]
    del cadf

# Main function
def coinT():
    limit = 0
    TotalPairs = 0
    for itemX in filenames:
        for itemY in filenames[limit:]:
            TotalPairs += 1
            if itemX == itemY:
                continue
            else:
                coinTest(itemX, itemY)
        limit += 1
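This question has no answer above; one plausible contributor to the memory use is that stock() materializes every row of every CSV as Python string lists, and each coinTest() call re-parses both files. A sketch of a leaner, cached reader (assuming the 6th column is numeric and the files have no header row; numpy.loadtxt's usecols does the column selection):

import numpy as np

_col_cache = {}

def stock(filename):
    # Parse only the 6th column, as floats, once per file.
    if filename not in _col_cache:
        _col_cache[filename] = np.loadtxt(src + filename, delimiter=',',
                                          usecols=[5], dtype=float)
    return _col_cache[filename]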
