Channel Attribution (Markov Chain Model) in Python

How can I do channel attribution (Markov chain model) in Python, like the 'ChannelAttribution' package in R?

I wrote Python code for a multi-channel attribution model: https://www.linkedin.com/pulse/multi-channel-attribution-model-python-sheranga-gamwasam/
Here is the dataset link: https://docs.google.com/spreadsheets/d/11pa-eQDHEX63uSEA4eWiDTOZ7lbO6Vwt-dHkuhuhbSo/edit?usp=sharing
import time
import collections
import itertools
from itertools import chain
import statistics

import pandas as pd
import numpy as np
from scipy import stats


def unique(list1):
    # Unique elements, preserving first-seen order.
    unique_list = []
    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
    return unique_list


def split_fun(path):
    return path.split('>')


def calculate_rank(vector):
    # Map every distinct value to its rank in sorted order.
    a = {}
    rank = 0
    for num in sorted(vector):
        if num not in a:
            a[num] = rank
            rank = rank + 1
    return [a[i] for i in vector]


def transition_matrix_func(import_data):
    # Build the channel-to-channel transition matrix from the observed paths.
    z_import_data = import_data.copy()
    z_import_data['path1'] = 'start>' + z_import_data['path']
    z_import_data['path2'] = z_import_data['path1'] + '>convert'
    z_import_data['pair'] = z_import_data['path2'].apply(split_fun)
    zlist = z_import_data['pair'].tolist()
    zlist = list(chain.from_iterable(zlist))
    zlist = list(map(str.strip, zlist))
    T = calculate_rank(zlist)
    M = [[0] * len(unique(zlist)) for _ in range(len(unique(zlist)))]
    for (i, j) in zip(T, T[1:]):
        M[i][j] += 1
    x_df = pd.DataFrame(M)
    np.fill_diagonal(x_df.values, 0)
    # Row-normalise the counts into transition probabilities.
    x_df = pd.DataFrame(x_df.values / x_df.values.sum(axis=1)[:, None])
    x_df.columns = sorted(unique(zlist))
    x_df['index'] = sorted(unique(zlist))
    x_df.set_index("index", inplace=True)
    # 'convert' is absorbing; the zip() above created spurious convert->start
    # pairs across path boundaries, so zero out that row.
    x_df.loc['convert', :] = 0
    return x_df


def simulation(trans, n):
    # Random walk through the transition matrix until 'convert' (or n steps).
    sim = [''] * n
    sim[0] = 'start'
    i = 1
    while i < n:
        sim[i] = np.random.choice(trans.columns, 1, p=trans.loc[sim[i - 1], :])[0]
        if sim[i] == 'convert':
            break
        i = i + 1
    return sim[0:i + 1]


def markov_chain(data_set, no_iteration=10, no_of_simulation=10000, alpha=5):
    import_dataset_v1 = data_set.copy()
    # Expand each path to one row per conversion, then re-aggregate.
    import_dataset_v1 = (import_dataset_v1.reindex(import_dataset_v1.index.repeat(import_dataset_v1.conversions))).reset_index()
    import_dataset_v1['conversions'] = 1
    import_dataset_v1 = import_dataset_v1[['path', 'conversions']]
    import_dataset = (import_dataset_v1.groupby(['path']).sum()).reset_index()
    import_dataset['probability'] = import_dataset['conversions'] / import_dataset['conversions'].sum()
    final = pd.DataFrame()
    for k in range(0, no_iteration):
        start = time.time()
        # Bootstrap-resample the paths according to their observed probabilities.
        import_data = pd.DataFrame({'path': np.random.choice(import_dataset['path'], size=import_dataset['conversions'].sum(), p=import_dataset['probability'], replace=True)})
        import_data['conversions'] = 1
        tr_matrix = transition_matrix_func(import_data)
        channel_only = list(filter(lambda k0: k0 not in ['start', 'convert'], tr_matrix.columns))
        ga_ex = pd.DataFrame()
        tr_mat = tr_matrix.copy()
        p = []
        i = 0
        while i < no_of_simulation:
            p.append(unique(simulation(tr_mat, 1000)))
            i = i + 1
        path = list(itertools.chain.from_iterable(p))
        counter = collections.Counter(path)
        df = pd.DataFrame({'path': list(counter.keys()), 'count': list(counter.values())})
        df = df[['path', 'count']]
        ga_ex = pd.concat([ga_ex, df], ignore_index=True)
        df1 = (pd.DataFrame(ga_ex.groupby(['path'])[['count']].sum())).reset_index()
        df1['removal_effects'] = df1['count'] / len(path)
        # df1['removal_effects'] = df1['count'] / sum(df1['count'][df1['path'] == 'convert'])
        df1 = df1[df1['path'].isin(channel_only)]
        df1['ass_conversion'] = df1['removal_effects'] / sum(df1['removal_effects'])
        df1['ass_conversion'] = df1['ass_conversion'] * sum(import_dataset['conversions'])
        final = pd.concat([final, df1], ignore_index=True)
        end = time.time()
        t1 = (end - start)
        print(t1)
    # One-sided one-sample t-test per channel: H0: u = 0 vs H1: u > 0
    unique_channel = unique(final['path'])
    # final = (pd.DataFrame(final.groupby(['path'])[['ass_conversion']].mean())).reset_index()
    final_df = pd.DataFrame()
    for i in range(0, len(unique_channel)):
        x = (final['ass_conversion'][final['path'] == unique_channel[i]]).values
        final_df.loc[i, 0] = unique_channel[i]
        final_df.loc[i, 1] = x.mean()
        v = stats.ttest_1samp(x, 0)
        final_df.loc[i, 2] = v[1] / 2  # halve the two-sided p-value
        if v[1] / 2 <= alpha / 100:
            final_df.loc[i, 3] = str(100 - alpha) + '% statistically significant'
        else:
            final_df.loc[i, 3] = str(100 - alpha) + '% not statistically significant'
        final_df.loc[i, 4] = len(x)
        final_df.loc[i, 5] = statistics.stdev(x)
        final_df.loc[i, 6] = v[0]
    final_df.columns = ['channel', 'ass_conversion', 'p_value', 'confidence_status', 'frequency', 'standard_deviation', 't_statistics']
    final_df['ass_conversion'] = sum(import_dataset['conversions']) * final_df['ass_conversion'] / sum(final_df['ass_conversion'])
    return final_df, final


import_dataset = pd.read_csv('channel attribution example.csv')
data, dataset = markov_chain(import_dataset, no_iteration=10, no_of_simulation=10000, alpha=5)
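For a quick smoke test, here is a minimal usage sketch with a toy in-memory dataset (the paths and counts below are invented; the 'path' and 'conversions' columns are the shape markov_chain expects):

toy = pd.DataFrame({
    'path': ['facebook > organic', 'organic', 'facebook > paid > organic', 'paid'],
    'conversions': [2, 5, 1, 3],
})
# Returns the per-channel attribution table and the raw per-iteration results.
final_df, raw_iterations = markov_chain(toy, no_iteration=5, no_of_simulation=1000, alpha=5)
print(final_df[['channel', 'ass_conversion', 'p_value']])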

Related

What is the difference between the BLEU score and the average sentence BLEU score?

I'm having a hard time finding the BLEU score for my seq2seq model for the task of question generation. My questions are the following:
If I use sentence BLEU to find the score between each reference and the output, and then divide the total of these sentence-BLEU scores by the length of the test data, will it be the same as the corpus BLEU?
And is the corpus BLEU as implemented in the code below the same as the NLTK corpus BLEU?
import sys
import codecs
import os
import math
import operator
import functools


def fetch_data(cand, ref):
    """Read the candidate file and one or more reference files."""
    references = []
    if '.eng' in ref:
        reference_file = codecs.open(ref, 'r', 'utf-8')
        references.append(reference_file.readlines())
    else:
        for root, dirs, files in os.walk(ref):
            for f in files:
                reference_file = codecs.open(os.path.join(root, f), 'r', 'utf-8')
                references.append(reference_file.readlines())
    candidate_file = codecs.open(cand, 'r', 'utf-8')
    candidate = candidate_file.readlines()
    return candidate, references


def count_ngram(candidate, references, n):
    clipped_count = 0
    count = 0
    r = 0
    c = 0
    for si in range(len(candidate)):
        # Calculate precision for each sentence
        ref_counts = []
        ref_lengths = []
        # Build dictionary of ngram counts
        for reference in references:
            ref_sentence = reference[si]
            ngram_d = {}
            words = ref_sentence.strip().split()
            ref_lengths.append(len(words))
            limits = len(words) - n + 1
            # loop through the sentence considering the ngram length
            for i in range(limits):
                ngram = ' '.join(words[i:i + n]).lower()
                if ngram in ngram_d.keys():
                    ngram_d[ngram] += 1
                else:
                    ngram_d[ngram] = 1
            ref_counts.append(ngram_d)
        # candidate
        cand_sentence = candidate[si]
        cand_dict = {}
        words = cand_sentence.strip().split()
        limits = len(words) - n + 1
        for i in range(0, limits):
            ngram = ' '.join(words[i:i + n]).lower()
            if ngram in cand_dict:
                cand_dict[ngram] += 1
            else:
                cand_dict[ngram] = 1
        clipped_count += clip_count(cand_dict, ref_counts)
        count += limits
        r += best_length_match(ref_lengths, len(words))
        c += len(words)
    if clipped_count == 0:
        pr = 0
    else:
        pr = float(clipped_count) / count
    bp = brevity_penalty(c, r)
    return pr, bp


def clip_count(cand_d, ref_ds):
    """Count the clip count for each ngram considering all references"""
    count = 0
    for m in cand_d.keys():
        m_w = cand_d[m]
        m_max = 0
        for ref in ref_ds:
            if m in ref:
                m_max = max(m_max, ref[m])
        m_w = min(m_w, m_max)
        count += m_w
    return count


def best_length_match(ref_l, cand_l):
    """Find the closest length of reference to that of candidate"""
    least_diff = abs(cand_l - ref_l[0])
    best = ref_l[0]
    for ref in ref_l:
        if abs(cand_l - ref) < least_diff:
            least_diff = abs(cand_l - ref)
            best = ref
    return best


def brevity_penalty(c, r):
    if c > r:
        bp = 1
    else:
        bp = math.exp(1 - (float(r) / c))
    return bp


def geometric_mean(precisions):
    return (functools.reduce(operator.mul, precisions)) ** (1.0 / len(precisions))


def BLEU(candidate, references):
    precisions = []
    for i in range(4):
        pr, bp = count_ngram(candidate, references, i + 1)
        precisions.append(pr)
    bleu = geometric_mean(precisions) * bp
    return bleu


if __name__ == "__main__":
    candidate, references = fetch_data(sys.argv[1], sys.argv[2])
    bleu = BLEU(candidate, references)
    print(bleu)
I'm not sure about the implementation you show, but for implementations strictly following the original paper, such as NLTK's, it would not be the same: https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py#L123.
Using sentence BLEU basically means calling corpus BLEU with a one-sentence corpus, but the other way around doesn't work: the scores should not be drastically different, but they do differ, because averaging per-sentence scores is a macro-average while corpus BLEU pools the n-gram counts over the whole corpus (a micro-average).
I have used BLEU for seq2seq evaluation before, just with sentence BLEU, and it worked fine.
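To make the macro- vs micro-average point concrete, here is a small sketch with NLTK (toy sentences; the smoothing is only there so that short sentences don't zero out the higher-order n-gram precisions):

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

# One list of references per hypothesis; every sentence is a token list.
refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],
        [['there', 'is', 'a', 'dog', 'in', 'the', 'garden']]]
hyps = [['the', 'cat', 'sat', 'on', 'mat'],
        ['a', 'dog', 'is', 'in', 'the', 'garden']]

smooth = SmoothingFunction().method1
avg_sentence = sum(sentence_bleu(r, h, smoothing_function=smooth)
                   for r, h in zip(refs, hyps)) / len(hyps)   # macro-average
corpus = corpus_bleu(refs, hyps, smoothing_function=smooth)   # micro-average
print(avg_sentence, corpus)  # similar, but in general not equal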

How to seed the random generator in a loop to obtain the same result every time?

I need to get the same results on every run. I tried to use random.seed() in my script, but it doesn't work. How can I fix it?
My script:
import random
import statistics
import numpy as np
import pandas as pd

# sick and healthy are DataFrames defined elsewhere in the script
Oddr = []
Oddr_fem = []
Oddr_mal = []
chi = []
chi_fem = []
chi_mal = []
for k in range(100):
    random.seed(10)
    result = []
    exclude_hlthy = []
    for i in set(sick['predicted_age']):
        sick_ppl = sick.index[sick['predicted_age'] == i].tolist()
        L_sick = len(sick_ppl)
        if L_sick == 0:
            continue
        hlth_peers = healthy[healthy.predicted_age == i]
        L_healthy = hlth_peers.shape[0]
        if L_healthy < len(sick_ppl):
            pass
        else:
            hlthy_subsample = list(np.random.choice([x for x in hlth_peers.index if not x in exclude_hlthy],
                                                    L_sick, replace=False))
            exclude_hlthy += hlthy_subsample
            result += hlthy_subsample
    table_ready = healthy.loc[result]
    whole_table = table_ready.append(sick, ignore_index=False)
    cross_tab = pd.crosstab(index=whole_table['dc013'], columns=whole_table['rate_aging'])
    oddsratio = (cross_tab[1][1] * cross_tab[0][0]) / (cross_tab[1][0] * cross_tab[0][1])
    # Oddr += oddsratio
    Oddr.append(oddsratio)
In this script I get several random whole_table tables from one subsample.
Two things are going on here. First, you call random.seed(10) on every iteration; move that line out of the for loop, otherwise each iteration restarts the generator from the same state. Second, and more importantly, the sampling is done with np.random.choice, which uses NumPy's random generator; random.seed only seeds Python's built-in random module and has no effect on NumPy at all. Seed NumPy instead, once, before the loop.
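A minimal sketch of the seeding behaviour (toy numbers, not your tables):

import numpy as np

np.random.seed(10)  # seed NumPy once, before the loop
draws = [np.random.choice(50, size=5, replace=False) for k in range(3)]
# Re-running the whole script reproduces exactly the same `draws`,
# while the three iterations still differ from each other.
# If you instead want every iteration identical, re-seed inside the loop.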

Python multiprocessing multiple iterations

I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores, so I want to iterate through a table of 12 million rows and, for each of these rows, iterate through several time steps doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass both the function's arguments and the iterator.
I would appreciate any ideas. I am new to Python.
This is what I have:
import os
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path


def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u


# add one column per time
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols


def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)


def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # B
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list


def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols)) + 1):(10 + (len(ini_cols)) + 2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output


if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of table pc
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols) + 1):(len(pc2.columns))]]
    print(pc2.head)
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:

from multiprocessing import Pool

n_cpu = 10  # set this to the number of CPU cores/threads you have
with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
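Note that on Windows (and on macOS with the default spawn start method) the Pool must be created under an if __name__ == '__main__': guard. A self-contained sketch of the same starmap pattern, with a toy function in place of rowiteration:

from multiprocessing import Pool

def add(x, y):
    return x + y

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        # starmap unpacks each tuple into the function's arguments.
        print(pool.starmap(add, [(1, 2), (3, 4), (5, 6)]))  # [3, 7, 11]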
Here is an approach that I think solves the problem and only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols)) + 1):(10 + (len(ini_cols)) + 2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row

out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)  # collect the processed row (not the column labels)
pc2 = pd.DataFrame(out).reset_index(drop=True)
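With 12 million rows the per-row work is small compared to the pickling overhead, so it may also help to batch the rows; imap_unordered accepts a chunksize argument (the value 1000 here is only a starting guess to tune against your data):

for row in p.imap_unordered(rowiteration, (r for _, r in pc.iterrows()), chunksize=1000):
    row.index = cols
    out.append(row)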

Adaptive DBSCAN achievement

I am doing DBSCAN clustering in Python. I want to achieve an adaptive way to return the number of clusters by self-calculating its eps and MinPts parameters. Below is my code.
import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN


def loadDataSet(fileName, splitChar='\t'):
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet


def dist(a, b):
    # Euclidean distance using only the first two coordinates of each point
    return math.sqrt(math.pow(a[0] - b[0], 2) + math.pow(a[1] - b[1], 2))


def returnDk(matrix, k):
    Dk = []
    for i in range(len(matrix)):
        Dk.append(matrix[i][k])
    return Dk


def returnDkAverage(Dk):
    sum = 0
    for i in range(len(Dk)):
        sum = sum + Dk[i]
    return sum / len(Dk)


def CalculateDistMatrix(dataset):
    DistMatrix = [[0 for j in range(len(dataset))] for i in range(len(dataset))]
    for i in range(len(dataset)):
        for j in range(len(dataset)):
            DistMatrix[i][j] = dist(dataset[i], dataset[j])
    return DistMatrix


def returnEpsCandidate(dataSet):
    DistMatrix = CalculateDistMatrix(dataSet)
    tmp_matrix = copy.deepcopy(DistMatrix)
    for i in range(len(tmp_matrix)):
        tmp_matrix[i].sort()
    EpsCandidate = []
    for k in range(1, len(dataSet)):
        Dk = returnDk(tmp_matrix, k)
        DkAverage = returnDkAverage(Dk)
        EpsCandidate.append(DkAverage)
    return EpsCandidate


def returnMinptsCandidate(DistMatrix, EpsCandidate):
    MinptsCandidate = []
    for k in range(len(EpsCandidate)):
        tmp_eps = EpsCandidate[k]
        tmp_count = 0
        for i in range(len(DistMatrix)):
            for j in range(len(DistMatrix[i])):
                if DistMatrix[i][j] <= tmp_eps:
                    tmp_count = tmp_count + 1
        MinptsCandidate.append(tmp_count / len(dataSet))
    return MinptsCandidate


def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for i in range(len(EpsCandidate)):
        clustering = DBSCAN(eps=EpsCandidate[i], min_samples=MinptsCandidate[i]).fit(np_dataset)
        num_clustering = max(clustering.labels_)
        ClusterNumberList.append(num_clustering)
    return ClusterNumberList


if __name__ == '__main__':
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    dataSet = data.iloc[:, 0:13].values
    EpsCandidate = returnEpsCandidate(dataSet)
    DistMatrix = CalculateDistMatrix(dataSet)
    MinptsCandidate = returnMinptsCandidate(DistMatrix, EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet, EpsCandidate, MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)
However, the output with the loaded data set is all [-1]s, i.e. every point is labelled as noise. I am wondering where the mistake is. Am I right about this general direction? If not, how can I achieve adaptive DBSCAN clustering?
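For reference, a common heuristic for choosing eps is the elbow ("knee") of the sorted k-distance curve rather than an average of distances. A minimal sketch with scikit-learn (X stands for your feature array; the knee still has to be picked, e.g. by inspecting a plot):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def k_distance_curve(X, k=4):
    # Distance from every point to its k-th nearest neighbour, sorted ascending;
    # the elbow of this curve is a common eps choice, with min_samples = k + 1.
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
    distances, _ = nn.kneighbors(X)
    return np.sort(distances[:, k])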

Function returning same values for supposedly different inputs

I'm using pyalgotrade to create a trading strategy. I'm going through a list of tickers (testlist) and adding them to a dictionary (list_large{}) alongside their score, which I get using a get_score function. My latest problem is that every ticker in the dictionary (list_large{}) is getting the same score. Any idea why?
Code:
from pyalgotrade import strategy
from pyalgotrade.tools import yahoofinance
import numpy as np
import pandas as pd
from collections import OrderedDict
from pyalgotrade.technical import ma
from talib import MA_Type
import talib

smaPeriod = 10
testlist = ['aapl', 'ddd', 'gg', 'z']


class MyStrategy(strategy.BacktestingStrategy):
    def __init__(self, feed, instrument):
        super(MyStrategy, self).__init__(feed, 1000)
        self.__position = []
        self.__instrument = instrument
        self.setUseAdjustedValues(True)
        self.__prices = feed[instrument].getPriceDataSeries()
        self.__sma = ma.SMA(feed[instrument].getPriceDataSeries(), smaPeriod)

    def get_score(self, slope):
        MA_Score = self.__sma[-1] * slope
        return MA_Score

    def onBars(self, bars):
        global bar
        bar = bars[self.__instrument]
        slope = 8
        for instrument in bars.getInstruments():
            list_large = {}
            for tickers in testlist:  # replace with real list when ready
                list_large.update({tickers: self.get_score(slope)})
            # organize the list from highest to lowest score
            organized_list = OrderedDict(sorted(list_large.items(), key=lambda t: -t[1]))
            print(list_large)


def run_strategy(inst):
    # Load the yahoo feed from the CSV file
    feed = yahoofinance.build_feed([inst], 2015, 2016, ".")
    # Evaluate the strategy with the feed.
    myStrategy = MyStrategy(feed, inst)
    myStrategy.run()
    print("Final portfolio value: $%.2f" % myStrategy.getBroker().getEquity())


def main():
    instruments = ['ddd', 'msft']
    for inst in instruments:
        run_strategy(inst)


if __name__ == '__main__':
    main()
Check this code from the onBars() function:

slope = 8  # <---- value of slope = 8
for instrument in bars.getInstruments():
    list_large = {}
    for tickers in testlist:  # replace with real list when ready
        list_large.update({tickers: self.get_score(slope)})
        # Updating the dict for each ticker based on ^

Each time self.get_score(slope) is called it returns the same value, because it depends only on self.__sma[-1] and the constant slope, neither of which changes between tickers. Hence every ticker holds the same value in the dict.
I do not know how you want to deal with slope or how you want to update its value, but this logic can be simplified without using .update, as:

list_large = {}
for tickers in testlist:
    list_large[tickers] = self.get_score(slope)
    # ^ update the value of the `tickers` key
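Equivalently, a dict comprehension keeps it to one line (same behaviour, assuming the testlist and get_score from the question):

list_large = {tickers: self.get_score(slope) for tickers in testlist}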
