Flajolet Martin Algorithm Implementation - python

I am trying to implement Flajolet Martin algorithm. I have a dataset with over 6000 records but the output of the following code is 4096. Please help me in understanding the mistake being made by me.
import xxhash
import math
def return_trailing_zeroes(s):
s = str(s)
rev = s[::-1]
count = 0
for i in rev:
if i is '0':
count = count + 1
else:
break
return count
def gethash(line):
num=abs(xxhash.xxh32(line).intdigest())
return num
fp=open("/content/drive/MyDrive/Data.txt","r")
h_max=0
for line in fp:
hash_value_1 = gethash(line)
binary_1 = format(hash_value_1, '032b')
t1 = return_trailing_zeroes(binary_1)
if t1>h_max:
h_max=t1
fp.close()
print(2**h_max)
I tried this implementation of HyperLogLog algorithm and the output of the following code is 2560.
def return_trailing_zeroes(s): s = str(s) rev = s[::-1] count = 0
for i in rev: if i is '0': count = count + 1 else: break
return count
h1_m=0
h2_m=0
h3_m=0
h4_m=0
fp=open("/content/drive/MyDrive/Data.txt","r")
for line in fp:
hash_value_1 = abs(xxhash.xxh32(line).intdigest())
hash_value_2 = abs(hash32(line))
hash_value_3 = abs(jhashcode.hashcode(line))
hash_value_4 = abs(mmh3.hash(line))
binary_1 = format(hash_value_1, '032b')
binary_2 = format(hash_value_2, '032b')
binary_3 = format(hash_value_3, '032b')
binary_4 = format(hash_value_4, '032b')
t1 = return_trailing_zeroes(binary_1)
t2 = return_trailing_zeroes(binary_2)
t3 = return_trailing_zeroes(binary_3)
t4 = return_trailing_zeroes(binary_4)
if t1>h1_m: h1_m=t1
if t2>h2_m: h2_m=t2
if t3>h3_m: h3_m=t3
if t4>h4_m: h4_m=t4
fp.close()
avg_hash12 = (2**(h1_m) + 2**(h2_m))/ float(2)
avg_hash34 = (2**(h3_m) + 2**(h4_m))/ float(2)
distinct_elements = math.ceil(statistics.median([avg_hash12,
avg_hash34]))
print(distinct_elements)

Related

I want to parallelize this code to execute faster for 800000 sentences

from app import getPhonemes
import pandas as pd
import sys
triphones = []
def phonemize(sentence):
tokens = sentence.split(' ')
phonemes = getPhonemes(tokens)
return '$'.join(phonemes)
def generateTriphones(phonemes):
triphones = []
for i in range(len(phonemes)):
for j in range(len(phonemes)):
for k in range(len(phonemes)):
triphones.append(phonemes[i] + ' ' + phonemes[j] + ' ' + phonemes[k])
return triphones
def scoreSentence(sentence,phonemes):
flag = 0
global triphones
score = 0
tokens = sentence.split('$')
uniqueTokens = set(tokens)
triphoneticTokens = [token for token in uniqueTokens if token.count(' ') > 1]
for token in triphoneticTokens:
for triphone in triphones:
if token.find(triphone) != -1:
score += 1
triphones.remove(triphone)
if triphones == []:
flag = -1
return score, flag
def Process(fil):
global triphones
file = open('itudict/vocab.phoneme', 'r',encoding='utf-8')
data = []
for line in file:
data.append(line.strip())
file.close()
phonemes = data[4:]
triphones = generateTriphones(phonemes)
data = pd.read_csv(fil+'.csv')
data = data.drop(['score','covered_vocab'],axis=1)
i = 1
while len(data) > 0:
print('Processing File: '+str(i))
sentencee = data[:10000]
data = data[10000:]
sentences = sentencee['sentence'].tolist()
phonemes = []
scores = []
for j in range(len(sentences)):
if j%1000 == 0:
print('Processing Sentence: '+str(j))
print(len(triphones))
phones = phonemize(sentences[j])
score, flag = scoreSentence(phones,phonemes)
if flag == -1:
data = []
phonemes.append(phones)
scores.append(score)
data['Phonemes'] = phonemes
data['score'] = scores
data.to_csv(fil+'phonemized'+str(i)+'.csv', index=False)
i += 1
if __name__ == '__main__':
Process(sys.argv[1])
I am trying to generate the phonemes for 800000 sentences. The model which am using is G2P which phonemizes the sentence. after phonemization i am calculating the scores. the phoneme array which i am using for calculating scores is of size 2620000.
The length of sentences are 800000 and the code is taking days, can somebody parallelize this code or suggest some solution
I want to parallelize this code to execute faster.

How can I read this barcode

How can I read this barcode in python:
This is my code:
def do_one(image):
print(image)
image = KImage(image)
l = time()
def tick():
nonlocal l
n = time()
t = n - l
l = n
return t
union = set()
first = []
first_t = 0
total_t = 0
results = []
for scalar in [0.5, 0.2, 0.1, 1, 1.5]:
tick()
if scalar != 1:
image_scaled = scale_image(image.image, scalar=scalar).convert("L")
else:
image_scaled = image.image.convert("L")
res = qrdecode(image_scaled)
st = tick()
union.update(res); total_t += st
if res:
first = res
first_t = total_t
for sharpness in [0.1, 0.5,1]:
tick()
image_scaled_sharp = sharpen(image_scaled, sharpness)
res = qrdecode(image_scaled_sharp)
t = tick()
union.update(res); total_t += st + t
if res:
first = res
first_t = total_t
tick()
image_scaled_autocontrast = autocontrast(image_scaled)
res = qrdecode(image_scaled_autocontrast)
t = tick()
union.update(res); total_t += st + t
if res:
first = res
first_t = total_t
results.append({"file": image.filename,
"what": f"do-all-the-things",
"result": list(union),
"time": total_t})
# queue.put(results)
return results
I am able to read several bar codes but I have not been able to do this because of more improvements and changes that I make. Does anyone know how I can read this bar code?

could not convert string to float in tabu search algorithm

I am having this error for the part in bold:
Traceback (most recent call last):
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 238, in
solution, value, exec_time = tabu_search("five_d.txt")
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 175, in tabu_search
graph, max_weight = read_data(input_file_path)
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 64, in read_data
link.append(float(tmp[0]))
ValueError: could not convert string to float:
Process finished with exit code 1
can you help please
import math
from random import randint
import time
from random import shuffle
#import numpy as np
### Data Format is dict:
# data[node_name] = gives you a list of link info
# data[link_index][0] = name of node that edge goes to
# data[link_index][1] = weight of that edge
def read_data(path):
linkset = []
links = {}
max_weight = 0
'''
with open(path, "r") as f:
for line in f:
print (line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp=line.strip().split(' ')
arr=np.array(tmp)
print(arr)
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])
link.append(int(tmp[0]))
link.append(int(tmp[1]))
link.append(int(tmp[2]))
linkset.append(link)
if int(tmp[2]) > max_weight:
max_weight = int(tmp[2])
'''
**with open(path,'r') as f:
for line in f:
#print(line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp = line.strip().split(' ')
#tmp = np.array()
print(tmp)
'''
for i in tmp:
link.append([i])
'''
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
#print(link)
'''
link.append(list(map(float,tmp[0])))
link.append(list(map(float,tmp[1])))
link.append(list(map(float,tmp[2])))
linkset.append(link)
'''
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])**
for link in linkset:
try:
linklist = links[str(link[0])]
linklist.append(link[1:])
links[str(link[0])] = linklist
except:
links[str(link[0])] = [link[1:]]
return links, max_weight
def getNeighbors(state):
# return hill_climbing(state)
return two_opt_swap(state)
def hill_climbing(state):
node = randint(1, len(state) - 1)
neighbors = []
for i in range(len(state)):
if i != node and i != 0:
tmp_state = state.copy()
tmp = tmp_state[i]
tmp_state[i] = tmp_state[node]
tmp_state[node] = tmp
neighbors.append(tmp_state)
return neighbors
def two_opt_swap(state):
global neighborhood_size
neighbors = []
for i in range(neighborhood_size):
node1 = 0
node2 = 0
while node1 == node2:
node1 = randint(1, len(state) - 1)
node2 = randint(1, len(state) - 1)
if node1 > node2:
swap = node1
node1 = node2
node2 = swap
tmp = state[node1:node2]
tmp_state = state[:node1] + tmp[::-1] + state[node2:]
neighbors.append(tmp_state)
return neighbors
def fitness(route, graph):
path_length = 0
for i in range(len(route)):
if (i + 1 != len(route)):
dist = weight_distance(route[i], route[i + 1], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
else:
dist = weight_distance(route[i], route[0], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
return path_length
# not used in this code but some datasets has 2-or-more dimensional data points, in this case it is usable
def euclidean_distance(city1, city2):
return math.sqrt((city1[0] - city2[0]) ** 2 + ((city1[1] - city2[1]) ** 2))
def weight_distance(city1, city2, graph):
global max_fitness
neighbors = graph[str(city1)]
for neighbor in neighbors:
if neighbor[0] == int(city2):
return neighbor[1]
return -1 # there can't be minus distance, so -1 means there is not any city found in graph or there is not such edge
def tabu_search(input_file_path):
global max_fitness, start_node
graph, max_weight = read_data(input_file_path)
## Below, get the keys (node names) and shuffle them, and make start_node as start
s0 = list(graph.keys())
shuffle(s0)
if int(s0[0]) != start_node:
for i in range(len(s0)):
if int(s0[i]) == start_node:
swap = s0[0]
s0[0] = s0[i]
s0[i] = swap
break;
# max_fitness will act like infinite fitness
max_fitness = ((max_weight) * (len(s0))) + 1
sBest = s0
vBest = fitness(s0, graph)
bestCandidate = s0
tabuList = []
tabuList.append(s0)
stop = False
best_keep_turn = 0
start_time = time.time()
while not stop:
sNeighborhood = getNeighbors(bestCandidate)
bestCandidate = sNeighborhood[0]
for sCandidate in sNeighborhood:
if (sCandidate not in tabuList) and ((fitness(sCandidate, graph) < fitness(bestCandidate, graph))):
bestCandidate = sCandidate
if (fitness(bestCandidate, graph) < fitness(sBest, graph)):
sBest = bestCandidate
vBest = fitness(sBest, graph)
best_keep_turn = 0
tabuList.append(bestCandidate)
if (len(tabuList) > maxTabuSize):
tabuList.pop(0)
if best_keep_turn == stoppingTurn:
stop = True
best_keep_turn += 1
exec_time = time.time() - start_time
return sBest, vBest, exec_time
## Tabu Search Takes edge-list in a given format:
# nodefrom nodeto weight
# 0 1 5
# 3 2 4
# 1 0 3
# Undirectional edges should be written 2 times for both nodes.
# maxTabuSize = 10000
maxTabuSize = 500
neighborhood_size = 500
stoppingTurn = 500
max_fitness = 0
start_node = 0
# solution, value, exec_time = tabu_search("test.txt")
solution, value, exec_time = tabu_search("five_d.txt")
print(solution)
print(value)
print(exec_time)

python files - who to open corectly

We have file with some math problems like: 46 + 19 (only + or - and it built up this way: number, space, sign, space, number) and we need to transform it into a new file and solve them (46 + 19 = 65). We don't know how many exercises there will be or the number of digits in every number. Here is my code:
enter code here
input_file = open(r'C:\try\bla.txt', 'r')
nums = input_file.read()
y = 0
dig1 = ''
dig2 = ''
sign = ''
x1 = nums.find(' ')
x2 = x1 + 1
def one(dig1, dig2, y):
for i in xrange(x1):
dig1 += nums[y]
y += 1
for m in xrange(abs(-x2)):
dig2 += nums[y + 1]
y += 1
sign = nums[x2]
if sign == '+':
sum = int(dig1) + int(dig2)
if sign == '-':
sum = int(dig1) - int(dig2)
print dig1, dig2, '=', sum
for a in xrange(0):
one(dig1, dig2, y)
one(dig1, dig2, y)
print 'f', nums[21]
#print dig1, dig2, '=', sum
Maybe you are want to get this(python3):
test.txt:
10 + 15
22 - 71
33 + 64
code:
import operator
op = {'+': operator.add, '-': operator.sub}
with open('test.txt', 'r') as f:
lines = f.readlines()
for i in lines:
args = i.split()
val = op[args[1]](int(args[0]), int(args[-1]))
r = f'{i.strip()} = {val}'
print(r)

python scripts showing different result( with one error ) in two similar input files

The script, originally taken and modified from (http://globplot.embl.de/):
#!/usr/bin/env python
# Copyright (C) 2003 Rune Linding - EMBL
# GlobPlot TM
# GlobPlot is licensed under the Academic Free license
from string import *
from sys import argv
from Bio import File
from Bio import SeqIO
import fpformat
import sys
import tempfile
import os
from os import system,popen3
import math
# Russell/Linding
RL = {'N':0.229885057471264,'P':0.552316012226663,'Q':-0.187676577424997,'A':-0.261538461538462,'R':-0.176592654077609, \
'S':0.142883029808825,'C':-0.0151515151515152,'T':0.00887797506611258,'D':0.227629796839729,'E':-0.204684629516228, \
'V':-0.386174834235195,'F':-0.225572305974316,'W':-0.243375458622095,'G':0.433225711769886,'H':-0.00121743364986608, \
'Y':-0.20750516775322,'I':-0.422234699606962,'K':-0.100092289621613,'L':-0.337933495925287,'M':-0.225903614457831}
def Sum(seq,par_dict):
sum = 0
results = []
raws = []
sums = []
p = 1
for residue in seq:
try:
parameter = par_dict[residue]
except:
parameter = 0
if p == 1:
sum = parameter
else:
sum = sum + parameter#*math.log10(p)
ssum = float(fpformat.fix(sum,10))
sums.append(ssum)
p +=1
return sums
def getSlices(dydx_data, DOM_join_frame, DOM_peak_frame, DIS_join_frame, DIS_peak_frame):
DOMslices = []
DISslices = []
in_DOMslice = 0
in_DISslice = 0
beginDOMslice = 0
endDOMslice = 0
beginDISslice = 0
endDISslice = 0
for i in range( len(dydx_data) ):
#close dom slice
if in_DOMslice and dydx_data[i] > 0:
DOMslices.append([beginDOMslice, endDOMslice])
in_DOMslice = 0
#close dis slice
elif in_DISslice and dydx_data[i] < 0:
DISslices.append([beginDISslice, endDISslice])
in_DISslice = 0
# elseif inSlice expandslice
elif in_DOMslice:
endDOMslice += 1
elif in_DISslice:
endDISslice += 1
# if not in slice and dydx !== 0 start slice
if dydx_data[i] > 0 and not in_DISslice:
beginDISslice = i
endDISslice = i
in_DISslice = 1
elif dydx_data[i] < 0 and not in_DOMslice:
beginDOMslice = i
endDOMslice = i
in_DOMslice = 1
#last slice
if in_DOMslice:
DOMslices.append([beginDOMslice, endDOMslice])
if in_DISslice:
DISslices.append([beginDISslice,endDISslice])
k = 0
l = 0
while k < len(DOMslices):
if k+1 < len(DOMslices) and DOMslices[k+1][0]-DOMslices[k][1] < DOM_join_frame:
DOMslices[k] = [ DOMslices[k][0], DOMslices[k+1][1] ]
del DOMslices[k+1]
elif DOMslices[k][1]-DOMslices[k][0]+1 < DOM_peak_frame:
del DOMslices[k]
else:
k += 1
while l < len(DISslices):
if l+1 < len(DISslices) and DISslices[l+1][0]-DISslices[l][1] < DIS_join_frame:
DISslices[l] = [ DISslices[l][0], DISslices[l+1][1] ]
del DISslices[l+1]
elif DISslices[l][1]-DISslices[l][0]+1 < DIS_peak_frame:
del DISslices[l]
else:
l += 1
return DOMslices, DISslices
def SavitzkyGolay(window,derivative,datalist):
SG_bin = 'sav_gol'
stdin, stdout, stderr = popen3(SG_bin + '-D' + str(derivative) + ' -n' + str(window)+','+str(window))
for data in datalist:
stdin.write(`data`+'\n')
try:
stdin.close()
except:
print stderr.readlines()
results = stdout.readlines()
stdout.close()
SG_results = []
for result in results:
SG_results.append(float(fpformat.fix(result,6)))
return SG_results
def reportSlicesTXT(slices, sequence, maskFlag):
if maskFlag == 'DOM':
coordstr = '|GlobDoms:'
elif maskFlag == 'DIS':
coordstr = '|Disorder:'
else:
raise SystemExit
if slices == []:
#by default the sequence is in uppercase which is our search space
s = sequence
else:
# insert seq before first slide
if slices[0][0] > 0:
s = sequence[0:slices[0][0]]
else:
s = ''
for i in range(len(slices)):
#skip first slice
if i > 0:
coordstr = coordstr + ', '
coordstr = coordstr + str(slices[i][0]+1) + '-' + str(slices[i][1]+1)
#insert the actual slice
if maskFlag == 'DOM':
s = s + lower(sequence[slices[i][0]:(slices[i][1]+1)])
if i < len(slices)-1:
s = s + upper(sequence[(slices[i][1]+1):(slices[i+1][0])])
#last slice
elif slices[i][1] < len(sequence)-1:
s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
elif maskFlag == 'DIS':
s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
#insert untouched seq between disorder segments, 2-run labelling
if i < len(slices)-1:
s = s + sequence[(slices[i][1]+1):(slices[i+1][0])]
#last slice
elif slices[i][1] < len(sequence)-1:
s = s + sequence[(slices[i][1]+1):(len(sequence))]
return s,coordstr
def runGlobPlot():
try:
smoothFrame = int(sys.argv[1])
DOM_joinFrame = int(sys.argv[2])
DOM_peakFrame = int(sys.argv[3])
DIS_joinFrame = int(sys.argv[4])
DIS_peakFrame = int(sys.argv[5])
file = str(sys.argv[6])
db = open(file,'r')
except:
print 'Usage:'
print ' ./GlobPipe.py SmoothFrame DOMjoinFrame DOMpeakFrame DISjoinFrame DISpeakFrame FASTAfile'
print ' Optimised for ELM: ./GlobPlot.py 10 8 75 8 8 sequence_file'
print ' Webserver settings: ./GlobPlot.py 10 15 74 4 5 sequence_file'
raise SystemExit
for cur_record in SeqIO.parse(db, "fasta"):
#uppercase is searchspace
seq = upper(str(cur_record.seq))
# sum function
sum_vector = Sum(seq,RL)
# Run Savitzky-Golay
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
dydx_vector = SavitzkyGolay('smoothFrame',1, sum_vector)
#test
sumHEAD = sum_vector[:smoothFrame]
sumTAIL = sum_vector[len(sum_vector)-smoothFrame:]
newHEAD = []
newTAIL = []
for i in range(len(sumHEAD)):
try:
dHEAD = (sumHEAD[i+1]-sumHEAD[i])/2
except:
dHEAD = (sumHEAD[i]-sumHEAD[i-1])/2
try:
dTAIL = (sumTAIL[i+1]-sumTAIL[i])/2
except:
dTAIL = (sumTAIL[i]-sumTAIL[i-1])/2
newHEAD.append(dHEAD)
newTAIL.append(dTAIL)
dydx_vector[:smoothFrame] = newHEAD
dydx_vector[len(dydx_vector)-smoothFrame:] = newTAIL
globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame, DOM_peakFrame, DIS_joinFrame, DIS_peakFrame)
s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
sys.stdout.write('>'+cur_record.id+coordstrDOM+coordstrDIS+'\n')
print s_final
print '\n'
return
runGlobPlot()
My input and output files are here: link
This script takes a input (input1.fa) and gives following output output1.txt
But when I try to run this script with similar type but larger input file (input2.fa) .. It shows following error:
Traceback (most recent call last):
File "final_script_globpipe.py", line 207, in <module>
runGlobPlot()
File "final_script_globpipe.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "final_script_globpipe.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 22] Invalid argument
I have no idea where the problem is. Any type of suggestion is appriciated.
I am using python 2.7 in windows 7 machine. I have also attached the Savitzky Golay module which is needed to run the script.
Thanks
UPDATE:
After trying to reproduce the error on linux it's showing a similar behavior, working fine with the first file but with the second is returning Errno32.
Traceback:
Traceback (most recent call last):
File "Glob.py", line 207, in <module>
runGlobPlot()
File "Glob.py", line 179, in runGlobPlot
smooth = SavitzkyGolay('smoothFrame',0, sum_vector)
File "Glob.py", line 105, in SavitzkyGolay
stdin.write(`data`+'\n')
IOError: [Errno 32] Broken pipe
Update:
Some calls of the SG_bin return that the -n parameter is the wrong type.
Wrong type of parameter for flag -n. Has to be unsigned,unsigned
This parameter comes from the window variable that is passed to the SavitzkyGolay function.
Surrounding the stdin.write with a trycatch block reveals that it breaks a hadnfull of times.
try:
for data in datalist:
stdin.write(repr(data)+'\n')
except:
print "It broke"

Categories