Finding Semantic Similarity between Sentences in a Document - python

I have put together some code from this link, which is nicely colour-coded, with 4 minor changes to fix some errors. I also used some code from 2 previous forum posts.
What the code is supposed to do is calculate the semantic similarity between consecutive sentences across a whole text, then display all the similarity values obtained, like this:
'the yellow door.', 'The red hammer' 0.65
'pink fox in the woods.', 'commander fox is blue.' 0.32
Here is the code:
import math
import sys

import nltk
import numpy as np
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85
brown_freqs = dict()
N = 0
######################### word similarity ##########################
def get_best_synset_pair(word_1, word_2):
"""
Choose the pair with highest path similarity among all pairs.
Mimics pattern-seeking behavior of humans.
"""
max_sim = -1.0
synsets_1 = wn.synsets(word_1)
synsets_2 = wn.synsets(word_2)
if len(synsets_1) == 0 or len(synsets_2) == 0:
return None, None
else:
max_sim = -1.0
best_pair = None, None
for synset_1 in synsets_1:
for synset_2 in synsets_2:
sim = wn.path_similarity(synset_1, synset_2)
if sim is not None and sim > max_sim:
max_sim = sim
best_pair = synset_1, synset_2
return best_pair
def length_dist(synset_1, synset_2):
l_dist = sys.maxsize
if synset_1 is None or synset_2 is None:
return 0.0
if synset_1 == synset_2:
# if synset_1 and synset_2 are the same synset return 0
l_dist = 0.0
else:
wset_1 = set([str(x.name()) for x in synset_1.lemmas()])
wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
if len(wset_1.intersection(wset_2)) > 0:
# if synset_1 != synset_2 but there is word overlap, return 1.0
l_dist = 1.0
else:
# just compute the shortest path between the two
l_dist = synset_1.shortest_path_distance(synset_2)
if l_dist is None:
l_dist = 0.0
# normalize path length to the range [0,1]
return math.exp(-ALPHA * l_dist)
def hierarchy_dist(synset_1, synset_2):
h_dist = sys.maxsize
if synset_1 is None or synset_2 is None:
return h_dist
if synset_1 == synset_2:
# return the depth of one of synset_1 or synset_2
h_dist = max([x[1] for x in synset_1.hypernym_distances()])
else:
# find the max depth of least common subsumer
hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()}
hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()}
lcs_candidates = set(hypernyms_1.keys()).intersection(
set(hypernyms_2.keys()))
if len(lcs_candidates) > 0:
lcs_dists = []
for lcs_candidate in lcs_candidates:
lcs_d1 = 0
if lcs_candidate in hypernyms_1:
lcs_d1 = hypernyms_1[lcs_candidate]
lcs_d2 = 0
if lcs_candidate in hypernyms_2:
lcs_d2 = hypernyms_2[lcs_candidate]
lcs_dists.append(max([lcs_d1, lcs_d2]))
h_dist = max(lcs_dists)
else:
h_dist = 0
return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) /
(math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
def word_similarity(word_1, word_2):
synset_pair = get_best_synset_pair(word_1, word_2)
return (length_dist(synset_pair[0], synset_pair[1]) *
hierarchy_dist(synset_pair[0], synset_pair[1]))
######################### sentence similarity ##########################
def most_similar_word(word, word_set):
max_sim = -1.0
sim_word = ""
for ref_word in word_set:
sim = word_similarity(word, ref_word)
if sim > max_sim:
max_sim = sim
sim_word = ref_word
return sim_word, max_sim
def info_content(lookup_word):
global N
if N == 0:
# poor man's lazy evaluation
for sent in brown.sents():
for word in sent:
word = word.lower()
if not word in brown_freqs:
brown_freqs[word] = 0
brown_freqs[word] = brown_freqs[word] + 1
N = N + 1
lookup_word = lookup_word.lower()
n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
return 1.0 - (math.log(n + 1) / math.log(N + 1))
def semantic_vector(words, joint_words, info_content_norm):
sent_set = set(words)
semvec = np.zeros(len(joint_words))
i = 0
for joint_word in joint_words:
if joint_word in sent_set:
# if word in union exists in the sentence, s(i) = 1 (unnormalized)
semvec[i] = 1.0
if info_content_norm:
semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
else:
# find the most similar word in the joint set and set the sim value
sim_word, max_sim = most_similar_word(joint_word, sent_set)
semvec[i] = PHI if max_sim > PHI else 0.0
if info_content_norm:
semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
i = i + 1
return semvec
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
words_1 = nltk.word_tokenize(sentence_1)
words_2 = nltk.word_tokenize(sentence_2)
joint_words = set(words_1).union(set(words_2))
vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
######################### word order similarity ##########################
def word_order_vector(words, joint_words, windex):
wovec = np.zeros(len(joint_words))
i = 0
wordset = set(words)
for joint_word in joint_words:
if joint_word in wordset:
# word in joint_words found in sentence, just populate the index
wovec[i] = windex[joint_word]
else:
# word not in joint_words, find most similar word and populate
# word_vector with the thresholded similarity
sim_word, max_sim = most_similar_word(joint_word, wordset)
if max_sim > ETA:
wovec[i] = windex[sim_word]
else:
wovec[i] = 0
i = i + 1
return wovec
def word_order_similarity(sentence_1, sentence_2):
"""
Computes the word-order similarity between two sentences as the normalized
difference of word order between the two sentences.
"""
words_1 = nltk.word_tokenize(sentence_1)
words_2 = nltk.word_tokenize(sentence_2)
joint_words = list(set(words_1).union(set(words_2)))
windex = {x[1]: x[0] for x in enumerate(joint_words)}
r1 = word_order_vector(words_1, joint_words, windex)
r2 = word_order_vector(words_2, joint_words, windex)
return 1.0 - (np.linalg.norm(r1 - r2) / np.linalg.norm(r1 + r2))
######################### overall similarity ##########################
def similarity(sentence_1, sentence_2, info_content_norm):
"""
Calculate the semantic similarity between two sentences. The last
parameter is True or False depending on whether information content
normalization is desired or not.
"""
return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \
(1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2)
This is the looping part:
with open ("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file:
# Initialize a list to hold the results
results = []
# Loop until we hit the end of the file
while True:
# Read two lines
x = sentence_file.readline()
y = sentence_file.readline()
# Check if we've reached the end of the file, if so, we're done
if not y:
# Break out of the infinite loop
break
else:
# The .rstrip('\n') removes the newline character from each line
x = x.rstrip('\n')
y = y.rstrip('\n')
# Calculate your similarity value
similarity_value = similarity(x, y, True)
# Add the two lines and similarity value to the results list
results.append([x, y, similarity_value])
# Loop through the pairs in the results list and print them
for pair in results:
print(pair)
When I run the code on a text file, instead of obtaining a number for the similarity between consecutive sentences I get nan, along with this warning:
Warning (from warnings module):
File "C:\Users\Lenovo2\Desktop\Semantic Analysis (1).py", line 191
return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
RuntimeWarning: invalid value encountered in double_scalars
In a previous forum post I learned that this warning probably means I am dividing by zero, so somewhere there is a zero vector. I am pretty much stuck there, and with limited Python experience I don't know how to fix the program easily and without changing too much.

My guess is that you are passing an empty string. Do you have any blank lines in your text? You don't strip the newline until after checking for empty string, so a string containing only a newline will not be caught.
Since you appear to be on Windows, there also might be '\r\n' style newlines, so your rstrip might not work as expected.
I'd recommend adding the following modification (also do a print for debugging):
# Loop until we hit the end of the file
while True:
    # Read two lines, removing trailing whitespace
    x = sentence_file.readline().rstrip()
    y = sentence_file.readline().rstrip()
    # Check if we've reached the end of the file, if so, we're done
    if not x or not y:
        # Break out of the infinite loop
        break
    else:
        print(x, y)
        # Calculate your similarity value
        similarity_value = similarity(x, y, True)
        # Add the two lines and similarity value to the results list
        results.append([x, y, similarity_value])
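If you want to be completely safe regardless of the input, you can also guard the cosine division itself so that an empty or all-zero vector yields 0.0 instead of nan. A minimal sketch of semantic_similarity with such a check (this guard is an addition, not part of the original code):
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = set(words_1).union(set(words_2))
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    # an empty sentence produces an all-zero vector; avoid 0/0 -> nan
    if denom == 0.0:
        return 0.0
    return np.dot(vec_1, vec_2.T) / denom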
Note that the code appears to have a bug, because you are not comparing sentences pairwise. That is, if you had sentences [a, b, c, d], you are only comparing (a, b) and (c, d), but you really want to compare (a, b), (b, c), (c, d).
You can clean this up a bit by using the itertools library:
from itertools import pairwise

lines = open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r")
for a, b in pairwise(lines):
    x = a.rstrip()
    y = b.rstrip()
    # ... rest unchanged
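One caveat: itertools.pairwise only exists from Python 3.10 onward. On older versions you can define the same helper yourself with the documented tee/zip recipe, and it is also worth keeping the with open(...) form from your original code so the file is closed properly:
from itertools import tee

def pairwise(iterable):
    # pairwise('ABCD') -> ('A','B'), ('B','C'), ('C','D')
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)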

Related

filling the gaps between list of angles (numbers)

I'll explain with a simple example first, then go deeper.
Say I have a list of numbers:
t_original = [180,174,168,166,162,94,70,80,128,131,160,180]
If we graph this, it goes down from 180 to 70, then back up to 180.
But if the fourth value (166) suddenly changes to a wrong reading like 700, the list becomes
t = [180,174,168,700,162,94,70,80,128,131,160,180]
which does not make sense in the graph.
I want to treat the fourth value (700) as a wrong value
and replace it with a value that is at least relative to the previous two elements (168, 174), even if it is not the original value.
I want to do the same across the whole list whenever another wrong value appears.
We can call that "filling gaps in a list of numbers".
I'm trying to apply the same idea to a bigger example.
The method I have tried
(I'll share my code and its output below; "filtered" means the gap-filling function has been applied):
My code:
def preprocFN(*U):
prePlst=[] # after preprocessing list
#preprocessing Fc =| 2*LF1 prev by 1 - LF2 prev by 2 |
c0 = -2 #(previous) by 2
c1 =-1 #(previous)
c2 =0 #(current)
c3 = 1 #(next)
preP = U[0] # original list
if c2 == 0:
prePlst.append(preP[0])
prePlst.append(preP[1])
c1+=2
c2+=2
c0+=2
oldlen = len(preP)
while oldlen > c2:
Equ = abs(2*preP[c1] - preP[c0]) #fn of preprocessing #removed abs()
formatted_float = "{:.2f}".format(Equ) #with .2 number only
equu = float(formatted_float) #from string float to float
prePlst.insert(c2,equu) # insert the preprocessed value to the List
c1+=1
c2+=1
c0+=1
return prePlst
With my input: https://textuploader.com/t1py9
the output is: https://textuploader.com/t1pyk
When printing the values higher than 180 (the wrong values):
result_list = [item for item in list if item > 180]
which doesn't make sense, since no human joint can pass an angle of 180 degrees.
the output was [183.6, 213.85, 221.62, 192.05, 203.39, 197.22, 188.45, 182.48, 180.41, 200.09, 200.67, 198.14, 199.44, 198.45, 200.55, 193.25, 204.19, 204.35, 200.59, 211.4, 180.51, 183.4, 217.91, 218.94, 213.79, 205.62, 221.35, 182.39, 180.62, 183.06, 180.78, 231.09, 227.33, 224.49, 237.02, 212.53, 207.0, 212.92, 182.28, 254.02, 232.49, 224.78, 193.92, 216.0, 184.82, 214.68, 182.04, 181.07, 234.68, 233.63, 182.84, 193.94, 226.8, 223.69, 222.77, 180.67, 184.72, 180.39, 183.99, 186.44, 233.35, 228.02, 195.31, 183.97, 185.26, 182.13, 207.09, 213.21, 238.41, 229.38, 181.57, 211.19, 180.05, 181.47, 199.69, 213.59, 191.99, 194.65, 190.75, 199.93, 221.43, 181.51, 181.42, 180.22]
So the gap-filling function from the proposed method doesn't do its job.
Any suggestions for applying the same concept in a different way?
Extra info that may help:
the filtered graph is produced by the gap-filling function followed by a normalize function.
I don't think the problem is in the normalizing function, since the output of the gap-filling function already looks wrong to me. Maybe I'm wrong, but I provide the normalize steps anyway so you can see how the final filtered graph was made.
My code:
def outLiersFN(*U):
outliers=[] # after preprocessing list
#preprocessing Fc =| 2*LF1 prev by 1 - LF2 prev by 2 |
c0 = -2 #(previous) by 2 #from original
c1 =-1 #(previous) #from original
c2 =0 #(current) #from original
c3 = 1 #(next) #from original
preP = U[0] # original list
if c2 == 0:
outliers.append(preP[0])
c1+=1
c2+=1
c0+=1
c3+=1
oldlen = len(preP)
M_RangeOfMotion = 90
while oldlen > c2 :
if c3 == oldlen:
outliers.insert(c2, preP[c2]) #preP[c2] >> last element in old list
break
if (preP[c2] > M_RangeOfMotion and preP[c2] < (preP[c1] + preP[c3])/2) or (preP[c2] < M_RangeOfMotion and preP[c2] > (preP[c1] + preP[c3])/2): #Check Paper 3.3.1
Equ = (preP[c1] + preP[c3])/2 #fn of preprocessing # From third index # ==== inserting current frame
formatted_float = "{:.2f}".format(Equ) #with .2 number only
equu = float(formatted_float) #from string float to float
outliers.insert(c2,equu) # insert the preprocessed value to the List
c1+=1
c2+=1
c0+=1
c3+=1
else :
Equ = preP[c2] # fn of preprocessing #put same element (do nothing)
formatted_float = "{:.2f}".format(Equ) # with .2 number only
equu = float(formatted_float) # from string float to float
outliers.insert(c2, equu) # insert the preprocessed value to the List
c1 += 1
c2 += 1
c0 += 1
c3 += 1
return outliers
I suggest the following algorithm:
data point t[i] is considered an outlier if it deviates from the average of t[i-2], t[i-1], t[i], t[i+1], t[i+2] by more than the standard deviation of these 5 elements.
outliers are replaced by the average of the two elements around them.
import matplotlib.pyplot as plt
from statistics import mean, stdev

t = [180,174,168,700,162,94,70,80,128,131,160,180]

def smooth(t):
    new_t = []
    for i, x in enumerate(t):
        neighbourhood = t[max(i-2,0): i+3]
        m = mean(neighbourhood)
        s = stdev(neighbourhood, xbar=m)
        if abs(x - m) > s:
            x = ( t[i - 1 + (i==0)*2] + t[i + 1 - (i+1==len(t))*2] ) / 2
        new_t.append(x)
    return new_t

new_t = smooth(t)
plt.plot(t)
plt.plot(new_t)
plt.show()

Numba: how to parse arbitrary logic string into sequence of jitclassed instances in a loop

TL;DR, the problem in short:
I have signals:
np.random.seed(42)
x = np.random.randn(1000)
y = np.random.randn(1000)
z = np.random.randn(1000)
and human-readable string tuple logic like:
entry_sig_ = ((x,y,'crossup',False),)
exit_sig_ = ((x,z,'crossup',False), 'or_',(x,y,'crossdown',False))
where:
'entry_sig_' means the output becomes 1 when, as the time series unfolds from left to right, 'entry_sig_' is hit. (x,y,'crossup',False) means: x crossed y upward at a particular time i; False means the signal doesn't have "memory", otherwise the number of hits accumulates.
'exit_sig_' means the output becomes '0' again when 'exit_sig_' is hit.
The output is generated through:
@njit
def run(x, entry_sig, exit_sig):
'''
x: np.array
entry_sig, exit_sig: homogeneous tuples of tuple signals
Returns: sequence of 0 and 1 satisfying entry and exit sigs
'''
L = x.shape[0]
out = np.empty(L)
out[0] = 0.0
out[-1] = 0.0
i = 1
trade = True
while i < L-1:
out[i] = 0.0
if reduce_sig(entry_sig,i) and i<L-1:
out[i] = 1.0
trade = True
while trade and i<L-2:
i += 1
out[i] = 1.0
if reduce_sig(exit_sig,i):
trade = False
i+= 1
return out
reduce_sig(sig, i) is a function (see definition below) that parses the tuple and returns the resulting output for a given point in time.
Question:
As of now, an object of the SingleSig class is instantiated from scratch inside the loop for every point in time and thus has no "memory", which cancels the merit of having a class at all; a bare function would do. Does a workaround exist (a different class template, a different approach, etc.) so that:
the combined tuple signal can be queried for its value at a particular point in time i, and
"memory" can be reset, i.e. MultiSig(sig_tuple).memory_field can be set to 0 at the constituent-signal level?
The following code adds memory to the signals, which can be wiped using MultiSig.reset() to reset the count of all signals to 0. The memory can be queried using MultiSig.query_memory(key) to return the number of hits for that signal so far.
For the memory function to work, I had to add unique keys to the signals to identify them.
from numba import njit, int64, float64, types
from numba.types import Array, string, boolean
from numba import jitclass
import numpy as np
np.random.seed(42)
x = np.random.randn(1000000)
y = np.random.randn(1000000)
z = np.random.randn(1000000)
# Example of "human-readable" signals
entry_sig_ = ((x,y,'crossup',False),)
exit_sig_ = ((x,z,'crossup',False), 'or_',(x,y,'crossdown',False))
# Turn signals into homogeneous tuple
#entry_sig_
entry_sig = (((x,y,'crossup',False),'NOP','1'),)
#exit_sig_
exit_sig = (((x,z,'crossup',False),'or_','2'),((x,y,'crossdown',False),'NOP','3'))
@njit
def cross(x, y, i):
'''
x,y: np.array
i: int - point in time
Returns: 1 or 0 when condition is met
'''
if (x[i - 1] - y[i - 1])*(x[i] - y[i]) < 0:
out = 1
else:
out = 0
return out
kv_ty = (types.string,types.int64)
spec = [
('memory', types.DictType(*kv_ty)),
]
@njit
def single_signal(x, y, how, acc, i):
'''
i: int - point in time
Returns either signal or accumulator
'''
if cross(x, y, i):
if x[i] < y[i] and how == 'crossdown':
out = 1
elif x[i] > y[i] and how == "crossup":
out = 1
else:
out = 0
else:
out = 0
return out
@jitclass(spec)
class MultiSig:
def __init__(self,entry,exit):
'''
initialize memory at single signal level
'''
memory_dict = {}
for i in entry:
memory_dict[str(i[2])] = 0
for i in exit:
memory_dict[str(i[2])] = 0
self.memory = memory_dict
def reduce_sig(self, sig, i):
'''
Parses multisignal
sig: homogeneous tuple of tuples ("human-readable" signal definition)
i: int - point in time
Returns: resulting value of multisignal
'''
L = len(sig)
out = single_signal(*sig[0][0],i)
logic = sig[0][1]
if out:
self.update_memory(sig[0][2])
for cnt in range(1, L):
s = single_signal(*sig[cnt][0],i)
if s:
self.update_memory(sig[cnt][2])
out = out | s if logic == 'or_' else out & s
logic = sig[cnt][1]
return out
def update_memory(self, key):
'''
update memory
'''
self.memory[str(key)] += 1
def reset(self):
'''
reset memory
'''
dicti = {}
for i in self.memory:
dicti[i] = 0
self.memory = dicti
def query_memory(self, key):
'''
return number of hits on signal
'''
return self.memory[str(key)]
@njit
def run(x, entry_sig, exit_sig):
'''
x: np.array
entry_sig, exit_sig: homogeneous tuples of tuples
Returns: sequence of 0 and 1 satisfying entry and exit sigs
'''
L = x.shape[0]
out = np.empty(L)
out[0] = 0.0
out[-1] = 0.0
i = 1
multi = MultiSig(entry_sig,exit_sig)
while i < L-1:
out[i] = 0.0
if multi.reduce_sig(entry_sig,i) and i<L-1:
out[i] = 1.0
trade = True
while trade and i<L-2:
i += 1
out[i] = 1.0
if multi.reduce_sig(exit_sig,i):
trade = False
i+= 1
return out
run(x, entry_sig, exit_sig)
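A quick illustration of the memory interface described above (hypothetical usage, not part of the question's code):
multi = MultiSig(entry_sig, exit_sig)
multi.reduce_sig(entry_sig, 10)   # evaluate the entry signal at time step i = 10
hits = multi.query_memory('1')    # how often signal '1' has fired so far
multi.reset()                     # wipe all counts back to 0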
To reiterate what I said in the comments, | and & are bitwise operators, not logical operators. 1 & 2 outputs 0/False, which is not what I believe you want this to evaluate to, so I made sure out and s can only be 0/1 in order for this to produce the expected output.
Are you aware that, because of
out = out | s if logic == 'or_' else out & s
the order of the time-series inside entry_sig and exit_sig matters?
Let (output, logic) be tuples where output is 0 or 1 according to how crossup and crossdown would evaluate the passed information of the tuple, and logic is or_ or and_.
tuples = ((0,'or_'),(1,'or_'),(0,'and_'))
out = tuples[0][0]
logic = tuples[0][1]
for i in range(1, len(tuples)):
    s = tuples[i][0]
    out = out | s if logic == 'or_' else out & s
    out = s
    logic = tuples[i][1]
print(out)
0
changing the order of the tuple yields the other signal:
tuples = ((0,'or_'),(0,'and_'),(1,'or_'))
out = tuples[0][0]
logic = tuples[0][1]
for i in range(1, len(tuples)):
    s = tuples[i][0]
    out = out | s if logic == 'or_' else out & s
    out = s
    logic = tuples[i][1]
print(out)
1
The performance hinges on how many times the count needs to be updated. Using n=1,000,000 for all three time series, your code had a mean run-time of 0.6s on my machine, my code had 0.63s.
I then changed the crossing logic a bit to reduce the number of if/else branches, so that the nested if/else is only entered if the time series actually crossed, which can be checked with a single comparison. This further halved the difference in run-time, so the code above now sits at about 2.5% longer run-time than your original code.
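The exact rewrite is not shown here, but a minimal sketch of that idea, folding the crossing test into single_signal so the direction checks only run after a single comparison detects a crossing (same signature as above):
@njit
def single_signal(x, y, how, acc, i):
    # one comparison decides whether any crossing happened at step i
    if (x[i - 1] - y[i - 1]) * (x[i] - y[i]) < 0:
        # only now look at the direction of the crossing
        if how == 'crossup' and x[i] > y[i]:
            return 1
        if how == 'crossdown' and x[i] < y[i]:
            return 1
    return 0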

Cost function equation parsing

I'm trying to solve this assignment:
For the cost function below, where C is the cost of producing x units of a product, find the marginal-cost function. What is the marginal cost at the given value of x? C(x)=0.05x^3+0.8x^2+40x+100; x=500
How could I parse the bolded values, i.e. the formula, from this string? Ideally in a way that could be repeated with similar strings.
Here is my solution; it breaks the equation down into terms that need to be summed up, and then evaluates each of them on its own:
def clean(string):
# Removes unnecessary parts of the input
return string.rstrip(';').split('=')[1]
def parse(string):
parts = string.split('+')
objs = []
for part in parts:
mult_pow = part.split('x')
if len(mult_pow) == 2:
# Both multiplier and power present
obj = (
float(mult_pow[0]),
float(mult_pow[1].lstrip('^')) if mult_pow[1] else 1.0
)
else:
if '^' in mult_pow[0]:
# Only power present
obj = (
1.0,
float(mult_pow[0].lstrip('^'))
)
else:
# Only multiplier present
obj = (
float(mult_pow[0]),
0
)
objs.append(obj)
return objs
def evaluate(parsed_objects, x):
result = 0
for obj in parsed_objects:
result += obj[0] * x**obj[1]
return result
def solve(equation, x):
cleaned_str = clean(equation)
parsed_objects = parse(cleaned_str)
result = evaluate(parsed_objects, x)
return result
x = 500
input_str = 'C(x)=0.05x^3+0.8x^2+40x+100;'
result = solve(input_str, x)
print(result)
Output:
6470100.0
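The assignment also asks for the marginal cost, i.e. the derivative C'(x), and the parsed (multiplier, power) pairs make that easy to add. A sketch building on the functions above (differentiate is a name introduced here for illustration):
def differentiate(parsed_objects):
    # power rule: d/dx (a * x^n) = a*n * x^(n-1); constant terms vanish
    return [(mult * power, power - 1) for mult, power in parsed_objects if power != 0]

marginal_cost = evaluate(differentiate(parse(clean(input_str))), x)
print(marginal_cost)  # C'(500) = 0.15*500^2 + 1.6*500 + 40 = 38340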

Python_getting a count in a sequence of numbers then dividing it by length

Okay, so basically I have a sequence (seq) of 19 letters of DNA:
"CGGTACAATCGATTTAGAG"
I am looking to get the right code to count 'A', 'T', 'G', 'C'.
I have tried:
dna_count = seq.count  (I have done this for each letter)
Then I used: dna_fraction = dna_count/len(seq)
print(dna_fraction * 100)
This results in an error: dna_count is not defined.
I also need to round the percentage outcome to 2 d.p. with round() and return it.
Code (from comment):
def percentBases(dnaStrand):
seq = "CGGTACAATCGATTTAGAG"
dna_count = seq.count("A") + seq.count("T") + seq.count("G") + seq.count("C")
dna_fraction = dna_count / len(seq)
print(dna_fraction * 100)
rawPerCent = 100/seq.count
percentC = round(rawPercent, 2)
return (percentC, percentG, percentA, percentT)
Error message (from comment):
dna_fraction = dna_count / len(seq)
NameError: name 'dna_count' is not defined
You can use Counter():
from collections import Counter

seq = "CGGTACAATCGATTTAGAG"
for element, count in Counter(seq).items():  # use Counter(seq).most_common() for sorted
    print(f"{element}: {count / len(seq):.2%}")
Or you can do it in one line:
print(*(f"{e}: {c / len(seq):.2%}" for e, c in Counter(seq).items()), sep="\n")
# print("\n".join(f"{e}: {c / len(seq):.2%}" for e, c in Counter(seq).items()))
seq = "CGGTACAATCGATTTAGAG"
# use set to find every unique letter
for dna in set(seq):
# count total number for each DNA in sequence
dna_count = dna_count.count(dna)
# divide by total
dna_fraction = dna_count/len(seq)
# round by 2
dna_fraction = round(dna_fraction, 2)
# print percentage of dna in sequence
print(dna, dna_fraction * 100)
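Both answers print the percentages rather than return them; if you need the percentBases(dnaStrand) function from your attempt to return the rounded values, a minimal sketch would be:
def percentBases(dnaStrand):
    # percentage of each base, rounded to 2 decimal places
    total = len(dnaStrand)
    percentC = round(100 * dnaStrand.count("C") / total, 2)
    percentG = round(100 * dnaStrand.count("G") / total, 2)
    percentA = round(100 * dnaStrand.count("A") / total, 2)
    percentT = round(100 * dnaStrand.count("T") / total, 2)
    return (percentC, percentG, percentA, percentT)

print(percentBases("CGGTACAATCGATTTAGAG"))  # (15.79, 26.32, 31.58, 26.32)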

How to structure python programs? Tried making it more structured, now runs 13 times slower

I'm very new to programming. I wrote a simple program for a school project and wanted to make the code "prettier" by not having the program be one giant function, but instead made up of multiple smaller functions, each with a single purpose. I seem to have messed up royally, since the program now runs 13 times slower. How should I structure the program to make it run faster, and in general make programs easier to write, read and edit?
Here are the two programs:
First program (for reference values, runs in ≈0:20):
import numpy as np
import matplotlib.pyplot as plt
def graf(a,b,H,p):
GM = 39.5216489684
x_0 = a + np.sqrt(a**2 - b**2)
v_0 = np.sqrt(GM*(2/x_0 - 1/a))
konstant_period = np.sqrt(a**3)*H
h = 1/H
'''starting position given by an elliptic orbit '''
stor_x_lista = [x_0]
stor_y_lista = [0]
hastighet_x = [0]
hastighet_y = [v_0]
liten_x_lista = []
liten_y_lista = []
''' a loop that approximates the points of the orbit'''
t = 0
tid_lista = []
n = 0
while n < konstant_period:
hastighet_x.append(hastighet_x[n] - h*GM* stor_x_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_x_lista.append(stor_x_lista[n] + h*hastighet_x[n])
hastighet_y.append(hastighet_y[n] - h*GM*stor_y_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_y_lista.append(stor_y_lista[n] + h*hastighet_y[n])
'''smaller list of points to run faster'''
if n % p == 0:
liten_x_lista.append(stor_x_lista[n])
liten_y_lista.append(stor_y_lista[n])
tid_lista.append(t)
n += 1
t += h
''' function that finds the angle'''
vinkel = []
siffra = 0
while siffra < len(liten_x_lista):
if liten_y_lista[siffra ] >= 0:
vinkel.append( np.arccos( liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2)))
siffra += 1
elif liten_y_lista[siffra] < 0 :
vinkel.append( np.pi + np.arccos( -liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2) ))
siffra += 1
'''get rid of line to find periodic function'''
mod_lista = []
modn = 0
while modn < len(vinkel):
mod_lista.append(vinkel[modn] - (2*np.pi*tid_lista[modn])/np.sqrt(a**3))
modn += 1
'''make all inputs have period 1'''
squeeze_tid = []
squeezen = 0
while squeezen < len(tid_lista):
squeeze_tid.append(tid_lista[squeezen]/np.sqrt(a**3))
squeezen += 1
del mod_lista[-1:]
del tid_lista[-1:]
del squeeze_tid[-1:]
plt.plot(squeeze_tid,mod_lista)
plt.title('p(t) där a = ' + str(a) + ' och b = ' + str(b))
plt.show
Second, more split-up program (for reference values, runs in ≈4:20):
import numpy as np
import matplotlib.pyplot as plt
'''function that generates the points of the orbit'''
def punkt(a,b,H,p):
GM = 39.5216489684
x_0 = a + np.sqrt(a**2 - b**2)
v_0 = np.sqrt(GM*(2/x_0 - 1/a))
konstant_period = np.sqrt(a**3)*H
h = 1/H
'''starting position given by an elliptic orbit '''
stor_x_lista = [x_0]
stor_y_lista = [0]
hastighet_x = [0]
hastighet_y = [v_0]
liten_x_lista = []
liten_y_lista = []
''' a loop that approximates the points of the orbit'''
t = 0
tid_lista = []
n = 0
while n < konstant_period:
hastighet_x.append(hastighet_x[n] - h*GM* stor_x_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_x_lista.append(stor_x_lista[n] + h*hastighet_x[n])
hastighet_y.append(hastighet_y[n] - h*GM*stor_y_lista[n]/(np.sqrt(stor_x_lista[n]**2 + stor_y_lista[n]**2))**3)
stor_y_lista.append(stor_y_lista[n] + h*hastighet_y[n])
'''smaller list of points to run faster'''
if n % p == 0:
liten_x_lista.append(stor_x_lista[n])
liten_y_lista.append(stor_y_lista[n])
tid_lista.append(t)
n += 1
t += h
return (liten_x_lista,liten_y_lista,tid_lista)
''' function that finds the angle'''
def vinkel(a,b,H,p):
'''import lists'''
liten_x_lista = punkt(a,b,H,p)[0]
liten_y_lista = punkt(a,b,H,p)[1]
tid_lista = punkt(a,b,H,p)[2]
'''find the angle'''
vinkel_lista = []
siffra = 0
while siffra < len(liten_x_lista):
if liten_y_lista[siffra ] >= 0:
vinkel_lista.append( np.arccos( liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2)))
siffra += 1
elif liten_y_lista[siffra] < 0 :
vinkel_lista.append( np.pi + np.arccos( -liten_x_lista[siffra]/np.sqrt( liten_x_lista[siffra]**2 + liten_y_lista[siffra]**2) ))
siffra += 1
return (vinkel_lista, tid_lista)
def periodisk(a,b,H,p):
'''import lists'''
tid_lista = vinkel(a,b,H,p)[1]
vinkel_lista = vinkel(a,b,H,p)[0]
'''get rid of linear line to find p(t)'''
mod_lista = []
modn = 0
while modn < len(vinkel_lista):
mod_lista.append((vinkel_lista[modn] - (2*np.pi*tid_lista[modn])/np.sqrt(a**3)))
modn += 1
'''make all inputs have period 1'''
squeeze_tid = []
squeezen = 0
while squeezen < len(tid_lista):
squeeze_tid.append(tid_lista[squeezen]/np.sqrt(a**3))
squeezen += 1
del mod_lista[-1:]
del tid_lista[-1:]
del squeeze_tid[-1:]
return (squeeze_tid,mod_lista)
'''fixa 3d-punkt av p(a,b) a är konstant b varierar??? '''
def hitta_amp(a):
x_b = []
y_b = []
n_b = 0.1
while n_b <= a:
x_b.append(n_b)
y_b.append(punkt(a,n_b,10**5,10**3))
return 0
def graf(a,b,H,p):
plt.plot(periodisk(a,b,H,p)[0],periodisk(a,b,H,p)[1])
plt.show
I would assume the thing that is going wrong is that the program is running the same slow code multiple times instead of running it once and then accessing the stored data. Is the problem that everything is done locally and nothing is stored globally, or is it something else?
Just as a heads-up, the only thing I know about programming is basic syntax; I have no clue how to actually write and run programs. I ran all the code in Spyder, if that affects anything.
plt.plot(periodisk(a,b,H,p)[0],periodisk(a,b,H,p)[1])
This code runs periodisk twice with the same arguments, thus at this point we know we run things at least 2 times slower.
You should do some_var = periodisk(a,b,H,p) and then some_var[0], some_var[1]. Or just use unpacking:
plt.plot(*periodisk(a,b,H,p))
tid_lista = vinkel(a,b,H,p)[1]
vinkel_lista = vinkel(a,b,H,p)[0]
Again doing the same thing twice (total: 4*time of (current) vinkel function). Again, smart assignment to fix this:
vinkel_lista, tid_lista = vinkel(a,b,H,p)
liten_x_lista = punkt(a,b,H,p)[0]
liten_y_lista = punkt(a,b,H,p)[1]
tid_lista = punkt(a,b,H,p)[2]
And now you repeat yourself thrice. (total: 12 * time of current punkt function)
liten_x_lista, liten_y_lista, tid_lista = punkt(a,b,H,p)
The punkt function is the same as in the original, so we arrive at a total of 12 times slower, which quite matches your timing estimates. :)
You are calling the functions once per returned list, you should only call them once.
When a method returns multiple variables, (e.g. punkt):
def punkt(a,b,H,p):
    # Here is all your code
    return (liten_x_lista, liten_y_lista, tid_lista)
You must be careful to only call the function once:
result = punkt(a,b,H,p)
liten_x_lista = result[0]
liten_y_lista = result[1]
tid_lista = result[2]
# As opposed to:
liten_x_lista = punkt(a,b,H,p)[0] # 1st call, ignoring results 2 and 3
liten_y_lista = punkt(a,b,H,p)[1] # 2nd call, ignoring results 1 and 3
tid_lista = punkt(a,b,H,p)[2] # 3rd call, ignoring results 1 and 2
Note: I would personally not return a list, but use python's unpacking:
def punkt(a,b,H,p):
    # Here is all your code
    return liten_x_lista, liten_y_lista, tid_lista
And you'd access it:
liten_x_lista, liten_y_lista, tid_lista = punkt(a,b,H,p)
