My Viterbi program runs in exponential time. Can you help me find the place I can change to turn it into a dynamic program? I need to remember and use only the two previous tags of each word.
Thanks a lot.
from collections import defaultdict
import sys
import re
import feature_maker as fm
bla = ''
all_states = set()
# distribution over all of the corpus
POS_probability = fm.load_obj('probas')
POS_probability['START'] = 1.0
def cpd_tagwords(words, tag):
pattern = re.compile("\W")# to check for .,: etc.
if pattern.match(words) and tag == words:
return 1
elif pattern.match(tag):
return 0
for word in emle.split("\n"):
if word.__contains__(words) and word.__contains__(tag):
return word[word.index(":") + 2:]
# if we don't have data about the word with the tag, just return the probability
# of the tag over all of the words in the corpus.
return POS_probability[tag]
def cpd_tags(early, prev, current):
lambda1 = 0
lambda3 = 0
lambda6 = 0
for word in qmle.split("\n"):
word1 = word.split()
if len(word1) > 0:
if word1[0].__contains__(current): #for tuple of 1
if len(word1) == 2:
lambda1 = word[word.index("]:") + 3:]
if len(word1) > 2 and word1[1].__contains__(prev): #for tuple of 2
if len(word1) == 3:
lambda3 = word[word.index("]:") + 3:]
if len(word1) > 3 and word1[2].__contains__(early): #for tuple of 3
if len(word1) == 4:
lambda6 = word[word.index("]:") + 3:]
return (0.6*float(lambda6)) + (0.3*float(lambda3)) + (0.1*float(lambda1))
# map: popular_copules['POS'] = list of all POS tags that can come before it.
popular_copules = fm.load_obj('popular_copules')
# Viterbi Algo
def viterbi(sentence, tags1):
def findSet(index,tag):
if tag == 'ALL':
return tags1
if index in range(1, len(sentence) + 1):
possible_tags = set(popular_copules[tag])
if possible_tags == set([]):
return tags1
return set(popular_copules[tag])
elif index == 0 or index == -1:
return {'START'}
# stores (word:tag) in this whole sentence
sentence_with_tag = defaultdict(str)
# inner function to compute pi values -- start
def pi_viterbi(k, u, v, sentence):#here is the start of the bad sequence
prob = defaultdict(float)
# initialization
if k == 0 and u == 'START' and v == 'START':
return (1., 'START')
else:
for w in findSet(k - 2,u):
prev = pi_viterbi(k - 1, w, u, sentence)[0]
# tuple((w,u,v))
q = cpd_tags(w, u, v)
e = cpd_tagwords(sentence[k - 1].lower(), v)
probability = float(prev) * q * float(e)
prob[tuple((w, u))] = probability
#here is the end of the bad sequence
max_tuple = max(prob.items(), key=lambda x: x[1])
# print (max_tuple[1],max_tuple[0][0])
return max_tuple[1], max_tuple[0][0]
# inner function to compute pi values -- end
sentence_with_tag = list()
backpointer = defaultdict(str)
tags = defaultdict(str)
k = len(sentence)
u_glob = ''
v_glob = ''
glob = 0.
for i in range(1, k + 1):
prob = defaultdict(float)
#for current word we check all the tags
""" changed from for u in findSet(i - 1):"""
for u in findSet(i ,'ALL'):
#going backwards we call findSet with u so it gives us only
# tags v that go together a lot with u (this is pruning)
""" changed from for v in findSet(i)"""
for v in findSet(i-1,u_glob):
#switched u and v
value, w = pi_viterbi(i, v, u, sentence)#the v recursion in the algorithm
prob[tuple((i, u, v))] = value
backpointer[tuple((i, u, v))] = w #bp from the algorithm
max_tuple = max(prob.items(), key=lambda x: x[1])
backpointer[tuple((i, max_tuple[0][1], max_tuple[0][-1]))] = max_tuple[0][1] # bp (k,u,v)= tag w
# sentence_with_tag.append(max_tuple[0][-1])
u_glob = max_tuple[0][-2]
v_glob = max_tuple[0][-1]
glob = max_tuple[1]
print ('Max', max_tuple)
tags[k - 1] = u_glob
tags[k] = v_glob
for i in range((k - 2), 0, -1):
tag = backpointer[tuple(((i + 2), tags[i + 1], tags[i + 2]))]
tags[i] = tag
tag_list = list()
for i in range(1, len(tags) + 1):
tag_list.append(tags[i])
file = open(sys.argv[4], 'w')
file.truncate()
for word in tag_list:
file.write(word)
# tag list as results
return tag_list
file=open(sys.argv[1],"r+")
fQ = open(sys.argv[2], 'r')
qmle = fQ.read()
fQ.close()
f = open("tags.txt",'r+')
tags = f.read()
f.close()
fe = open(sys.argv[3], 'r')
emle = fe.read()
distinct_tags = set()
# what is the list of all tags?
for word in tags.split():
distinct_tags.add(word)
sentence = []
sentence1 = []
sentence1 = file.read()
sentence = sentence1.split()
file.close()
file = open(sys.argv[4], 'w')
file.truncate()
viterbi(sentence, distinct_tags)
How can I reduce the time complexity?
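One way to remove the exponential blow-up: pi_viterbi(k, u, v) is recomputed for the same (k, u, v) over and over, so either memoize it or fill a table bottom-up over k. Below is a minimal sketch of the bottom-up version; it assumes your existing cpd_tags and cpd_tagwords helpers and a plain set of candidate tags, and it is meant to illustrate the table structure, not to be a drop-in replacement for the pruning logic above.
# Sketch: bottom-up trigram Viterbi. pi[k][(u, v)] is the best probability of a
# tag sequence ending in tags u, v at position k; each cell is filled only once.
def viterbi_dp(sentence, all_tags):
    n = len(sentence)
    pi = {0: {('START', 'START'): 1.0}}
    bp = {}  # bp[(k, u, v)] = best tag two positions back
    for k in range(1, n + 1):
        pi[k] = {}
        word = sentence[k - 1].lower()
        for (w, u), prev in pi[k - 1].items():
            for v in all_tags:
                p = prev * cpd_tags(w, u, v) * float(cpd_tagwords(word, v))
                if p > pi[k].get((u, v), 0.0):
                    pi[k][(u, v)] = p
                    bp[(k, u, v)] = w
    # best final pair of tags, then follow the back-pointers
    (u, v), _ = max(pi[n].items(), key=lambda item: item[1])
    tags = [''] * (n + 1)
    tags[n - 1], tags[n] = u, v
    for k in range(n - 2, 0, -1):
        tags[k] = bp[(k + 2, tags[k + 1], tags[k + 2])]
    return tags[1:]
Because each position k only combines the table row for position k - 1 with the candidate tags for position k, the work is polynomial in the number of tags instead of exponential in the sentence length, and only the two previous tags are ever consulted.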
Related
from app import getPhonemes
import pandas as pd
import sys
triphones = []
def phonemize(sentence):
tokens = sentence.split(' ')
phonemes = getPhonemes(tokens)
return '$'.join(phonemes)
def generateTriphones(phonemes):
triphones = []
for i in range(len(phonemes)):
for j in range(len(phonemes)):
for k in range(len(phonemes)):
triphones.append(phonemes[i] + ' ' + phonemes[j] + ' ' + phonemes[k])
return triphones
def scoreSentence(sentence,phonemes):
flag = 0
global triphones
score = 0
tokens = sentence.split('$')
uniqueTokens = set(tokens)
triphoneticTokens = [token for token in uniqueTokens if token.count(' ') > 1]
for token in triphoneticTokens:
for triphone in triphones:
if token.find(triphone) != -1:
score += 1
triphones.remove(triphone)
if triphones == []:
flag = -1
return score, flag
def Process(fil):
global triphones
file = open('itudict/vocab.phoneme', 'r',encoding='utf-8')
data = []
for line in file:
data.append(line.strip())
file.close()
phonemes = data[4:]
triphones = generateTriphones(phonemes)
data = pd.read_csv(fil+'.csv')
data = data.drop(['score','covered_vocab'],axis=1)
i = 1
while len(data) > 0:
print('Processing File: '+str(i))
sentencee = data[:10000]
data = data[10000:]
sentences = sentencee['sentence'].tolist()
phonemes = []
scores = []
for j in range(len(sentences)):
if j%1000 == 0:
print('Processing Sentence: '+str(j))
print(len(triphones))
phones = phonemize(sentences[j])
score, flag = scoreSentence(phones,phonemes)
if flag == -1:
data = []
phonemes.append(phones)
scores.append(score)
data['Phonemes'] = phonemes
data['score'] = scores
data.to_csv(fil+'phonemized'+str(i)+'.csv', index=False)
i += 1
if __name__ == '__main__':
Process(sys.argv[1])
I am trying to generate the phonemes for 800,000 sentences. The model I am using is G2P, which phonemizes each sentence; after phonemization I calculate the scores. The triphone array I use for calculating the scores has 2,620,000 entries.
With 800,000 sentences the code is taking days. Can somebody parallelize this code or suggest a solution?
I want to parallelize this code so that it executes faster.
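Two things tend to dominate here. First, scoreSentence scans the whole 2.6-million-element triphones list for every token, so keeping the triphones in a set and sliding a three-phone window over each token makes each lookup O(1) instead of a full scan. Second, the phonemization is independent per sentence, so it can be farmed out with multiprocessing; the coverage scoring mutates a shared triphone set, so keep that part sequential. The sketch below is illustrative only: it assumes getPhonemes from app can be imported and called inside worker processes, and phonemize_one, score_against and process_chunk are made-up helper names, not functions from your code.
# Sketch: parallel phonemization plus set-based scoring (assumptions above).
from multiprocessing import Pool
from app import getPhonemes  # assumption: importable and safe in subprocesses

def phonemize_one(sentence):
    # same as the original phonemize(), kept per-sentence so Pool.map can use it
    return '$'.join(getPhonemes(sentence.split(' ')))

def score_against(phoneme_string, triphone_set):
    # walk a 3-phone window over each token and test set membership; roughly
    # the same coverage idea as the original substring scan, but far cheaper
    score = 0
    for token in set(phoneme_string.split('$')):
        phones = token.split(' ')
        for i in range(len(phones) - 2):
            tri = ' '.join(phones[i:i + 3])
            if tri in triphone_set:
                score += 1
                triphone_set.remove(tri)
    return score, (-1 if not triphone_set else 0)

def process_chunk(sentences, triphone_set, workers=8):
    with Pool(workers) as pool:
        # phonemization is independent per sentence, so it parallelizes cleanly
        phoneme_strings = pool.map(phonemize_one, sentences, chunksize=100)
    # scoring mutates the shared set, so it stays sequential (and is now fast)
    results = [score_against(p, triphone_set) for p in phoneme_strings]
    return phoneme_strings, results
Run this under the existing if __name__ == '__main__': guard, which multiprocessing needs on Windows. If getPhonemes is a GPU model that cannot be shared across processes, the safer split is to batch sentences through it in one process and only parallelize or speed up the scoring side.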
I am getting this error for the part marked in the read_data function below:
Traceback (most recent call last):
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 238, in
solution, value, exec_time = tabu_search("five_d.txt")
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 175, in tabu_search
graph, max_weight = read_data(input_file_path)
File "C:/Users/appan/OneDrive/Documents/Year 3/AI Assignment Semester 1/Tabu Search/Tabu-search-on-Travelling-Salesman-Problem-master/TabuSearch2.py", line 64, in read_data
link.append(float(tmp[0]))
ValueError: could not convert string to float:
Process finished with exit code 1
Can you help, please?
import math
from random import randint
import time
from random import shuffle
#import numpy as np
### Data Format is dict:
# data[node_name] = gives you a list of link info
# data[link_index][0] = name of node that edge goes to
# data[link_index][1] = weight of that edge
def read_data(path):
linkset = []
links = {}
max_weight = 0
'''
with open(path, "r") as f:
for line in f:
print (line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp=line.strip().split(' ')
arr=np.array(tmp)
print(arr)
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])
link.append(int(tmp[0]))
link.append(int(tmp[1]))
link.append(int(tmp[2]))
linkset.append(link)
if int(tmp[2]) > max_weight:
max_weight = int(tmp[2])
'''
with open(path,'r') as f:  # start of the marked block
for line in f:
#print(line)
link = []
#tmp = list(map(float,line.strip().split(' ')))
tmp = line.strip().split(' ')
#tmp = np.array()
print(tmp)
'''
for i in tmp:
link.append([i])
'''
link.append(float(tmp[0]))
link.append(float(tmp[1]))
link.append(float(tmp[2]))
linkset.append(link)
#print(link)
'''
link.append(list(map(float,tmp[0])))
link.append(list(map(float,tmp[1])))
link.append(list(map(float,tmp[2])))
linkset.append(link)
'''
if float(tmp[2]) > max_weight:
max_weight = float(tmp[2])  # end of the marked block
for link in linkset:
try:
linklist = links[str(link[0])]
linklist.append(link[1:])
links[str(link[0])] = linklist
except:
links[str(link[0])] = [link[1:]]
return links, max_weight
def getNeighbors(state):
# return hill_climbing(state)
return two_opt_swap(state)
def hill_climbing(state):
node = randint(1, len(state) - 1)
neighbors = []
for i in range(len(state)):
if i != node and i != 0:
tmp_state = state.copy()
tmp = tmp_state[i]
tmp_state[i] = tmp_state[node]
tmp_state[node] = tmp
neighbors.append(tmp_state)
return neighbors
def two_opt_swap(state):
global neighborhood_size
neighbors = []
for i in range(neighborhood_size):
node1 = 0
node2 = 0
while node1 == node2:
node1 = randint(1, len(state) - 1)
node2 = randint(1, len(state) - 1)
if node1 > node2:
swap = node1
node1 = node2
node2 = swap
tmp = state[node1:node2]
tmp_state = state[:node1] + tmp[::-1] + state[node2:]
neighbors.append(tmp_state)
return neighbors
def fitness(route, graph):
path_length = 0
for i in range(len(route)):
if (i + 1 != len(route)):
dist = weight_distance(route[i], route[i + 1], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
else:
dist = weight_distance(route[i], route[0], graph)
if dist != -1:
path_length = path_length + dist
else:
return max_fitness # there is no such path
return path_length
# not used in this code, but some datasets have 2-or-more-dimensional data points, in which case it is usable
def euclidean_distance(city1, city2):
return math.sqrt((city1[0] - city2[0]) ** 2 + ((city1[1] - city2[1]) ** 2))
def weight_distance(city1, city2, graph):
global max_fitness
neighbors = graph[str(city1)]
for neighbor in neighbors:
if neighbor[0] == int(city2):
return neighbor[1]
return -1 # distance can't be negative, so -1 means the city was not found in the graph or there is no such edge
def tabu_search(input_file_path):
global max_fitness, start_node
graph, max_weight = read_data(input_file_path)
## Below, get the keys (node names) and shuffle them, and make start_node as start
s0 = list(graph.keys())
shuffle(s0)
if int(s0[0]) != start_node:
for i in range(len(s0)):
if int(s0[i]) == start_node:
swap = s0[0]
s0[0] = s0[i]
s0[i] = swap
break;
# max_fitness will act like infinite fitness
max_fitness = ((max_weight) * (len(s0))) + 1
sBest = s0
vBest = fitness(s0, graph)
bestCandidate = s0
tabuList = []
tabuList.append(s0)
stop = False
best_keep_turn = 0
start_time = time.time()
while not stop:
sNeighborhood = getNeighbors(bestCandidate)
bestCandidate = sNeighborhood[0]
for sCandidate in sNeighborhood:
if (sCandidate not in tabuList) and ((fitness(sCandidate, graph) < fitness(bestCandidate, graph))):
bestCandidate = sCandidate
if (fitness(bestCandidate, graph) < fitness(sBest, graph)):
sBest = bestCandidate
vBest = fitness(sBest, graph)
best_keep_turn = 0
tabuList.append(bestCandidate)
if (len(tabuList) > maxTabuSize):
tabuList.pop(0)
if best_keep_turn == stoppingTurn:
stop = True
best_keep_turn += 1
exec_time = time.time() - start_time
return sBest, vBest, exec_time
## Tabu Search Takes edge-list in a given format:
# nodefrom nodeto weight
# 0 1 5
# 3 2 4
# 1 0 3
# Undirected edges should be written twice, once for each direction.
# maxTabuSize = 10000
maxTabuSize = 500
neighborhood_size = 500
stoppingTurn = 500
max_fitness = 0
start_node = 0
# solution, value, exec_time = tabu_search("test.txt")
solution, value, exec_time = tabu_search("five_d.txt")
print(solution)
print(value)
print(exec_time)
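The traceback ends with "could not convert string to float:" followed by nothing, which means float() received an empty string. line.strip().split(' ') produces empty strings whenever the data file has blank lines or columns separated by more than one space, which is common in TSP files like five_d.txt. Below is a hedged sketch of a more tolerant read_data, assuming each data line is a whitespace-separated node-from, node-to, weight triple; node ids are stored as ints so the later int(s0[0]) comparison in tabu_search still works.
# Sketch: tolerant parsing for read_data (assumes "from to weight" per line).
def read_data(path):
    links = {}
    max_weight = 0.0
    with open(path, 'r') as f:
        for line in f:
            tmp = line.split()      # no separator: collapses runs of spaces
            if len(tmp) < 3:        # skip blank lines and malformed rows
                continue
            node_from = int(float(tmp[0]))
            node_to = int(float(tmp[1]))
            weight = float(tmp[2])
            links.setdefault(str(node_from), []).append([node_to, weight])
            if weight > max_weight:
                max_weight = weight
    return links, max_weight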
I am trying to capture the data in the second table (Field crops), titled "Prices Received, United States, July 2010, with Comparisons". I am using pandas DataFrames to capture the table from the text file, and then I will output it to a CSV file.
My code is as follows:
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if 'Dollars' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 6
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
def take_table(txt_data):
comodity = []
q = []
w = []
e = []
t = []
p = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6 = data_row
comodity.append(col_1)
q.append(col_2)
w.append(col_3)
e.append(col_4)
t.append(col_5)
p.append(col_6)
table_data = {'comodity': comodity, 'q': q,
'w': w, 'e': e, 't': t}
return table_data
And then I am doing this:
import requests
import pandas as pd
txt_data = requests.get("https://downloads.usda.library.cornell.edu/usda-esmis/files/c821gj76b/6w924d00c/9z903130m/AgriPric-07-30-2010.txt").text
splited_data = txt_data.split('\n')
table_title = 'Prices Received, United States'
END_TABLE_LINE = '-------------------------------------------'
_, table_start,_ = find_no_line_start_table(table_title,splited_data)
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
dict_table = take_table(txt_data)
pd.DataFrame(dict_table)
c = pd.DataFrame(dict_table)
IndexError: list assignment index out of range
However, I am getting an error here. Can anyone help me figure out what I am doing wrong?
Cause of error:
data_row is a list of 6 elements.
number_columns = 6
# ...
data_row = [''] * number_columns # [''] * 6
and index will increment on each iteration where first_column_done is True. But first_column_done only becomes True once ':' is encountered in a word, i.e.
if ':' in w:
first_column_done = True
hence, for each iteration after first_column_done turns True, index keeps incrementing until it reaches 6, which is out of bounds for data_row (valid indices are 0 through 5).
def row(l):
l = l.split()
number_columns = 6
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w # <-- the IndexError is raised here
In other words, you get this error for each line that contains more than 6 - index words after the first occurrence of ':' within a word in that line.
Fix:
Use split(':'), a list comprehension, and Python's conditional (ternary) expression.
def row(l):
row = [ col.strip() for col in l.split(':') ]
row[2:] = row[2].split()
return [ row[i] if i < len(row) else '' for i in range(6) ]
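For example, calling the rewritten row() on a made-up line (not taken from the USDA file) with two ':' separators, which the row[2] indexing in the fix assumes, always yields exactly six columns, padded with empty strings when fewer values are present:
# Hypothetical input line, just to show the pad-to-six behaviour
line = "Corn : dollars per bushel :   3.45   3.60   3.51"
print(row(line))
# ['Corn', 'dollars per bushel', '3.45', '3.60', '3.51', '']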
I read an article named Unsupervised Personality Recognition for Social Network Sites about personality extraction from text. There are 22 features and 4 classes representing 4 personalities. By counting the features in a text we can tell which class the sentence belongs to, which means we can know the personality behind the sentence.
The article provides the correlation between every feature and every class. So the score of a class is: the feature's value, minus the mean of that feature's values, divided by their standard deviation, multiplied by the correlation coefficient provided by the article. Then we can judge whether the sentence belongs to the class from its score. I set a threshold to improve the accuracy, but it is still not good enough. My result is around 50-60% accuracy and I don't know how to improve it. Can anyone help me?
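In other words, each class score is a correlation-weighted sum of z-scores. Here is a minimal sketch of that rule, where feature_values, means, stds and correlations are placeholders for the per-feature quantities rather than variables from the code below:
# Sketch of the scoring rule described above; all four inputs are placeholders.
def class_score(feature_values, means, stds, correlations):
    score = 0.0
    for x, mu, sigma, r in zip(feature_values, means, stds, correlations):
        z = (x - mu) / sigma   # standardize the feature
        score += r * z         # weight by the article's correlation coefficient
    return score
# A text is assigned to the class when class_score(...) exceeds the threshold.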
import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev
with open('mypersonality_final.csv', newline = '') as csvfile:
reader = csv.reader(csvfile)
test = []
for w in reader:
test.append(w)
def all_punctuation(text):
punctuations = ['.', ',', ';', ':']
count = 0
for w in text:
if w in punctuations:
count += 1
return count
def count_commas(text):
count = 0
for w in text:
if w == ',':
count += 1
return count
def count_pattern(text):
grammar = RegexpTokenizer(r'\#')
pattern = grammar.tokenize(text)
return len(pattern)
def count_exclamation(text):
grammar = RegexpTokenizer(r'\!')
pattern = grammar.tokenize(text)
return len(pattern)
def ex_links(text):
grammar = RegexpTokenizer(r'http?\S+\w(?:(?:\/[^\s/]*))*|www\.\S+\w(?:(?:\/[^\s/]*))*|ftp\S+\w(?:(?:\/[^\s/]*))*')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_sinpronouns(text):
sigpronouns = ['i', 'me', 'my', 'mine', 'we']
count = 0
for w in text:
if w.lower() in sigpronouns:
count += 1
return count
def negative_particle(text):
with open('negative-words.txt') as neg:
neg = neg.read()
words = nltk.word_tokenize(neg)
grammar = RegexpTokenizer(r'\w+')
nopunctuation = grammar.tokenize(text)
count = 0
for w in nopunctuation:
if w.lower() in words:
count += 1
return count
def negative_emoticon(text):
grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
emoticons = grammar.tokenize(text)
return len(emoticons)
def numbers(text):
grammar = RegexpTokenizer(r'\d+')
pattern = grammar.tokenize(text)
return len(pattern)
def parenthesis(text):
pat = '\([^)]*\)'
parent = re.findall(pat, text)
return len(parent)
def positive_emoticon(text):
grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
emoticons = grammar.tokenize(text)
return len(emoticons)
def prepositions(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if w[1] == 'IN':
count += 1
return count
def pronouns(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if (w[1] == 'PRP' or w[1] == 'PRP$' or w[1] == 'WP' or w[1] == 'WPR$'):
count += 1
return count
def count_question(text):
grammar = RegexpTokenizer(r'\?')
pattern = grammar.tokenize(text)
return len(pattern)
def long_words(text):
grammar = RegexpTokenizer(r'\w{7,}')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pronouns(text):
firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in firstpronouns:
count += 1
return count
def swears_count(text):
with open('swears.txt') as test:
words = test.read()
swears = re.sub(r'[^\w+\s]+', '', words)
swears = swears.split('\n')
count = 0
for w in text:
if w.lower() in swears:
count += 1
return count
def typetoken_ratio(text):
typed = set(text)
token = text
ratio = len(typed)/len(token)
return ratio
def count_words(text):
grammar = RegexpTokenizer(r'\w+')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pluralpronouns(text):
pluralpronouns = ['we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in pluralpronouns:
count += 1
return count
def sec_pronouns(text):
secpronouns = ['you', 'your', 'yours']
count = 0
for w in text:
if w.lower() in secpronouns:
count += 1
return count
def mean_freq(text):
## grammar = RegexpTokenizer(r'\w+')
words = word_tokenize(text)
wordsl = []
for w in words:
wordsl.append(w.lower())
unique = set(wordsl)
return (len(wordsl)/len(unique))
def mean_std(test):
f1 = []
f2 = []
f3 = []
f4 = []
f5 = []
f6 = []
f7 = []
f8 = []
f9 = []
f10 = []
f11 = []
f12 = []
f13 = []
f14 = []
f15 = []
f16 = []
f17 = []
f18 = []
f19 = []
f20 = []
f21 = []
f22 = []
for w in test[1:]:
f1.append(all_punctuation(word_tokenize(w[1])))
f2.append(count_commas(word_tokenize(w[1])))
f3.append(count_pattern(w[1]))
f4.append(count_exclamation(w[1]))
f5.append(ex_links(w[1]))
f6.append(firs_sinpronouns(word_tokenize(w[1])))
f7.append(negative_particle(w[1]))
f8.append(negative_emoticon(w[1]))
f9.append(numbers(w[1]))
f10.append(parenthesis(w[1]))
f11.append(positive_emoticon(w[1]))
f12.append(prepositions(word_tokenize(w[1])))
f13.append(pronouns(word_tokenize(w[1])))
f14.append(count_question(w[1]))
f15.append(long_words(w[1]))
f16.append(firs_pronouns(word_tokenize(w[1])))
f17.append(swears_count(word_tokenize(w[1])))
f18.append(typetoken_ratio(word_tokenize(w[1])))
f19.append(count_words(w[1]))
f20.append(firs_pluralpronouns(word_tokenize(w[1])))
f21.append(sec_pronouns(word_tokenize(w[1])))
f22.append(mean_freq(w[1]))
value = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22]
mean1 = []
stdev1 = []
for a in value:
mean1.append(round(mean(a),2))
stdev1.append(round(stdev(a),2))
return (mean1, stdev1)
##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()
savedfile = open('sample_value.pickle', 'rb')
trained = pickle.load(savedfile)
savedfile.close()
def evaluation(test):
ne = 0
ns = 0
na = 0
nc = 0
no = 0
for w in test[1:]:
z1 = (all_punctuation(word_tokenize(w[1])) - trained[0][0])/(trained[1][0])
z2 = (count_commas(word_tokenize(w[1]))- trained[0][1])/(trained[1][1])
z3 = (count_pattern(w[1]) - trained[0][2])/(trained[1][2])
z4 = (count_exclamation(w[1]) - trained[0][3])/(trained[1][3])
z5 = (ex_links(w[1]) - trained[0][4])/(trained[1][4])
z6 = (firs_sinpronouns(word_tokenize(w[1]))- trained[0][5])/(trained[1][5])
z7 = (negative_particle(w[1])-trained[0][6])/(trained[1][6])
z8 = (negative_emoticon(w[1]) - trained[0][7])/(trained[1][7])
z9 = (numbers(w[1])-trained[0][8])/(trained[1][8])
z10 = (parenthesis(w[1])-trained[0][9])/(trained[1][9])
z11 = (positive_emoticon(w[1])-trained[0][10])/(trained[1][10])
z12 = (prepositions(word_tokenize(w[1]))-trained[0][11])/(trained[1][11])
z13 = (pronouns(word_tokenize(w[1]))-trained[0][12])/(trained[1][12])
z14 = (count_question(w[1])-trained[0][13])/(trained[1][13])
z15 = (long_words(w[1])-trained[0][14])/(trained[1][14])
z16 = (firs_pronouns(word_tokenize(w[1]))-trained[0][15])/(trained[1][15])
z17 = (swears_count(word_tokenize(w[1]))-trained[0][16])/(trained[1][16])
z18 = (typetoken_ratio(word_tokenize(w[1]))-trained[0][17])/(trained[1][17])
z19 = (count_words(w[1])-trained[0][18])/(trained[1][18])
z20 = (firs_pluralpronouns(word_tokenize(w[1]))-trained[0][19])/(trained[1][19])
z21 = (sec_pronouns(word_tokenize(w[1]))-trained[0][20])/(trained[1][20])
z22 = (mean_freq(w[1])-trained[0][21])/(trained[1][21])
E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
O = -10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22
if E>0.65:
if w[7] =='y':
ne+=1
if E<0.65:
if w[7]=='n':
ne+=1
if S>0.75:
if w[8] == 'y':
ns +=1
if S<0.75:
if w[8] == 'n':
ns+=1
if A>0.005:
if w[9]=='y':
na+=1
if A<0.005:
if w[9]=='n':
na+=1
if C>0.58:
if w[10]=='y':
nc+=1
if C<0.58:
if w[10]=='n':
nc+=1
if O>(-0.05):
if w[11]=='y':
no+=1
if O<(-0.05):
if w[11]=='n':
no+=1
print (round((ne/9917)*100,2), round((ns/9917)*100,2),round((na/9917)*100,2),round((nc/9917)*100,2),round((no/9917)*100,2))
evaluation(test)
The sample data is shown in a screenshot (image not reproduced here).
I am working on RNA sequence matching.
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']
I am matching each sub_seq against seq; a matched sub_seq is printed under seq, and positions with no match are filled with dashes. The output looks like this:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
I tried to use a dictionary to do this:
index_dict = {}
for i in xrange(len(sub_seq)):
index_dict[seq.find(sub_seq[i])] = {}
index_dict[seq.find(sub_seq[i])]['sequence'] = sub_seq[i]
index_dict[seq.find(sub_seq[i])]['end_index'] = seq.find(sub_seq[i]) + len(sub_seq[i]) - 1
I cannot figure out the algorithm to do the alignment; any help will be appreciated!
seq_l = len(seq)
for ele in sub_seq:
start = seq.find(ele)
ln = len(ele)
if start != -1:
end = start + ln
print("-" * start + ele + "-"*(seq_l- end))
else:
print("-" * seq_l)
-----UGUCAG--------
--------CAGUCA-----
UCAGCU-------------
---------------GAUC
Not sure where UCAGCU--CAGUCA-GAUC comes from, as you are only using a single subsequence at a time in your code.
Assuming you'll let me change your index_dict slightly, consider:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']
index_dict = {}
for i in xrange(len(sub_seq)):
index_dict[seq.find(sub_seq[i])] = {
'sequence': sub_seq[i],
'end_index': seq.find(sub_seq[i]) + len(sub_seq[i]) # Note this changed
}
sorted_keys = sorted(index_dict)
lines = []
while True:
if not sorted_keys: break
line = []
next_index = 0
for k in sorted_keys:
if k >= next_index:
line.append(k)
next_index = index_dict[k]['end_index']
# Remove keys we used, append line to lines
for k in line: sorted_keys.remove(k)
lines.append(line)
# Build output lines
olines = []
for line in lines:
oline = ''
for k in line:
oline += '-' * (k - len(oline)) # Add dashes before subseq
oline += index_dict[k]['sequence'] # Add subsequence
oline += '-' * (len(seq) - len(oline)) # Add trailing dashes
olines.append(oline)
print seq
print '\n'.join(olines)
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------
Note this is pretty verbose, and could be condensed a bit. The while True and for line in lines loops could probably be merged into one, but it should help explain one possible approach.
Edit: This is one way you might join the last two loops:
seq = 'UCAGCUGUCAGUCAUGAUC'
sub_seq =['UGUCAG', 'CAGUCA', 'UCAGCU','GAUC']
index_dict = {}
for i in xrange(len(sub_seq)):
index_dict[seq.find(sub_seq[i])] = {
'sequence': sub_seq[i],
'end_index': seq.find(sub_seq[i]) + len(sub_seq[i]) # Note this changed
}
sorted_keys = sorted(index_dict)
lines = []
while True:
if not sorted_keys: break
line = ''
next_index = 0
keys_used = []
for k in sorted_keys:
if k >= next_index:
line += '-' * (k - len(line)) # Add dashes before subseq
line += index_dict[k]['sequence'] # Add subsequence
next_index = index_dict[k]['end_index'] # Update next_index
keys_used.append(k) # Mark key as used
for k in keys_used: sorted_keys.remove(k) # Remove used keys
line += '-' * (len(seq) - len(line)) # Add trailing dashes
lines.append(line) # Add line to lines
print seq
print '\n'.join(lines)
Output:
UCAGCUGUCAGUCAUGAUC
UCAGCU--CAGUCA-GAUC
-----UGUCAG--------