List assignment index out of range in Python motif-finding code

I keep getting an
IndexError: list assignment index out of range.
The error occurs on line 78.
This code is written to find DNA motifs for bioinformatics.
How can we solve this error?
Here is my code:
from math import log

class MotifMedianFinding(object):
    def __init__(self, input_file):
        super(MotifMedianFinding, self).__init__()
        self.input_lines = open("C:\\Users\\A.Khassawneh\\Desktop\\fasta.txt")

    def output(self):
        #main method to call both functions
        sequences = {}
        for line in self.input_lines:
            if '>' in line:
                sequences[line] = self.input_lines.next()
        for label, seq in sequences.iteritems():
            print "DNA:" + seq + "\n\n\n\n\n"
            median = self.median_string(seq, 5, 5, len(seq))
            self.motif(seq, median, 5, len(seq))

    def median_string(self, dna, t, n, l):
        #bound and search method of calulating median string
        start_pos = start_pos = [1,1,1,1,1]
        best_dist = 1000000000
        i = 1
        while i > 0:
            if i < l:
                prefix = str(start_pos)
                opt_dist = self.hamming_score(prefix, dna)
                if opt_dist > best_dist:
                    s, i = self.bypass(start_pos, i, l, 4)
                else:
                    s, i = self.next_vertex(start_pos, i, l, 4)
            else:
                word = str(s)
                if self.hamming_score(word, dna) < best_dist:
                    best_dist = self.hamming_score(word, dna)
                    bestword = word
                s, i = self.next_vertex(start_pos, i, l, 4)
        print "Best Word: %s (tot_dis = %s)" % (bestword, best_dist)
        return bestword

    def motif(self, dna, t, n, l):
        #bound and search method of calculating motif
        start_pos = [1,1,1,1,1]
        best_score = 0
        i = 1
        while 1 > 0:
            if i < t:
                opt_score = Score(s, i, dna) + (t-1) * l
                if opt_score < best_score:
                    start_pos, i = self.bypass(start_pos, i, t, n-l+1)
                else:
                    start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
            else:
                if self.score(start_pos, dna) > best_score:
                    best_score = self.score(start_pos)
                    best_motif = str(s)
                start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
        print "motif consensus string: %s (consensus_score = %s) " % (best_motif, best_score)
        print "motif positions/string s=(s1..st): %s" % ', '.join(start_pos)
        return best_motif

    def bypass(vertex, level, l, k):
        #skip uncessary calculations in the tree
        j = level
        for ind in xrange(j, 1, -1):
            if a[j] < k:
                a[j] = a[j] + 1
                return vertex, j
        return vertex, 0

    def next_vertex(self, vertex, level, L, k):
        #transverse the tree of a strand of genes
        if level < L:
            vertex[level+1] = 1
            return vertex, level+1
        else:
            j = L
            for ind in xrange(j, 1, -1):
                if vertex[ind] < k:
                    vertex[j] = vertex[j] + 1
                    return vertex, j
        return vertex, 0

    def score(start_pos):
        # biggest score of motif
        total = 0
        for i in start_pos:
            total += i
        return total

    def hamming_score(self, s, dna):
        pass

motif_median = MotifMedianFinding('HMP-part.fa')
motif_median.output()

xrange(x, y) goes from x to y-1 (x, x+1, ..., y-1). In your code it would have been fine to do xrange(1, j), because that wouldn't have included j. But when you swap it to xrange(j, 1, -1), you go (j, j-1, ..., 2).
Basically, you probably need to change it to xrange(j-1, 0, -1), depending on your intended range.
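For instance, a quick check of the two index sequences (Python 3's range shown; Python 2's xrange produces the same indices):

j = 5
print(list(range(j, 1, -1)))      # [5, 4, 3, 2] -- starts at j itself
print(list(range(j - 1, 0, -1)))  # [4, 3, 2, 1] -- never reaches j, so vertex[ind] stays in range when j == len(vertex)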

Related

k-means algorithm in Python

Whenever k = 2, the code runs in an infinite loop.
If k > 2, it sets all but one of the centroid locations to (0, 0).
I've reviewed it a couple of times and it doesn't seem like there are any obvious errors; it's probably some sort of logic flaw. The code starts with a class whose methods initialize the centroids, calculate the Euclidean distance, and reassign each centroid to the average position of the points in its cluster. It then runs a loop that keeps reassigning and recalculating distances until two successive lists of assignments are equal, and then plots the result.
# imports needed by the code below (not included in the original post)
import math
import random
import matplotlib.pyplot as plt

class Kmeans:
    def __init__(self, K, dataset, centroids, sorting):
        self.K = K
        self.dataset = dataset
        self.centroids = centroids
        self.sorting = sorting

    #sets starting position of centroids
    def initializeCentroids(self):
        bigX = 0
        bigY = 0
        self.centroids = []
        for i in self.dataset:
            if i[0] > bigX:
                bigX = i[0]
            if i[1] > bigY:
                bigY = i[1]
        for q in range(self.K):
            self.centroids.append([random.randint(0, bigX), random.randint(0, bigY)])
        plt.scatter((self.centroids[0][0], self.centroids[1][0]), (self.centroids[0][1], self.centroids[1][1]))
        return self.centroids

    #calculates euclidean distance
    def calcDistance(self):
        self.sorting = []
        for w in self.dataset:
            print(w)
            distances = []
            counter = 0
            for centr in self.centroids:
                distances.append(math.sqrt(abs((centr[0] - w[0] * centr[0] - w[0]) + (centr[1] - w[1] * centr[1] - w[1]))))
                counter += 1
                if counter > 0:
                    try:
                        if distances[0] > distances[1]:
                            distances.pop(0)
                        if distances[1] > distances[0]:
                            distances.pop(1)
                            counter -= 1
                    except IndexError:
                        pass
            self.sorting.append([w, counter, distances[0]])
        return self.sorting

    def reassignCentroids(self):
        counter3 = 1
        for r in range(len(self.centroids)):
            positionsX = []
            positionsY = []
            for t in self.sorting:
                if t[1] == counter3:
                    positionsX.append(t[0][0])
                    positionsY.append(t[0][1])
            population = len(positionsY)
            if population == 0:
                population = 1
            self.centroids.append([sum(positionsX) / population, sum(positionsY) / population])
            counter3 += 1
            self.centroids.pop(0)
        return

k = 4
dataSetSize = input("Enter the amount of tuples you want generated: ")
data_set = []
for o in range(int(dataSetSize)):
    data_set.append((random.randint(0, 1000), random.randint(0, 1000)))
attempt = Kmeans(k, data_set, 0, 0)
attempt.initializeCentroids()
xvals = []
yvals = []
sortCompare = []
# plots
for p in data_set:
    xvals.append(p[0])
    yvals.append(p[1])
running = True
while running:
    if len(sortCompare) > 1:
        centroidChoice0 = []
        centroidChoice1 = []
        for p in sortCompare[0]:
            centroidChoice0.append(p[1])
        for d in sortCompare[1]:
            centroidChoice1.append(d[1])
        print(centroidChoice1)
        print(attempt.centroids)
        if centroidChoice1 == centroidChoice0:
            running = False
            for m in attempt.centroids:
                plt.scatter((attempt.centroids[0][0], attempt.centroids[1][0]), (attempt.centroids[0][1], attempt.centroids[1][1]))
            running = False
        sortCompare.pop(0)
    attempt.calcDistance()
    sortCompare.append(attempt.sorting)
    attempt.reassignCentroids()
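For comparison, here is a minimal sketch of how the per-point assignment step is commonly written: compute the plain Euclidean distance to each centroid and keep the index of the nearest one (nearest_centroid is an illustrative helper, not part of the code above):

import math

def nearest_centroid(point, centroids):
    # illustrative helper: squared differences summed, then square-rooted,
    # returning the index of the closest centroid and its distance
    best_index, best_dist = 0, float("inf")
    for idx, c in enumerate(centroids):
        dist = math.sqrt((c[0] - point[0]) ** 2 + (c[1] - point[1]) ** 2)
        if dist < best_dist:
            best_index, best_dist = idx, dist
    return best_index, best_dist

print(nearest_centroid((1, 2), [(0, 0), (10, 10)]))   # (0, 2.236...)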

Counting the shortest sequences of flips in pancake sorting

Finding a single shortest sequence of flips in pancake sorting is already NP-hard, yet I'd like to find all of them and count them.
That is, for each permutation I'd like to find every sequence of prefix reversals that restores the identity and is no longer than the shortest one.
Here's what I've got so far:
#!/bin/env python3
# coding: utf-8

from math import factorial
import itertools
from multiprocessing import cpu_count, Manager, Pool

import numpy
import scipy.sparse


def flip(x, value):
    return tuple(value[:x][::-1] + value[x:])


def rank(perm):
    n = len(perm)
    fact = factorial(n)
    r = 0
    for i in range(n):
        fact //= n - i
        r += len([x for x in perm[i:] if x < perm[i]]) * fact
    return r


def unrank(i, items):
    its = items[:]
    perm = []
    n = len(items)
    fact = factorial(n)
    r = i % fact
    while its:
        fact //= n
        c, r = divmod(r, fact)
        perm.append(its.pop(c))
        n -= 1
    return tuple(perm)


def get_colex_row(r, n, _fact):
    row = scipy.sparse.dok_matrix((
        1, _fact[n - 1]), dtype=numpy.int8)
    perm = unrank(r, [i for i in range(n)])
    for i in range(n):
        column = r - r % _fact[i] + rank(perm[:-i - 2:-1])
        row[0, column] = i + 1
    return row


def get_colex_matrix(n):
    fact = [factorial(i) for i in range(1, n + 1)]
    m = scipy.sparse.dok_matrix(
        (fact[n - 1], fact[n - 1]), dtype=numpy.int8)
    items = [_ for _ in range(1, n + 1)]
    for r in range(fact[n - 1]):
        row = get_colex_row(r, n, fact)
        m[r] = row
    return m


def get_distance(n, items):
    nfact = factorial(n)
    stack = {unrank(i, items) for i in range(nfact)}
    m = get_colex_matrix(n)
    distance = {unrank(nfact - 1, items)[::-1]: 0}
    new_distance = {nfact - 1}
    d = 0
    while distance.keys() != stack:
        new_new_distance = set()
        d += 1
        for visiting in new_distance:
            for i in range(2, n + 1):
                key_index = m[visiting].tolist().index(i)
                key = unrank(key_index, items)[::-1]
                if key not in distance:
                    distance[key] = d
                    new_new_distance.add(key_index)
        new_distance = new_new_distance
    return distance


def get_paths_serial(items):
    n = len(items)
    nfact = factorial(n)
    stack = {unrank(i, items) for i in range(nfact)}
    m = get_colex_matrix(n)
    distance = {unrank(nfact - 1, items)[::-1]: {()}}
    new_distance = {nfact - 1}
    while distance.keys() != stack:
        new_new_distance = set()
        for visiting_index in new_distance:
            for i in range(2, n + 1):
                key_index = m[visiting_index].tolist().index(i)
                key = unrank(key_index, items)[::-1]
                visiting = unrank(visiting_index, items)[::-1]
                paths = distance[visiting]
                prev_sample = next(iter(paths))
                if key not in distance:
                    distance[key] = {path + (i,) for path in paths}
                    new_new_distance.add(key_index)
                else:
                    curr_sample = next(iter(distance[key]))
                    if len(prev_sample) + 1 < len(curr_sample):
                        print("Shouldn't happen!")
                        distance[key] = {path + (i,) for path in paths}
                    elif len(prev_sample) + 1 == len(curr_sample):
                        distance[key] |= {path + (i,) for path in paths}
                    else:
                        # not relevant
                        pass
        new_distance = new_new_distance
    return distance


def _worker(ns, index):
    row = get_colex_row(index, ns.n, ns.fact).toarray().tolist()[0]
    visiting = unrank(index, ns.items)[::-1]
    paths = ns.distance[visiting]
    prev_sample = next(iter(paths))
    out = {}
    my_new_distance = set()
    for i in range(2, ns.n + 1):
        key_index = row.index(i)
        key = unrank(key_index, ns.items)[::-1]
        if key not in ns.distance:
            out[key] = {path + (i,) for path in paths}
            my_new_distance.add(key_index)
        else:
            curr_sample = next(iter(ns.distance[key]))
            if len(prev_sample) + 1 < len(curr_sample):
                print("Shouldn't happen!")
                out[key] = {path + (i,) for path in paths}
            elif len(prev_sample) + 1 == len(curr_sample):
                out[key].update(path + (i,) for path in paths)
    return my_new_distance, out


def get_paths_parallel(items):
    n = len(items)
    fact = [factorial(i) for i in range(1, n + 1)]
    distance = {unrank(fact[n - 1] - 1, items)[::-1]: {()}}
    stack = {unrank(i, items) for i in range(fact[n - 1])}
    already_visited = set()
    visiting = {fact[n - 1] - 1}
    mgr = Manager()
    namespace = mgr.Namespace()
    namespace.fact = fact
    namespace.distance = distance
    namespace.items = items
    namespace.n = n
    with Pool(2 * cpu_count()) as pool:
        while distance.keys() != stack:
            result = pool.starmap(_worker, ((namespace, job)
                                            for job in visiting))
            visiting = set()
            for next_to_visit, visited in result:
                visiting |= next_to_visit
                for k, v in visited.items():
                    if k in distance:
                        distance[k] |= v
                    else:
                        distance[k] = v
            visiting -= already_visited
            already_visited |= visiting
            namespace.distance = distance
    return distance


def colex(value, other):
    for i in range(len(value) - 1, 0, -1):
        if value[i] == other[i]:
            continue
        return value[i] > other[i]
    return False


def ordered_by(order_cmp):
    'Convert a cmp= function into a key= function'
    if order_cmp is None:
        return None

    class K(object):
        def __init__(self, obj):
            self.value = obj

        def __gt__(self, other):
            if len(self.value) != len(other.value):
                assert "Not the same length"
            return order_cmp(self.value, other.value)
    return K


def get_ordered(n, order):
    return sorted(itertools.permutations(range(1, n + 1)),
                  key=ordered_by(order))


def get_matrix(n, order=None):
    stack = get_ordered(n, order)
    m = numpy.zeros((len(stack), len(stack)), numpy.int8)
    for i, s in enumerate(stack):
        for x in range(1, n + 1):
            m[i, stack.index(flip(x, s))] = x
    return m
I'm not sure what I'm doing wrong, but get_paths_parallel runs slower than get_paths_serial. Please help!
I really should (and probably will soon) document my code better.
For the time being, I'll say a few additional words:
It uses co-lexicographic ordering to rank the permutations and to find the indices in the adjacency matrix, where I store the length of the flip that transforms one permutation into another, e.g. A(i, j) = k if performing a prefix reversal of length k on the permutation with rank i yields the permutation with rank j. To save memory, instead of storing the whole matrix I generate its rows on demand and limit access by excluding already-visited ones; I also use scipy.sparse.dok_matrix for the same reason.
Other than that, it simply floods the graph until all permutations are reached.
Some functions don't use all (or any) of the considerations above, like get_matrix, but they are included only to validate that the others, like get_colex_matrix, work as intended.
I'm creating the key function in a slightly convoluted manner, but that's just because I tried other orderings before I settled on co-lex.
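For readers unfamiliar with the ranking scheme, a small usage sketch of the rank/unrank/flip helpers defined above (values worked out by hand for n = 4; rank and unrank are inverses of each other):

items = [0, 1, 2, 3]          # n = 4, so ranks run from 0 to 23
perm = unrank(10, items)
print(perm)                   # (1, 3, 0, 2)
print(rank(perm))             # 10 -- unrank followed by rank is the identity
print(flip(2, perm))          # (3, 1, 0, 2) -- reverse the length-2 prefix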
Using multiprocessing.Manager to share data between processes slows them down.
The solution is to copy the needed data into each process's memory space (passing it as an argument) or to use global variables for it.
Also, using scipy.sparse.dok_matrix is overkill; a dict would do.
I'll grab the literature I've found on the subject and link it here later.
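A minimal sketch of the "copy into each process" alternative: hand each worker its own copy of the read-only data through a Pool initializer instead of routing every attribute access through a Manager proxy (the names below are illustrative, not taken from the code above):

from multiprocessing import Pool

_shared = {}

def _init_worker(fact, items, n, distance):
    # runs once per worker process; stores plain, process-local copies
    _shared.update(fact=fact, items=items, n=n, distance=distance)

def _job(index):
    # workers read their private copy in _shared instead of a Manager namespace
    return index, len(_shared["distance"])

if __name__ == "__main__":
    with Pool(4, initializer=_init_worker,
              initargs=([1, 2, 6], [1, 2, 3], 3, {"dummy": 0})) as pool:
        print(pool.map(_job, range(5)))   # [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]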

Index out of Range while counting inversions

I am trying to count the number of inversions in a .txt file given as a command-line argument. Whenever it gets to the line that actually checks for an inversion, I get an index out of range error. I have tried writing down the position and value of i and j on each loop iteration, but I can't figure out how to stop it from going out of range. Here is the error:
File "./counting_inversions.py", line 31, in sortAndCountSplit
if (l[i] <= r[j]):
IndexError: list index out of range
Does anyone know a solution?
import argparse


def readFile():
    arg_parser = argparse.ArgumentParser(description='Print the given input file.')
    arg_parser.add_argument('filename', help='path to a file')
    args = arg_parser.parse_args()
    with open(args.filename, 'r') as in_file:
        n = int(in_file.readline())
        vals = [int(val) for val in in_file.readlines()]
        return([n, vals])


def sortAndCount(invList):
    if (len(invList) == 1):
        return (invList, 0)
    else:
        midpoint = int(len(invList) / 2)
        left, lc = sortAndCount(invList[:midpoint])
        right, rc = sortAndCount(invList[midpoint:])
        arr, sc = sortAndCountSplit(left, right)
        return (arr, (lc + rc + sc))


def sortAndCountSplit(l, r):
    s = []
    i = j = inversions = 0
    for k in range((len(l) + len(r))):
        if ((i < len(l)) and (l[i] <= r[j]) or j >= len(r)):
            s.append(l[i])
            i += 1
        else:
            s.append(r[j])
            j += 1
            inversions += len(l) - i
    return (s, inversions)


def main():
    file = readFile()
    print(sortAndCount(file[1]))


main()
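For reference, a standard merge-and-count step checks that both indices are in range before comparing heads, along these lines (countSplit is an illustrative stand-in, not the function above):

def countSplit(l, r):
    # illustrative merge-and-count: take from l while its head is <= the head
    # of r; otherwise take from r and count the remaining elements of l as inversions
    s = []
    i = j = inversions = 0
    while i < len(l) and j < len(r):
        if l[i] <= r[j]:
            s.append(l[i])
            i += 1
        else:
            s.append(r[j])
            j += 1
            inversions += len(l) - i
    s.extend(l[i:])
    s.extend(r[j:])
    return s, inversions

print(countSplit([1, 3, 5], [2, 4, 6]))   # ([1, 2, 3, 4, 5, 6], 3)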

Knapsack branch and bound gives wrong result

I have converted the code given at this link into a Python version. The code is supposed to calculate the maximum value that can be packed into a knapsack of capacity W. I have attached the code below:
#http://www.geeksforgeeks.org/branch-and-bound-set-2-implementation-of-01-knapsack/
from queue import Queue

class Node:
    def __init__(self):
        self.level = None
        self.profit = None
        self.bound = None
        self.weight = None

    def __str__(self):
        return "Level: %s Profit: %s Bound: %s Weight: %s" % (self.level, self.profit, self.bound, self.weight)

def bound(node, n, W, items):
    if(node.weight >= W):
        return 0
    profit_bound = int(node.profit)
    j = node.level + 1
    totweight = int(node.weight)
    while ((j < n) and (totweight + items[j].weight) <= W):
        totweight += items[j].weight
        profit_bound += items[j].value
        j += 1
    if(j < n):
        profit_bound += (W - totweight) * items[j].value / float(items[j].weight)
    return profit_bound

Q = Queue()

def KnapSackBranchNBound(weight, items, total_items):
    items = sorted(items, key=lambda x: x.value/float(x.weight), reverse=True)
    u = Node()
    v = Node()
    u.level = -1
    u.profit = 0
    u.weight = 0
    Q.put(u)
    maxProfit = 0
    while not Q.empty():
        u = Q.get()
        if u.level == -1:
            v.level = 0
        if u.level == total_items - 1:
            continue
        v.level = u.level + 1
        v.weight = u.weight + items[v.level].weight
        v.profit = u.profit + items[v.level].value
        if (v.weight <= weight and v.profit > maxProfit):
            maxProfit = v.profit
        v.bound = bound(v, total_items, weight, items)
        if (v.bound > maxProfit):
            Q.put(v)
        v.weight = u.weight
        v.profit = u.profit
        v.bound = bound(v, total_items, weight, items)
        if (v.bound > maxProfit):
            # print items[v.level]
            Q.put(v)
    return maxProfit

if __name__ == "__main__":
    from collections import namedtuple
    Item = namedtuple("Item", ['index', 'value', 'weight'])
    input_data = open("test.data").read()
    lines = input_data.split('\n')
    firstLine = lines[0].split()
    item_count = int(firstLine[0])
    capacity = int(firstLine[1])
    print "running from main"
    items = []
    for i in range(1, item_count+1):
        line = lines[i]
        parts = line.split()
        items.append(Item(i-1, int(parts[0]), float(parts[1])))
    kbb = KnapSackBranchNBound(capacity, items, item_count)
    print kbb
The program is supposed to calculate a value of 235 for the following items inside the file test.data:
5 10
40 2
50 3.14
100 1.98
95 5
30 3
The first line gives the number of items and the knapsack capacity. The lines below it give the value and weight of each item. Items are created with a namedtuple and sorted by value/weight. For this problem I am getting 135 instead of 235. What am I doing wrong here?
EDIT:
I have since solved the problem of finding the correct items with branch and bound. If needed, one can check it here.
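As a sanity check on the expected answer, here is a tiny brute-force enumeration over all subsets (fine for 5 items; purely for verification, not part of the branch-and-bound code):

from itertools import combinations

items = [(40, 2), (50, 3.14), (100, 1.98), (95, 5), (30, 3)]  # (value, weight)
capacity = 10
best = max(sum(v for v, w in subset)
           for r in range(len(items) + 1)
           for subset in combinations(items, r)
           if sum(w for v, w in subset) <= capacity)
print(best)   # 235 (values 40, 100 and 95; total weight 8.98)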
The problem is that you're inserting multiple references to the same Node() object into your queue. The fix is to initialize two new v objects in each iteration of the while loop, as follows:
while not Q.empty():
    u = Q.get()
    v = Node()  # Added line
    if u.level == -1:
        v.level = 0
    if u.level == total_items - 1:
        continue
    v.level = u.level + 1
    v.weight = u.weight + items[v.level].weight
    v.profit = u.profit + items[v.level].value
    if (v.weight <= weight and v.profit > maxProfit):
        maxProfit = v.profit
    v.bound = bound(v, total_items, weight, items)
    if (v.bound > maxProfit):
        Q.put(v)
    v = Node()  # Added line
    v.level = u.level + 1  # Added line
    v.weight = u.weight
    v.profit = u.profit
    v.bound = bound(v, total_items, weight, items)
    if (v.bound > maxProfit):
        # print(items[v.level])
        Q.put(v)
Without these reinitializations, you're modifying the v object that you already inserted into the queue.
This is different from C++, where the Node objects are values that are implicitly copied into the queue, which avoids aliasing problems such as these.
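The aliasing issue in a nutshell, as a toy illustration separate from the knapsack code: putting an object into a queue and then mutating it changes what the queue holds, whereas an explicit copy (or a fresh Node(), as in the fix) does not:

import copy
from queue import Queue

class Box:
    def __init__(self):
        self.value = 0

q = Queue()
shared = Box()
q.put(shared)
shared.value = 99          # mutates the object already sitting in the queue
print(q.get().value)       # 99, not 0 -- the queue holds a reference

fresh = Box()
q.put(copy.copy(fresh))    # an explicit copy decouples the queued item
fresh.value = 99
print(q.get().value)       # 0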

How to get the position of an opening/closing HTML tag in Python

How can I solve this in Python 3, with what library, and with what sample code?
I have an HTML file, and at position Line:Col I am in the middle of an HTML tag
<table ......>. How can I get the positions of the <table> tag's edges (the brackets < and >) and the positions of the edges of its matching </table> tag?
(Note: several table tags may be nested inside one another.)
As said in this SO answer, you should not use a regex to parse an HTML file, as the standard is highly irregular. You should instead use an HTML parsing library like html.parser: it offers HTMLParser.getpos(), which returns the line number and offset of the tag currently being handled.
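For a quick feel of getpos() on its own (TagLocator is just an illustrative name), note that it reports where a tag starts, which is why the code below also patches in an end position:

from html.parser import HTMLParser

class TagLocator(HTMLParser):
    def handle_starttag(self, tag, attrs):
        line, col = self.getpos()   # position of the '<' that opens this tag
        print("start <%s> at line %d, col %d" % (tag, line, col))

    def handle_endtag(self, tag):
        line, col = self.getpos()   # position of the '<' of the closing tag
        print("end </%s> at line %d, col %d" % (tag, line, col))

TagLocator().feed("<html>\n  <table><tr><td>x</td></tr></table>\n</html>")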
The following, fuller version collects the coordinates of each tag with html.parser; I monkeypatch the goahead function with a simple modification that calls a custom get_endpos method, so the end position of each tag is recorded as well:
from html.parser import HTMLParser, starttagopen
from html import unescape

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.start_tags = []
        self.end_tags = []
        self.last_append = []

    def handle_starttag(self, tag, attrs):
        self.start_tags.append((tag, (self.getpos()[0], self.getpos()[1]),))
        self.last_append = self.start_tags

    def handle_endtag(self, tag):
        self.end_tags.append((tag, (self.getpos()[0], self.getpos()[1]),))
        self.last_append = self.end_tags

    def get_endpos(self):
        self.last_append[-1] = self.last_append[-1] + ((self.getpos()[0], self.getpos()[1]),)

    def get_tags(self):
        return self.start_tags, self.end_tags

    def _reset(self):
        HTMLParser.reset(self)
        self.start_tags = []
        self.end_tags = []

parser = MyHTMLParser()
import re
# the copied goahead below also needs these module-level regexes from html.parser
from html.parser import charref, entityref, incomplete

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if self.convert_charrefs and not self.cdata_elem:
            j = rawdata.find('<', i)
            if j < 0:
                # if we can't find the next <, either we are at the end
                # or there's more text incoming. If the latter is True,
                # we can't pass the text to handle_data in case we have
                # a charref cut in half at end. Try to determine if
                # this is the case before proceeding by looking for an
                # & near the end and see if it's followed by a space or ;.
                amppos = rawdata.rfind('&', max(i, n-34))
                if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                    break  # wait till we get all the text
                j = n
        else:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
        if i < j:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:j]))
            else:
                self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n: break
        startswith = rawdata.startswith
        if startswith('<', i):
            if starttagopen.match(rawdata, i):  # < + letter
                k = self.parse_starttag(i)
            elif startswith("</", i):
                k = self.parse_endtag(i)
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                k = self.parse_html_declaration(i)
            elif (i + 1) < n:
                self.handle_data("<")
                k = i + 1
            else:
                break
            if k < 0:
                if not end:
                    break
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:k]))
                else:
                    self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
            self.get_endpos()  # only modification: gets end position of tags
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming &#
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    k = match.end()
                    if k <= i:
                        k = n
                    i = self.updatepos(i, i + 1)
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        if self.convert_charrefs and not self.cdata_elem:
            self.handle_data(unescape(rawdata[i:n]))
        else:
            self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    self.rawdata = rawdata[i:]

MyHTMLParser.goahead = goahead
parser.feed(your_html_file_as_a_string)
print(parser.get_tags())
