knapsack branch and bound wrong result - python

I have converted the code given at this link into a python version. The code is supposed to calculate the correct value of maximum value to be filled in knapsack of weight W. I have attached the code below:
#http://www.geeksforgeeks.org/branch-and-bound-set-2-implementation-of-01-knapsack/
from queue import Queue
class Node:
def __init__(self):
self.level = None
self.profit = None
self.bound = None
self.weight = None
def __str__(self):
return "Level: %s Profit: %s Bound: %s Weight: %s" % (self.level, self.profit, self.bound, self.weight)
def bound(node, n, W, items):
if(node.weight >= W):
return 0
profit_bound = int(node.profit)
j = node.level + 1
totweight = int(node.weight)
while ((j < n) and (totweight + items[j].weight) <= W):
totweight += items[j].weight
profit_bound += items[j].value
j += 1
if(j < n):
profit_bound += (W - totweight) * items[j].value / float(items[j].weight)
return profit_bound
Q = Queue()
def KnapSackBranchNBound(weight, items, total_items):
items = sorted(items, key=lambda x: x.value/float(x.weight), reverse=True)
u = Node()
v = Node()
u.level = -1
u.profit = 0
u.weight = 0
Q.put(u)
maxProfit = 0;
while not Q.empty():
u = Q.get()
if u.level == -1:
v.level = 0
if u.level == total_items - 1:
continue
v.level = u.level + 1
v.weight = u.weight + items[v.level].weight
v.profit = u.profit + items[v.level].value
if (v.weight <= weight and v.profit > maxProfit):
maxProfit = v.profit;
v.bound = bound(v, total_items, weight, items)
if (v.bound > maxProfit):
Q.put(v)
v.weight = u.weight
v.profit = u.profit
v.bound = bound(v, total_items, weight, items)
if (v.bound > maxProfit):
# print items[v.level]
Q.put(v)
return maxProfit
if __name__ == "__main__":
from collections import namedtuple
Item = namedtuple("Item", ['index', 'value', 'weight'])
input_data = open("test.data").read()
lines = input_data.split('\n')
firstLine = lines[0].split()
item_count = int(firstLine[0])
capacity = int(firstLine[1])
print "running from main"
items = []
for i in range(1, item_count+1):
line = lines[i]
parts = line.split()
items.append(Item(i-1, int(parts[0]), float(parts[1])))
kbb = KnapSackBranchNBound(capacity, items, item_count)
print kbb
The program is supposed to calculate value of 235 for following items inside file test.data:
5 10
40 2
50 3.14
100 1.98
95 5
30 3
The first line shows number of items and knapsack weight. Lines below first line shows the value and weight of those items. Items are made using a namedtuple and sorted according to value/weight. For this problem I am getting 135 instead of 235. What am I doing wrong here?
EDIT:
I have solved the problem of finding correct items based on branch and bound. If needed, one can check it here

The problem is that you're inserting multiple references to the same Node() object into your queue. The fix is to initialize two new v objects in each iteration of the while-loop as follows:
while not Q.empty():
u = Q.get()
v = Node() # Added line
if u.level == -1:
v.level = 0
if u.level == total_items - 1:
continue
v.level = u.level + 1
v.weight = u.weight + items[v.level].weight
v.profit = u.profit + items[v.level].value
if (v.weight <= weight and v.profit > maxProfit):
maxProfit = v.profit;
v.bound = bound(v, total_items, weight, items)
if (v.bound > maxProfit):
Q.put(v)
v = Node() # Added line
v.level = u.level + 1 # Added line
v.weight = u.weight
v.profit = u.profit
v.bound = bound(v, total_items, weight, items)
if (v.bound > maxProfit):
# print(items[v.level])
Q.put(v)
Without these reinitializations, you're modifying the v object that you already inserted into the queue.
This is different from C++ where the Node objects are values that are implicitly copied into the queue to avoid aliasing problems such as these.

Related

k means algorithm python

Whenever k = 2, the code runs in a loop
if k > 2 it sets all, but one of the centroids location to 0,0
I've reviewed it a couple of times , and it doesn't seem like there are any errors probably some sort of logic flaw. The code starts by having a class and its methods which initiate the centroids, calculate the Euclidean distance, and reassign centroids to the average positions of the points that are in the cluster. It then runs a loop that consists of reassigning and calculating distance until a list of the assignments are equal and then plots it.
class Kmeans:
def __init__(self, K, dataset, centroids, sorting):
self.K = K
self.dataset = dataset
self.centroids = centroids
self.sorting = sorting
#sets starting position of centroids
def initializeCentroids(self):
bigX = 0
bigY = 0
self.centroids = []
for i in self.dataset:
if i[0] > bigX:
bigX = i[0]
if i[1] > bigY:
bigY = i[1]
for q in range(self.K):
self.centroids.append([random.randint(0, bigX), random.randint(0, bigY)])
plt.scatter((self.centroids[0][0], self.centroids[1][0]), (self.centroids[0][1], self.centroids[1][1]))
return self.centroids
#calculates euclidean distance
def calcDistance(self):
self.sorting = []
for w in self.dataset:
print(w)
distances = []
counter = 0
for centr in self.centroids:
distances.append(math.sqrt(abs((centr[0] - w[0] * centr[0] - w[0]) + (centr[1] - w[1] * centr[1] - w[1]))))
counter += 1
if counter > 0:
try:
if distances[0] > distances[1]:
distances.pop(0)
if distances[1] > distances[0]:
distances.pop(1)
counter -= 1
except IndexError:
pass
self.sorting.append([w, counter, distances[0]])
return self.sorting
def reassignCentroids(self):
counter3 = 1
for r in range(len(self.centroids)):
positionsX = []
positionsY = []
for t in self.sorting:
if t[1] == counter3:
positionsX.append(t[0][0])
positionsY.append(t[0][1])
population = len(positionsY)
if population == 0:
population = 1
self.centroids.append([sum(positionsX) / population, sum(positionsY) / population])
counter3 += 1
self.centroids.pop(0)
return
k = 4
dataSetSize = input("Enter the amount of tuples you want generated: ")
data_set = []
for o in range(int(dataSetSize)):
data_set.append((random.randint(0, 1000), random.randint(0, 1000)))
attempt = Kmeans(k, data_set, 0, 0)
attempt.initializeCentroids()
xvals = []
yvals = []
sortCompare = []
# plots
for p in data_set:
xvals.append(p[0])
yvals.append(p[1])
running = True
while running:
if len(sortCompare) > 1:
centroidChoice0 = []
centroidChoice1 = []
for p in sortCompare[0]:
centroidChoice0.append(p[1])
for d in sortCompare[1]:
centroidChoice1.append(d[1])
print(centroidChoice1)
print(attempt.centroids)
if centroidChoice1 == centroidChoice0:
running = False
for m in attempt.centroids:
plt.scatter((attempt.centroids[0][0], attempt.centroids[1][0]), (attempt.centroids[0][1], attempt.centroids[1][1]))
running = False
sortCompare.pop(0)
attempt.calcDistance()
sortCompare.append(attempt.sorting)
attempt.reassignCentroids()

N Puzzle with Depth First Search

I'm trying solve N Puzzle with Depth First Search using python 3.
With 3 x 3 puzzle it run good and fast but with 4 x 4 puzzle, it runs too slow and can't find solution with error: "MemoryError".
I also use "h(n) = depth + number of wrong tiles" to evaluate priority of each node.
I'm a newbie to python so hope you can help me with this
Here is my code:
import sys
import getopt
import random
import time
class State:
def __init__(self, parent, board, move, depth):
self.parent = parent
self.previousMove = move
self.board = board
self.map = ''.join(str(e) for e in board)
self.depth = depth
self.cost = self.calculateCost()
def calculateCost(self):
pos = 1
count = 0
for tile in self.board:
if tile == pos:
count += 1
pos += 1
return self.depth + 8 - count
class Puzzle:
def __init__(self, k, customBoard = None):
self.k = k
self.n = k*k - 1
self.sizeOfBoard = k*k
self.timeOfSolving = 0
self.timeOfGenerateSuccessors = 0
self.maxDeepSearch = 0
self.inititalState = State(None, self.createInitialBoard(customBoard), 'Start', 0)
self.goalBoard = self.createGoalBoard()
self.finalState = None
self.stateStorage = set() # Store states that have visited
self.path = [] # Store states that lead to goalstate
self.stack = []
def isSolvable(self, board):
# count invertion in puzzle's board
invCount = 0
for i in range(0, self.sizeOfBoard - 1):
if board[i] == 0:
continue
for j in range(i+1, self.sizeOfBoard):
if board[j] == 0:
continue
if board[i] > board[j]:
invCount += 1
# print(invCount)
if (invCount % 2 == 0):
return True
return False
def createInitialBoard(self, customBoard):
print("Creating initial state")
if customBoard is None:
board = []
lstAddSuccess = []
while 1:
board.clear()
lstAddSuccess.clear()
for count in range(0, self.k*self.k):
newTile = random.randint(0, self.n)
while newTile in lstAddSuccess:
newTile = random.randint(0, self.n)
lstAddSuccess += [newTile]
board += [newTile]
if self.isSolvable(board):
break
else:
board = [int(e) for e in customBoard]
if not self.isSolvable(board):
print("Cant find solution with this puzzle! Exiting...")
exit(-1)
return board
def createGoalBoard(self):
board = []
for count in range(1, self.n + 1):
board += [count]
board += [0]
return board
def printBoard(self, board):
for row in range(0, self.sizeOfBoard, self.k):
# for col in range(row, row + self.k):
print(board[row:row + self.k])
def generateSuccessors(self, currentState):
indexOfZero = currentState.board.index(0)
rowIndexOfZero = indexOfZero % self.k
colIndexOfZero = indexOfZero // self.k
lstSuccessors = []
# Slide to zero to up
if colIndexOfZero != 0:
newState = currentState.board.copy()
newState[indexOfZero] = newState[indexOfZero - self.k]
newState[indexOfZero - self.k] = 0
lstSuccessors.append(
State(currentState, newState, 'up', currentState.depth + 1))
# Slide zero to down
if colIndexOfZero != self.k - 1:
newState = currentState.board.copy()
newState[indexOfZero] = newState[indexOfZero + self.k]
newState[indexOfZero + self.k] = 0
lstSuccessors.append(
State(currentState, newState, 'down', currentState.depth + 1))
# slide zero to left
if rowIndexOfZero != 0:
newState = currentState.board.copy()
newState[indexOfZero] = newState[indexOfZero - 1]
newState[indexOfZero - 1] = 0
lstSuccessors.append(
State(currentState, newState, 'left', currentState.depth + 1))
# Slide zero to right
if rowIndexOfZero != self.k - 1:
newState = currentState.board.copy()
newState[indexOfZero] = newState[indexOfZero + 1]
newState[indexOfZero + 1] = 0
lstSuccessors.append(
State(currentState, newState, 'right', currentState.depth + 1))
lstSuccessorsCost = [ele.cost for ele in lstSuccessors]
lstSuccessorsInOrderOfCost = []
for i in range(0, len(lstSuccessorsCost)):
lstSuccessorsInOrderOfCost.append(lstSuccessors[lstSuccessorsCost.index(min(lstSuccessorsCost))])
lstSuccessorsCost[lstSuccessorsCost.index(min(lstSuccessorsCost))] = 100
return lstSuccessorsInOrderOfCost
def solvePuzzle(self, currentState):
self.stack.append(currentState)
self.stateStorage.add(currentState.map)
while len(self.stack) > 0:
currentState = self.stack.pop()
if currentState.board == self.goalBoard:
# find path
# self.printBoard(currentState.board)
self.finalState = currentState
print("Solving " + str(self.n) + " puzzle done!")
return
start_time_gen = time.time()
lstSuccessor = self.generateSuccessors(currentState)
end_time_gen = time.time()
timeOfGen = end_time_gen - start_time_gen
self.timeOfGenerateSuccessors += timeOfGen
for successor in lstSuccessor[::-1]:
if successor.map not in self.stateStorage:
self.stack.append(successor)
self.stateStorage.add(successor.map)
if successor.depth > self.maxDeepSearch:
self.maxDeepSearch += 1
print("Cant solve puzzle! Exiting...")
exit(-1)
def solve(self):
start_time = time.time()
self.solvePuzzle(self.inititalState)
end_time = time.time()
self.timeOfSolving = end_time - start_time
print("Running time: " + str(self.timeOfSolving))
print("Max Search Dept: " + str(self.maxDeepSearch))
print("Final State Dept: " + str(self.finalState.depth))
def printInitialBoard(self):
self.printBoard(self.inititalState.board)
def printPath(self):
if self.finalState is None:
print("No solution found!")
return
path = []
state = self.finalState
while (state is not None):
if state.previousMove is not None:
path.append(state.previousMove)
state = state.parent
print("path: "),
print(path[::-1])
def main(argv):
# if (len(argv) != 1 or int(argv[0]) not in range(1, 10000)):
# print("Input must be k of integer, which is k*k matrix of puzzle")
# exit()
# eight_puzzle = Puzzle(int(argv[0]))
k = int(input("Enter size of k * k puzzle, k = "))
while k not in range(2, 100):
print("k must be in range 2 - 100")
k = int(input("Enter size of k * k puzzle, k = "))
print("""
Choose:
1. Randome puzzle
2. Custome puzzle
""")
file = input()
if int(file) == 1:
puzzle = Puzzle(k)
elif int(file) == 2:
board = input("Enter puzzle: ")
puzzle = Puzzle(k ,list(board.split(" ")))
puzzle.printInitialBoard()
puzzle.solve()
puzzle.printPath()
if __name__ == "__main__":
main(sys.argv[1:])

Python implementation of Determining DNA Health algorithm from HackerRank

I am trying to solve Determining DNA Health challenge from Hackerrank using python. (I have to add I am somewhat new to python 3. Still learning the language)
My solution fails for test cases 7, 8 and 9 with a message reading "Wrong Answer".
When I run the following code locally, I can confirm that for these test cases my implementation produces the expected output.
I am wondering what would be the problem.
I am a bit puzzled at the moment. Is there a problem with my implementation? If so how come it produces correct answers for 28 test cases but fails on these 3? Or is it a misleading/confusing result message from Hacker Rank, as I happen to know that people find these 3 test cases (7, 8 and 9) problematic from what I learnt from reading discussions.
Any help would be highly appreciated.
Here is the code I wrote:
from bisect import bisect_left
from bisect import bisect_right
import sys
from unittest.mock import right
class TrieNode(object):
def __init__(self):
self.subnodes = {}
self.isTerminal = False
self.indexList = []
self.healthList = []
def addSubnode(self, aChar):
if (self.subnodes.get(aChar)):
return self.subnodes[aChar]
else:
newNode = TrieNode()
self.subnodes[aChar] = newNode
return newNode
def addIndexAndValue(self, index, health):
self.isTerminal = True
self.indexList.append(index)
lastHealth = 0
healthLength = len(self.healthList)
if (healthLength>0):
lastHealth = self.healthList[healthLength-1]
self.healthList.append(lastHealth + health)
def getSubnodeFor(self, aChar):
return self.subnodes.get(aChar)
def getValueForIndexes(self, startIndex, endIndex):
listSize = len(self.indexList)
if listSize < 1:
return 0
elif listSize == 1:
if startIndex <= self.indexList[0] and endIndex >= self.indexList[0]:
return self.healthList[0]
else:
return 0
else: # listSize > 1
rightInd = bisect_left(self.indexList, endIndex)
if rightInd < listSize and endIndex < self.indexList[0]:
return 0
big = 0
if rightInd >= listSize:
big = self.healthList[listSize - 1]
else:
if endIndex >= self.indexList[rightInd]:
big = self.healthList[rightInd]
else:
big = self.healthList[rightInd-1]
leftInd = bisect_left(self.indexList, startIndex)
small = 0
if leftInd >= listSize:
return 0
else:
if startIndex <= self.indexList[leftInd]:
if (leftInd > 0):
small = self.healthList[leftInd - 1]
else:
small = 0
else:
small = self.healthList[leftInd]
return big - small
class Trie(object):
def __init__(self):
self.root = TrieNode()
def getRoot(self):
return self.root
def createTrie(self, genes, healths):
for i in range(len(genes)):
node = self.root
for c in genes[i]:
node = node.addSubnode(c)
node.addIndexAndValue(i, healths[i])
def calculateHealth(trie, d, first, last):
total = 0
dLength = len(d)
for i in range(0, dLength):
node = trie.getRoot()
for j in range(i, dLength):
node = node.getSubnodeFor(d[j])
if node != None:
if node.isTerminal:
val = node.getValueForIndexes(first, last)
total = total + val
else:
break
return total
def readFromFile(aFileName):
inputArr = None
with open('../hackerRank/src/' + aFileName, encoding='utf-8') as aFile:
inputArr = aFile.read().splitlines()
return inputArr
def runFor(fileName, minimumValue, maximumValue):
inp = readFromFile(fileName)
n = inp[0]
genes = inp[1].rstrip().split()
healths = list(map(int, inp[2].rstrip().split()))
trie = Trie()
trie.createTrie(genes, healths)
s = int(inp[3])
minVal = sys.maxsize
maxVal = -1
for fItr in range(s):
line = inp[fItr+4].split()
first = int(line[0])
last = int(line[1])
d = line[2]
val = calculateHealth(trie, d, first, last)
if val < minVal:
minVal = val
if val > maxVal:
maxVal = val
print (minVal,maxVal)
assert minimumValue == minVal
assert maximumValue == maxVal
# TextX.txt 's are simple text files, which hold test data for regarding test case
# following the file name are real expected numbers for each relevant test case
# I got those from hacker rank
runFor('Test2.txt', 15806635, 20688978289)
runFor('Test7.txt', 0, 7353994)
runFor('Test8.txt', 0, 8652768)
runFor('Test9.txt', 0, 9920592)
runFor('Test33.txt', 11674463, 11674463)
One reference that might assist can be found at:
https://gist.github.com/josephmisiti/940cee03c97f031188ba7eac74d03a4f
Please read the notes he has included.
This is the input I have been using.
6
a b c aa d b
1 2 3 4 5 6
3
1 5 caaab
0 4 xyz
2 4 bcdybc

Counting of shortest sequences of flips in pancake sorting

Now, finding the shortest sequence of flips in pancake sorting is alone NP-hard, yet I'd like to find each and all of them, and count them.
Meaning for each permutation I'd like to find all the sequences of prefix reversals that restores the identity but not longer than the shortest one.
Here's what I've got so far:
#!/bin/env python3
# coding: utf-8
from math import factorial
import itertools
from multiprocessing import cpu_count, Manager, Pool
import numpy
import scipy.sparse
def flip(x, value):
return tuple(value[:x][::-1] + value[x:])
def rank(perm):
n = len(perm)
fact = factorial(n)
r = 0
for i in range(n):
fact //= n - i
r += len([x for x in perm[i:] if x < perm[i]]) * fact
return r
def unrank(i, items):
its = items[:]
perm = []
n = len(items)
fact = factorial(n)
r = i % fact
while its:
fact //= n
c, r = divmod(r, fact)
perm.append(its.pop(c))
n -= 1
return tuple(perm)
def get_colex_row(r, n, _fact):
row = scipy.sparse.dok_matrix((
1, _fact[n - 1]), dtype=numpy.int8)
perm = unrank(r, [i for i in range(n)])
for i in range(n):
column = r - r % _fact[i] + rank(perm[:-i - 2:-1])
row[0, column] = i + 1
return row
def get_colex_matrix(n):
fact = [factorial(i) for i in range(1, n + 1)]
m = scipy.sparse.dok_matrix(
(fact[n - 1], fact[n - 1]), dtype=numpy.int8)
items = [_ for _ in range(1, n + 1)]
for r in range(fact[n - 1]):
row = get_colex_row(r, n, fact)
m[r] = row
return m
def get_distance(n, items):
nfact = factorial(n)
stack = {unrank(i, items) for i in range(nfact)}
m = get_colex_matrix(n)
distance = {unrank(nfact - 1, items)[::-1] : 0}
new_distance = {nfact - 1}
d = 0
while distance.keys() != stack:
new_new_distance = set()
d += 1
for visiting in new_distance:
for i in range(2, n + 1):
key_index = m[visiting].tolist().index(i)
key = unrank(key_index, items)[::-1]
if key not in distance:
distance[key] = d
new_new_distance.add(key_index)
new_distance = new_new_distance
return distance
def get_paths_serial(items):
n = len(items)
nfact = factorial(n)
stack = {unrank(i, items) for i in range(nfact)}
m = get_colex_matrix(n)
distance = {unrank(nfact - 1, items)[::-1]: {()}}
new_distance = {nfact - 1}
while distance.keys() != stack:
new_new_distance = set()
for visiting_index in new_distance:
for i in range(2, n + 1):
key_index = m[visiting_index].tolist().index(i)
key = unrank(key_index, items)[::-1]
visiting = unrank(visiting_index, items)[::-1]
paths = distance[visiting]
prev_sample = next(iter(paths))
if key not in distance:
distance[key] = {path + (i,) for path in paths}
new_new_distance.add(key_index)
else:
curr_sample = next(iter(distance[key]))
if len(prev_sample) + 1 < len(curr_sample):
print("Shouldn't happen!")
distance[key] = {path + (i,) for path in paths}
elif len(prev_sample) + 1 == len(curr_sample):
distance[key] |= {path + (i,) for path in paths}
else:
# not relevant
pass
new_distance = new_new_distance
return distance
def _worker(ns, index):
row = get_colex_row(index, ns.n, ns.fact).toarray().tolist()[0]
visiting = unrank(index, ns.items)[::-1]
paths = ns.distance[visiting]
prev_sample = next(iter(paths))
out = {}
my_new_distance = set()
for i in range(2, ns.n + 1):
key_index = row.index(i)
key = unrank(key_index, ns.items)[::-1]
if key not in ns.distance:
out[key] = {path + (i,) for path in paths}
my_new_distance.add(key_index)
else:
curr_sample = next(iter(ns.distance[key]))
if len(prev_sample) + 1 < len(curr_sample):
print("Shouldn't happen!")
out[key] = {path + (i,) for path in paths}
elif len(prev_sample) + 1 == len(curr_sample):
out[key].update(path + (i,) for path in paths)
return my_new_distance, out
def get_paths_parallel(items):
n = len(items)
fact = [factorial(i) for i in range(1, n + 1)]
distance = {unrank(fact[n - 1] - 1, items)[::-1]: {()}}
stack = {unrank(i, items) for i in range(fact[n - 1])}
already_visited = set()
visiting = {fact[n - 1] - 1}
mgr = Manager()
namespace = mgr.Namespace()
namespace.fact = fact
namespace.distance = distance
namespace.items = items
namespace.n = n
with Pool(2 * cpu_count()) as pool:
while distance.keys() != stack:
result = pool.starmap(_worker, ((namespace, job)
for job in visiting))
visiting = set()
for next_to_visit, visited in result:
visiting |= next_to_visit
for k, v in visited.items():
if k in distance:
distance[k] |= v
else:
distance[k] = v
visiting -= already_visited
already_visited |= visiting
namespace.distance = distance
return distance
def colex(value, other):
for i in range(len(value) - 1, 0, -1):
if value[i] == other[i]:
continue
return value[i] > other[i]
return False
def ordered_by(order_cmp):
'Convert a cmp= function into a key= function'
if order_cmp is None:
return None
class K(object):
def __init__(self, obj):
self.value = obj
def __gt__(self, other):
if len(self.value) != len(other.value):
assert "Not the same length"
return order_cmp(self.value, other.value)
return K
def get_ordered(n, order):
return sorted(itertools.permutations(range(1, n + 1)),
key=ordered_by(order))
def get_matrix(n, order=None):
stack = get_ordered(n, order)
m = numpy.zeros((len(stack), len(stack)), numpy.int8)
for i,s in enumerate(stack):
for x in range(1, n + 1):
m[i, stack.index(flip(x, s))] = x
return m
I'm not sure what I'm doing wrong, but get_paths_parallel runs slower than get_paths_serial, please help!
I really should (and probably will soon) document my code better.
So for the time being, I'll say a few additional words:
It uses co-lexicographic ordering to rank the permutations and to find the indices in the adjacency matrix. Where I store the length of the flip that transforms the permutations, e.g. A(i,j) = k if performing a k length prefix reversal on the permutation with rank i results the ranked j permutation. In order to save on memory instead of storing the whole matrix I generate the rows on demand and limit the access by excluding already visited ones also I'm using scipy.sparse.dok_matrix for the same reason.
Other than these it's simply floods the graph till all permutations are reached.
There are some functions that doesn't use all or any of the consideration above like get_matrix, but presented only to validate that others, like get_colex_matrix are working as intended.
I'm creating the key function in a little bit convoluted manner, but that's just because I've tried other sorting before I've settled on co-lex.
Using multiprocessing.Manager to share data between processes makes them slow down.
Solution is to copy the needed data into each process's memory space (passing them as argument) or to use global variables for them.
Also using scipy.sparse.dok_matrix is overkill, dict would do.
I'll grab the literature I've found on the subject and link it hare later.

list assignment index out of range by code python?

I keep getting an
IndexError: list assignment index out of range.
The error on line 78
This code is written to find motif DNA to bioinformatics
How we can solve this error or the problem ?
Here is my code:
from math import log
class MotifMedianFinding(object):
def __init__(self, input_file):
super(MotifMedianFinding, self).__init__()
self.input_lines = open("C:\\Users\\A.Khassawneh\\Desktop\\fasta.txt")
def output(self):
#main method to call both functions
sequences = {}
for line in self.input_lines:
if '>' in line:
sequences[line] = self.input_lines.next()
for label, seq in sequences.iteritems():
print "DNA:" + seq + "\n\n\n\n\n"
median = self.median_string(seq, 5,5, len(seq))
self.motif(seq, median,5,len(seq))
def median_string(self, dna, t, n, l):
#bound and search method of calulating median string
start_pos = start_pos = [1,1,1,1,1]
best_dist = 1000000000
i = 1
while i > 0:
if i < l:
prefix = str(start_pos)
opt_dist = self.hamming_score(prefix, dna)
if opt_dist > best_dist:
s,i = self.bypass(start_pos,i,l,4)
else:
s,i = self.next_vertex(start_pos,i,l,4)
else:
word = str(s)
if self.hamming_score(word, dna) < best_dist:
best_dist = self.hamming_score(word, dna)
bestword = word
s,i = self.next_vertex(start_pos,i,l,4)
print "Best Word: %s (tot_dis = %s)" % (bestword,best_dist)
return bestword
def motif(self, dna, t, n, l):
#bound and search method of calculating motif
start_pos = [1,1,1,1,1]
best_score = 0
i = 1
while 1 > 0:
if i < t:
opt_score = Score(s, i, dna) + (t-1) * l
if opt_score < best_score:
start_pos, i = self.bypass(start_pos, i, t, n-l+1)
else:
start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
else:
if self.score(start_pos, dna) > best_score:
best_score = self.score(start_pos)
best_motif = str(s)
start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
print "motif consensus string: %s (consensus_score = %s) " % (best_motif, best_score)
print "motif positions/string s=(s1..st): %s" % ', '.join(start_pos)
return best_motif
def bypass(vertex, level, l, k):
#skip uncessary calculations in the tree
j = level
for ind in xrange(j,1,-1):
if a[j] < k:
a[j] = a[j] + 1
return vertex, j
return vertex, 0
def next_vertex(self, vertex, level, L, k):
#transverse the tree of a strand of genes
if level <L:
vertex[level+1] = 1
return vertex,level+1
else:
j = L
for ind in xrange(j,1,-1):
if vertex[ind] < k:
vertex[j] = vertex[j] + 1
return vertex, j
return vertex, 0
def score(start_pos):
# biggest score of motif
total = 0
for i in start_pos:
total += i
return total
def hamming_score(self, s, dna):
pass
motif_median = MotifMedianFinding('HMP-part.fa')
motif_median.output()
xrange(x,y) goes from x to y-1 (x, x+1.... y-1). In your code, it would have been fine to do xrange(1,j), because that wouldn't have included j. But if you swap it to xrange(j,1,-1), you go (j, j-1.... 2).
Basically, you probably need to change it to xrange(j-1,0,-1) depending on your intended range.

Categories