Cannot figure out improper string splicing - python

I am making a function which takes a tuple containing strings, selects one string, then selects a start and end index within that string. Within this index, the string is chopped up into pieces and permuted randomly and then the tuple is returned with the modified string. However, there appears to be some sort of error when defining the start and end index in the function and then splicing the string based on it. In some runs it works as expected, and in others it does not and hence throws an error which I avoid in the while loop. Can anyone tell me what could be going on here:
def chromothripsist(seqs, keep_frequency = 0.9):
seqs = list(seqs)
c = list(range(len(seqs)))
chrom = random.sample(c, 1)
orig_seq = seqs[chrom[0]]
n = len(orig_seq)
NotLongEnough = True
while(NotLongEnough):
splits = random.randint(5,10)
##IF SPLITS > len(seq) then this bugs
print(orig_seq)
lowbd = int(0.1*n)
upbd = int(0.9*n)
distance = int(np.random.uniform(lowbd, upbd))
stidx = random.randint(0,n-1)
edidx = max(stidx + distance, n-1)
dist = edidx - stidx
print(splits, stidx, edidx)
if(dist > splits):
NotLongEnough = False
break
first_part = orig_seq[:stidx]
last_part = orig_seq[edidx:]
seq = orig_seq[stidx:edidx]
# THE ABOVE LINES ARE NOT WORKING AS EXPECTED
print(seq)
print(stidx, edidx)
print(len(seq), splits)
breakpoints = np.random.choice(len(seq), splits-1, replace = False)
breakpoints.sort()
subseq = []
curridx = 0
for i in breakpoints:
subseq.append(seq[curridx:i])
curridx = i
subseq.append(seq[breakpoints[-1]:])
rearrange = np.random.permutation(splits)
#n_to_select= int(splits *keep_frequency)
n_to_select = len(rearrange)
rearrange = random.sample(list(rearrange), n_to_select)
build_seq = ''
for i in rearrange:
build_seq += subseq[i]
seqs[chrom[0]] = first_part + build_seq + last_part
breakpoints = list(breakpoints)
return tuple(seqs), [stidx, edidx, breakpoints, rearrange, chrom[0]]

Related

How do I get my function to find and replace elements on a list?

Essentailly, my program is supposed generate a list, then have the user input a character they want to replace and a character they want to replace it with. This issue is that my functions for these processes just straight up aren't working.
`import random
MAX_ELEMENTS = 25
MIN_CHAR = 33
MAX_CHAR = 126
# args(none)
# return(a randomly generated character) from ! to \~
def generateElement():
\# Declare variables for function
aChar = " "
dieRoll = -1
dieRoll = random.randrange(MIN_CHAR, (MAX_CHAR + 1))
aChar = chr(dieRoll) # Function to convert ASCII code to a character
return aChar
# args(none)
# Creates a list of 1 - 100 elements each element a string of length 1
# return (reference to the list)
def createList():
\# Declare variables for function
aList = \[\]
size = -1
i = -1
size = random.randrange(1, (MAX_ELEMENTS + 1))
i = 0
while i < size:
aList.append(generateElement())
i = i + 1
return aList, size
# args(none)
# displays an index followed by each list element on it's own line
# return(nothing)
def display(aList, size):
\# Declare variables for function
i = -1
i = 0
while i < size:
print("Element at index[%d]=%s" % (i, aList[i]))
i = i + 1
# Starting execution point for the program
def getFindAndReplaceCharacters():
findChar = int("Enter the character you want to find here (ONE character only):")
replaceChar = int("Enter the character you want to replace here (ONE character only):")
ListToReplace = aList
return ListToReplace, findChar, replaceChar
def findAndReplace(ListToReplace, findChar, replaceChar):
return \[replaceChar if Character == findChar else Character for Character in ListToReplace\]
replacedList = findAndReplace(aList, findChar, replaceChar)
def start():
\# Declare variables local to start
aList = \[\]
size = -1
aList, size = createList()
display(aList, size)
getFindAndReplaceCharacters()
findAndReplace(aList, findChar, replaceChar)
print(replacedList)
start()
`
I've tried to move around the parameters and even added another function to try and troubleshoot, but I've gotten no luck so far. All I keep getting is "aList isn't defined." What's the issue with my code??

generate a sequence with respect to subsequences in python

I try to generate the following sequences.
text = ACCCEBCE
target = 000000D0
a random text of different characters is generated. In the text sequence, if the following subsequences are found, the target is going to be D or E. Otherwise, the target will be 0.
ABC --> D
BCD --> E
I write the following code. It works well if I generate a small number of characters. But it does not give any output if I make timesteps = 1000 etc.
import string
import random as rn
import numpy as np
def is_subseq(x, y):
it = iter(y)
return all(any(c == ch for c in it) for ch in x)
def count(a, b, m, n):
# If both first and second string
# is empty, or if second string
# is empty, return 1
if ((m == 0 and n == 0) or n == 0):
return 1
# If only first string is empty
# and second string is not empty,
# return 0
if (m == 0):
return 0
# If last characters are same
# Recur for remaining strings by
# 1. considering last characters
# of both strings
# 2. ignoring last character
# of first string
if (a[m - 1] == b[n - 1]):
return (count(a, b, m - 1, n - 1) +
count(a, b, m - 1, n))
else:
# If last characters are different,
# ignore last char of first string
# and recur for remaining string
return count(a, b, m - 1, n)
# create a sequence classification instance
def get_sequence(n_timesteps):
alphabet="ABCDE"#string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=n_timesteps))
print(text)
seq_length=3
subseqX = []
subseqY = []
for i in range(0, len(alphabet) - seq_length, 1):
seq_in = alphabet[i:i + seq_length]
seq_out = alphabet[i + seq_length]
subseqX.append([char for char in seq_in])
subseqY.append(seq_out)
print(seq_in, "\t-->\t",seq_out)
y2 = []
match = 0
countlist=np.zeros(len(subseqX))
for i, val in enumerate(text):
found = False
counter = 0
for g, val2 in enumerate(subseqX):
listToStr = ''.join(map(str, subseqX[g]))
howmany = count(text[:i], listToStr, len(text[:i]),len(listToStr))
if is_subseq(listToStr, text[:i]):
if countlist[g] < howmany:
match = match + howmany
countlist[g] = howmany
temp = g
found = True
if found:
y2.append(subseqY[temp])
else:
y2.append(0)
print("counter:\t", counter)
print(text)
print(y2)
# define problem properties
n_timesteps = 100
get_sequence(n_timesteps)
It might be because of the depth of the recursive function. But I need to generate 1000 or 10000 characters.
How can I fix this problem? Any ideas?
I'm not sure I understand all you're trying to do (lots of code there), but I believe this simplified form of the function should work. It maintains a set of subsequences seen so far. It only extends them by adding the next letter when it is encountered. This allows the flagging to know if the prefix to the sequence up to the current character has been seen before.
def flagSequence(S,letters="ABCDE",seqLen=3):
subSeqs = set()
result = "0"
for c in S[:-1]:
p = letters.index(c)
subSeqs.add(c)
if p>0:
subSeqs.update([s+c for s in subSeqs if s[-1]==letters[p-1]])
if p in range(seqLen-1,len(letters)-1) and letters[p-seqLen+1:p+1] in subSeqs:
result += letters[p+1]
else:
result += "0"
return result
output:
text = "BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE"
print(text)
print(flagSequence(text))
BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE
000000000D00D0000ED00D0DDEEE00E00E00E0E
with more letters:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet)
print(text[:60])
print(flags[:60])
CHUJKAMWCAAIBXGIZFHALAWWFDDELXREMOQQVXFPNYJRQESRVEJKIAQILYSJ...
000000000000000000000M000000FM00FN00000G0OZK0RFTS0FKLJ0RJMZT...
with longer sequences:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet,seqLen=10)
print(text[200:260])
print(flags[200:260])
...PMZCDQXAOHVMTRLYCNCJABGGNZYAWIHJJCQKMMAENQFHNQTOQOPPGHVQZXZU...
...00N0000Y000WN000Z0O0K0000O0Z0X00KK00LNN00O000O00P0PQQ00WR0Y0...

Compute the number of comparisons used to sort the by QuickSort, Python

So I have this code counting the number of comparisons used to sort by QuickSort method under three different ways in choosing the pivot point. Choose the first element in the array; choose the final element in the array; choose the median of the first, middle (n-th element if len(array)=2n), and final element.
The input file contains all of the integers between 1 and 10,000 (inclusive, with no repeats) in unsorted order.
This code however only outputs the result of the first case correctly, and outputs zero as the result of the later two cases, I have no idea which part caused this please help!
This is the code
from pathlib import Path
file_path = Path("c:/Users/M/Desktop/Al/")
inFile = open(file_path / "QuickSort.txt", 'r')
with inFile as f:
lines = [x.split() for x in f] # a list of lists
f.close()
tempList = zip(*lines) # a list of tuples := transposed lines
tempList = list(tempList)
tempList = tempList[0] # a list of strings
A = map(int,tempList) # a list of integers
def choosePivot(alist,first,last,pivotID):
if pivotID == 'first':
pass
if pivotID == 'last':
(alist[first], alist[last]) = (alist[last], alist[first])
elif pivotID == 'middle':
mid = (last-first)//2 + first
listTemp = [alist[first], alist[last], alist[mid]]
listTemp.sort()
if listTemp[1] == alist[first]:
pivotIndex = first
elif listTemp[1] == alist[last]:
pivotIndex = last
else:
pivotIndex = mid
(alist[first], alist[pivotIndex]) = (alist[pivotIndex], alist[first])
def partition(alist, first, last):
pivotVal = alist[first] # initialise pivot as the first element
leftmark = first+1
for rightmark in range(first+1,last+1):
if alist[rightmark] < pivotVal:
(alist[leftmark],alist[rightmark]) = (alist[rightmark],alist[leftmark])
leftmark = leftmark + 1
(alist[leftmark-1],alist[first]) = (alist[first],alist[leftmark-1])
return leftmark-1 # partition point := where pivot finally occupies
def quickSort(alist,first,last,pivotID):
numComp = last -first
if last <= first:
return (alist, 0)
else:
choosePivot(alist,first,last,pivotID)
splitpoint = partition(alist,first,last)
(Lsorted,numCompL) = quickSort(alist, first, splitpoint-1, pivotID)
(Rsorted,numCompR) = quickSort(alist, splitpoint+1, last, pivotID)
numComp = numComp + numCompL + numCompR
return (alist, numComp)
def main(fileName):
pivotList = ['first', 'middle', 'last']
for pivotID in pivotList:
A = list(fileName)
(A, n) = quickSort(A,0,len(A)-1,pivotID)
print ("number of comparisons: %d", n)
if __name__ == '__main__':
# unit test
# main('test.txt')
main(A)
That's because (A, n) = quickSort(A,0,len(A)-1,pivotID) sorts the list. The first algo returns correct results, subsequent ones return 0 because the list is already sorted by now.
To avoid this, create a copy of the main list using copy = A[:] and then pass a new copy in each iteration. Something like
def main(fileName):
pivotList = ['first', 'middle', 'last']
mainList = list(filename)
for pivotID in pivotList:
A = mainList[:]
(A, n) = quickSort(A,0,len(A)-1,pivotID)
print ("number of comparisons: %d", n)

Conditional Probability- Python

I'm working on this python problem:
Given a sequence of the DNA bases {A, C, G, T}, stored as a string, returns a conditional probability table in a data structure such that one base (b1) can be looked up, and then a second (b2), to get the probability p(b2 | b1) of the second base occurring immediately after the first. (Assumes the length of seq is >= 3, and that the probability of any b1 and b2 which have never been seen together is 0. Ignores the probability that b1 will be followed by the end of the string.)
You may use the collections module, but no other libraries.
However I'm running into a roadblock:
word = 'ATCGATTGAGCTCTAGCG'
def dna_prob2(seq):
tbl = dict()
levels = set(word)
freq = dict.fromkeys(levels, 0)
for i in seq:
freq[i] += 1
for i in levels:
tbl[i] = {x:0 for x in levels}
lastlevel = ''
for i in tbl:
if lastlevel != '':
tbl[lastlevel][i] += 1
lastlevel = i
for i in tbl:
print(i,tbl[i][i] / freq[i])
return tbl
tbl['T']['T'] / freq[i]
Basically, the end result is supposed to be the final line tbl you see above. However, when I try to do that in print(i,tbl[i][i] /freq[i), and run dna_prob2(word), I get 0.0s for everything.
Wondering if anyone here can help out.
Thanks!
I am not sure what it is your code is doing, but this works:
def makeprobs(word):
singles = {}
probs = {}
thedict={}
ll = len(word)
for i in range(ll-1):
x1 = word[i]
x2 = word[i+1]
singles[x1] = singles.get(x1, 0)+1.0
thedict[(x1, x2)] = thedict.get((x1, x2), 0)+1.0
for i in thedict:
probs[i] = thedict[i]/singles[i[0]]
return probs
I finally got back to my professor. This is what it was trying to accomplish:
word = 'ATCGATTGAGCTCTAGCG'
def dna_prob2(seq):
tbl = dict()
levels = set(seq)
freq = dict.fromkeys(levels, 0)
for i in seq:
freq[i] += 1
for i in levels:
tbl[i] = {x:0 for x in levels}
lastlevel = ''
for i in seq:
if lastlevel != '':
tbl[lastlevel][i] += 1
lastlevel = i
return tbl, freq
condfreq, freq = dna_prob2(word)
print(condfreq['T']['T']/freq['T'])
print(condfreq['G']['A']/freq['A'])
print(condfreq['C']['G']/freq['G'])
Hope this helps.

How to rearrange a string's characters such that none of it's adjacent characters are the same, using Python

In my attempt to solve the above question, I've written the following code:
Logic: Create a frequency dict for each character in the string (key= character, value= frequency of the character). If any character's frequency is greater than ceil(n/2), there is no solution. Else, print the most frequent character followed by reducing its frequency in the dict/
import math, operator
def rearrangeString(s):
# Fill this in.
n = len(s)
freqDict = {}
for i in s:
if i not in freqDict.keys():
freqDict[i] = 1
else:
freqDict[i] += 1
for j in list(freqDict.values()):
if j > math.ceil(n / 2):
return None
return maxArrange(freqDict)[:-4]
temp = ""
def maxArrange(inp):
global temp
n = len(inp)
if list(inp.values()) != [0] * n:
resCh = max(inp.items(), key=operator.itemgetter(1))[0]
if resCh is not None and resCh != temp:
inp[resCh] -= 1
# Terminates with None
temp = resCh
return resCh + str(maxArrange(inp))
# Driver
print(rearrangeString("abbccc"))
# cbcabc
print(rearrangeString("abbcccc"))
In the first try, with input abbccc, it gives the right answer, i.e. cbcabc, but fails for the input abbcccc, returning ccbcabc, without handling it using the temp variable, else returning cbcabc and skipping c altogether when handled using temp
How should I modify the logic, or is there a better approach?

Categories