I try to generate the following sequences.
text = ACCCEBCE
target = 000000D0
a random text of different characters is generated. In the text sequence, if the following subsequences are found, the target is going to be D or E. Otherwise, the target will be 0.
ABC --> D
BCD --> E
I write the following code. It works well if I generate a small number of characters. But it does not give any output if I make timesteps = 1000 etc.
import string
import random as rn
import numpy as np
def is_subseq(x, y):
it = iter(y)
return all(any(c == ch for c in it) for ch in x)
def count(a, b, m, n):
# If both first and second string
# is empty, or if second string
# is empty, return 1
if ((m == 0 and n == 0) or n == 0):
return 1
# If only first string is empty
# and second string is not empty,
# return 0
if (m == 0):
return 0
# If last characters are same
# Recur for remaining strings by
# 1. considering last characters
# of both strings
# 2. ignoring last character
# of first string
if (a[m - 1] == b[n - 1]):
return (count(a, b, m - 1, n - 1) +
count(a, b, m - 1, n))
else:
# If last characters are different,
# ignore last char of first string
# and recur for remaining string
return count(a, b, m - 1, n)
# create a sequence classification instance
def get_sequence(n_timesteps):
alphabet="ABCDE"#string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=n_timesteps))
print(text)
seq_length=3
subseqX = []
subseqY = []
for i in range(0, len(alphabet) - seq_length, 1):
seq_in = alphabet[i:i + seq_length]
seq_out = alphabet[i + seq_length]
subseqX.append([char for char in seq_in])
subseqY.append(seq_out)
print(seq_in, "\t-->\t",seq_out)
y2 = []
match = 0
countlist=np.zeros(len(subseqX))
for i, val in enumerate(text):
found = False
counter = 0
for g, val2 in enumerate(subseqX):
listToStr = ''.join(map(str, subseqX[g]))
howmany = count(text[:i], listToStr, len(text[:i]),len(listToStr))
if is_subseq(listToStr, text[:i]):
if countlist[g] < howmany:
match = match + howmany
countlist[g] = howmany
temp = g
found = True
if found:
y2.append(subseqY[temp])
else:
y2.append(0)
print("counter:\t", counter)
print(text)
print(y2)
# define problem properties
n_timesteps = 100
get_sequence(n_timesteps)
It might be because of the depth of the recursive function. But I need to generate 1000 or 10000 characters.
How can I fix this problem? Any ideas?
I'm not sure I understand all you're trying to do (lots of code there), but I believe this simplified form of the function should work. It maintains a set of subsequences seen so far. It only extends them by adding the next letter when it is encountered. This allows the flagging to know if the prefix to the sequence up to the current character has been seen before.
def flagSequence(S,letters="ABCDE",seqLen=3):
subSeqs = set()
result = "0"
for c in S[:-1]:
p = letters.index(c)
subSeqs.add(c)
if p>0:
subSeqs.update([s+c for s in subSeqs if s[-1]==letters[p-1]])
if p in range(seqLen-1,len(letters)-1) and letters[p-seqLen+1:p+1] in subSeqs:
result += letters[p+1]
else:
result += "0"
return result
output:
text = "BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE"
print(text)
print(flagSequence(text))
BDBACCBECEECAEAEDCAACBCCDDDBBDEEDABDBDE
000000000D00D0000ED00D0DDEEE00E00E00E0E
with more letters:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet)
print(text[:60])
print(flags[:60])
CHUJKAMWCAAIBXGIZFHALAWWFDDELXREMOQQVXFPNYJRQESRVEJKIAQILYSJ...
000000000000000000000M000000FM00FN00000G0OZK0RFTS0FKLJ0RJMZT...
with longer sequences:
alphabet=string.ascii_uppercase
text = ''.join(rn.choices(alphabet, k=10000))
flags = flagSequence(text,alphabet,seqLen=10)
print(text[200:260])
print(flags[200:260])
...PMZCDQXAOHVMTRLYCNCJABGGNZYAWIHJJCQKMMAENQFHNQTOQOPPGHVQZXZU...
...00N0000Y000WN000Z0O0K0000O0Z0X00KK00LNN00O000O00P0PQQ00WR0Y0...
Related
I am making a function which takes a tuple containing strings, selects one string, then selects a start and end index within that string. Within this index, the string is chopped up into pieces and permuted randomly and then the tuple is returned with the modified string. However, there appears to be some sort of error when defining the start and end index in the function and then splicing the string based on it. In some runs it works as expected, and in others it does not and hence throws an error which I avoid in the while loop. Can anyone tell me what could be going on here:
def chromothripsist(seqs, keep_frequency = 0.9):
seqs = list(seqs)
c = list(range(len(seqs)))
chrom = random.sample(c, 1)
orig_seq = seqs[chrom[0]]
n = len(orig_seq)
NotLongEnough = True
while(NotLongEnough):
splits = random.randint(5,10)
##IF SPLITS > len(seq) then this bugs
print(orig_seq)
lowbd = int(0.1*n)
upbd = int(0.9*n)
distance = int(np.random.uniform(lowbd, upbd))
stidx = random.randint(0,n-1)
edidx = max(stidx + distance, n-1)
dist = edidx - stidx
print(splits, stidx, edidx)
if(dist > splits):
NotLongEnough = False
break
first_part = orig_seq[:stidx]
last_part = orig_seq[edidx:]
seq = orig_seq[stidx:edidx]
# THE ABOVE LINES ARE NOT WORKING AS EXPECTED
print(seq)
print(stidx, edidx)
print(len(seq), splits)
breakpoints = np.random.choice(len(seq), splits-1, replace = False)
breakpoints.sort()
subseq = []
curridx = 0
for i in breakpoints:
subseq.append(seq[curridx:i])
curridx = i
subseq.append(seq[breakpoints[-1]:])
rearrange = np.random.permutation(splits)
#n_to_select= int(splits *keep_frequency)
n_to_select = len(rearrange)
rearrange = random.sample(list(rearrange), n_to_select)
build_seq = ''
for i in rearrange:
build_seq += subseq[i]
seqs[chrom[0]] = first_part + build_seq + last_part
breakpoints = list(breakpoints)
return tuple(seqs), [stidx, edidx, breakpoints, rearrange, chrom[0]]
This question was asked in an exam but my code (given below) passed just 2 cases out of 7 cases.
Input Format : single line input seperated by comma
Input: str = “abcd,b”
Output: 6
“ab”, “abc”, “abcd”, “b”, “bc” and “bcd” are the required sub-strings.
def slicing(s, k, n):
loop_value = n - k + 1
res = []
for i in range(loop_value):
res.append(s[i: i + k])
return res
x, y = input().split(',')
n = len(x)
res1 = []
for i in range(1, n + 1):
res1 += slicing(x, i, n)
count = 0
for ele in res1:
if y in ele:
count += 1
print(count)
When the target string (ts) is found in the string S, you can compute the number of substrings containing that instance by multiplying the number of characters before the target by the number of characters after the target (plus one on each side).
This will cover all substrings that contain this instance of the target string leaving only the "after" part to analyse further, which you can do recursively.
def countsubs(S,ts):
if ts not in S: return 0 # shorter or no match
before,after = S.split(ts,1) # split on target
result = (len(before)+1)*(len(after)+1) # count for this instance
return result + countsubs(ts[1:]+after,ts) # recurse with right side
print(countsubs("abcd","b")) # 6
This will work for single character and multi-character targets and will run much faster than checking all combinations of substrings one by one.
Here is a simple solution without recursion:
def my_function(s):
l, target = s.split(',')
result = []
for i in range(len(l)):
for j in range(i+1, len(l)+1):
ss = l[i] + l[i+1:j]
if target in ss:
result.append(ss)
return f'count = {len(result)}, substrings = {result}'
print(my_function("abcd,b"))
#count = 6, substrings = ['ab', 'abc', 'abcd', 'b', 'bc', 'bcd']
Here you go, this should help
from itertools import combinations
output = []
initial = input('Enter string and needed letter seperated by commas: ') #Asking for input
list1 = initial.split(',') #splitting the input into two parts i.e the actual text and the letter we want common in output
text = list1[0]
final = [''.join(l) for i in range(len(text)) for l in combinations(text, i+1)] #this is the core part of our code, from this statement we get all the available combinations of the set of letters (all the way from 1 letter combinations to nth letter)
for i in final:
if 'b' in i:
output.append(i) #only outputting the results which have the required letter/phrase in it
yamxxopd
yndfyamxx
Output: 5
I am not quite sure how to find the number of the most amount of shared characters between two strings. For example (the strings above) the most amount of characters shared together is "yamxx" which is 5 characters long.
xx would not be a solution because that is not the most amount of shared characters. In this case the most is yamxx which is 5 characters long so the output would be 5.
I am quite new to python and stack overflow so any help would be much appreciated!
Note: They should be the same order in both strings
Here is simple, efficient solution using dynamic programming.
def longest_subtring(X, Y):
m,n = len(X), len(Y)
LCSuff = [[0 for k in range(n+1)] for l in range(m+1)]
result = 0
for i in range(m + 1):
for j in range(n + 1):
if (i == 0 or j == 0):
LCSuff[i][j] = 0
elif (X[i-1] == Y[j-1]):
LCSuff[i][j] = LCSuff[i-1][j-1] + 1
result = max(result, LCSuff[i][j])
else:
LCSuff[i][j] = 0
print (result )
longest_subtring("abcd", "arcd") # prints 2
longest_subtring("yammxdj", "nhjdyammx") # prints 5
This solution starts with sub-strings of longest possible lengths. If, for a certain length, there are no matching sub-strings of that length, it moves on to the next lower length. This way, it can stop at the first successful match.
s_1 = "yamxxopd"
s_2 = "yndfyamxx"
l_1, l_2 = len(s_1), len(s_2)
found = False
sub_length = l_1 # Let's start with the longest possible sub-string
while (not found) and sub_length: # Loop, over decreasing lengths of sub-string
for start in range(l_1 - sub_length + 1): # Loop, over all start-positions of sub-string
sub_str = s_1[start:(start+sub_length)] # Get the sub-string at that start-position
if sub_str in s_2: # If found a match for the sub-string, in s_2
found = True # Stop trying with smaller lengths of sub-string
break # Stop trying with this length of sub-string
else: # If no matches found for this length of sub-string
sub_length -= 1 # Let's try a smaller length for the sub-strings
print (f"Answer is {sub_length}" if found else "No common sub-string")
Output:
Answer is 5
s1 = "yamxxopd"
s2 = "yndfyamxx"
# initializing counter
counter = 0
# creating and initializing a string without repetition
s = ""
for x in s1:
if x not in s:
s = s + x
for x in s:
if x in s2:
counter = counter + 1
# display the number of the most amount of shared characters in two strings s1 and s2
print(counter) # display 5
In my attempt to solve the above question, I've written the following code:
Logic: Create a frequency dict for each character in the string (key= character, value= frequency of the character). If any character's frequency is greater than ceil(n/2), there is no solution. Else, print the most frequent character followed by reducing its frequency in the dict/
import math, operator
def rearrangeString(s):
# Fill this in.
n = len(s)
freqDict = {}
for i in s:
if i not in freqDict.keys():
freqDict[i] = 1
else:
freqDict[i] += 1
for j in list(freqDict.values()):
if j > math.ceil(n / 2):
return None
return maxArrange(freqDict)[:-4]
temp = ""
def maxArrange(inp):
global temp
n = len(inp)
if list(inp.values()) != [0] * n:
resCh = max(inp.items(), key=operator.itemgetter(1))[0]
if resCh is not None and resCh != temp:
inp[resCh] -= 1
# Terminates with None
temp = resCh
return resCh + str(maxArrange(inp))
# Driver
print(rearrangeString("abbccc"))
# cbcabc
print(rearrangeString("abbcccc"))
In the first try, with input abbccc, it gives the right answer, i.e. cbcabc, but fails for the input abbcccc, returning ccbcabc, without handling it using the temp variable, else returning cbcabc and skipping c altogether when handled using temp
How should I modify the logic, or is there a better approach?
I have a version number in a file like this:
Testing x.x.x.x
So I am grabbing it off like this:
import re
def increment(match):
# convert the four matches to integers
a,b,c,d = [int(x) for x in match.groups()]
# return the replacement string
return f'{a}.{b}.{c}.{d}'
lines = open('file.txt', 'r').readlines()
lines[3] = re.sub(r"\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b", increment, lines[3])
I want to make it so if the last digit is a 9... then change it to 0 and then change the previous digit to a 1. So 1.1.1.9 changes to 1.1.2.0.
I did that by doing:
def increment(match):
# convert the four matches to integers
a,b,c,d = [int(x) for x in match.groups()]
# return the replacement string
if (d == 9):
return f'{a}.{b}.{c+1}.{0}'
elif (c == 9):
return f'{a}.{b+1}.{0}.{0}'
elif (b == 9):
return f'{a+1}.{0}.{0}.{0}'
Issue occurs when its 1.1.9.9 or 1.9.9.9. Where multiple digits need to rounded. How can I handle this issue?
Use integer addition?
def increment(match):
# convert the four matches to integers
a,b,c,d = [int(x) for x in match.groups()]
*a,b,c,d = [int(x) for x in str(a*1000 + b*100 + c*10 + d + 1)]
a = ''.join(map(str,a)) # fix for 2 digit 'a'
# return the replacement string
return f'{a}.{b}.{c}.{d}'
If your versions are never going to go beyond 10, it is better to just convert it to an integer, increment it and then convert back to a string.
This allows you to go up to as many version numbers as you require and you are not limited to thousands.
def increment(match):
match = match.replace('.', '')
match = int(match)
match += 1
match = str(match)
output = '.'.join(match)
return output
Add 1 to the last element. If it's more than 9, set it to 0 and do the same for the previous element. Repeat as necessary:
import re
def increment(match):
# convert the four matches to integers
g = [int(x) for x in match.groups()]
# increment, last one first
pos = len(g)-1
g[pos] += 1
while pos > 0:
if g[pos] > 9:
g[pos] = 0
pos -= 1
g[pos] += 1
else:
break
# return the replacement string
return '.'.join(str(x) for x in g)
print (re.sub(r"\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b", increment, '1.8.9.9'))
print (re.sub(r"\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b", increment, '1.9.9.9'))
print (re.sub(r"\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b", increment, '9.9.9.9'))
Result:
1.9.0.0
2.0.0.0
10.0.0.0