concatenate strings in list if separated by 0s - python

Basically, whenever two strings in a list are separated by one or more zeroes, I want to join them together. ['a',0,'b'] => ["ab"].
I've tried yield and I really can't find a good way to say if you find a zero in the list, concatenate the next non-zero to the previous string.
I've used yield before, but I am just not approaching this correctly. Mind you, I don't insist on using yield, it just seemed the most likely approach to work, since a simple list comprehension won't do it.
Sample data and expected outputs:
dataexp = [
#input #expected
(["a"], ["a"]),
([0,0,"a","b",], ["a","b"]),
([0,"a","0",], ["a","0"]),
(["a",0,"b",], ["ab"]),
(["a",0,0,"b",], ["ab"]),
(["a","b",0], ["a","b"]),
(["a","b","c"], ["a","b","c"]),
(["a",0,"b",0, "c"], ["abc"]),
]
Some sample code
I just don't handle the concatenate logic correctly and only filter5 is a serious attempt.
dataexp = [
#input #expected
([0,0,"a","b",], ["a","b"]),
([0,"a","0",], ["a","0"]),
(["a",0,"b",], ["ab"]),
(["a",0,0,"b",], ["ab"]),
(["a","b",0], ["a","b"]),
(["a","b","c"], ["a","b","c"]),
(["a",0,"b",0, "c"], ["abc"]),
]
def filter0(li):
return [val for val in li if isinstance(val, str)]
def filter3(li):
pos = -1
len_li = len(li)
while pos < len_li-1:
pos += 1
if li[pos] == 0:
continue
else:
res = li[pos]
yield res
def filter5(li):
len_li = len(li)
pos = 2
p0 = p1 = None
while pos < len_li-1:
cur = li[pos]
if p0 in (0, None):
p0 = cur
pos +=1
continue
if cur == 0:
p1 = cur
pos += 1
continue
elif p1 == 0:
p0 = p0 + cur
pos += 1
continue
else:
p1 = cur
pos += 1
yield p0
if p0:
yield p0
if p1:
yield p1
for fn in [filter0, filter3, filter5]:
name = fn.__name__
print(f"\n\n{name}:")
for inp, exp in dataexp:
try:
got = list(fn(inp))
except (Exception,) as e:
got = str(e)
msg = "%-20.20s for %-80.80s \nexp :%s:\ngot :%-80.80s:" % (name, inp, exp, got)
if exp == got:
print(f"\n✅{msg}")
else:
print(f"\n❌{msg}")
I am generating html dynamically by pushing strings into a big List[str] then "\n".join() it. Most of the time, that's fine, browsers ignore whitespace, but Cypress does care about the \n in <td>xyz\n</td>. So, rather than changing everything, I thought I'd find a way to suppress the newline by using mylist.extend(0, "</td>"). But now I am just curious at the look-behind/ahead nature of this list problem. And, if you think Django or Jinja Templates are better suited, you'd be correct, except that this is generating Django Templates, rather than the final html.

I see no benefit of using a generator here. You can just keep track of the state determining your concat condition and either append or concatenate:
from typing import List, Literal, List
def process_list(l: List[Union[str, Literal[0]]]) -> List[str]:
result, concat = [], False
for e in l:
if e == 0:
concat = True
continue
if concat and result:
result[-1] += e
else:
result.append(e)
concat = False
return result

Related

Recursive Decompression of Strings

I'm trying to decompress strings using recursion. For example, the input:
3[b3[a]]
should output:
baaabaaabaaa
but I get:
baaaabaaaabaaaabbaaaabaaaabaaaaa
I have the following code but it is clearly off. The first find_end function works as intended. I am absolutely new to using recursion and any help understanding / tracking where the extra letters come from or any general tips to help me understand this really cool methodology would be greatly appreciated.
def find_end(original, start, level):
if original[start] != "[":
message = "ERROR in find_error, must start with [:", original[start:]
raise ValueError(message)
indent = level * " "
index = start + 1
count = 1
while count != 0 and index < len(original):
if original[index] == "[":
count += 1
elif original[index] == "]":
count -= 1
index += 1
if count != 0:
message = "ERROR in find_error, mismatched brackets:", original[start:]
raise ValueError(message)
return index - 1
def decompress(original, level):
# set the result to an empty string
result = ""
# for any character in the string we have not looked at yet
for i in range(len(original)):
# if the character at the current index is a digit
if original[i].isnumeric():
# the character of the current index is the number of repetitions needed
repititions = int(original[i])
# start = the next index containing the '[' character
x = 0
while x < (len(original)):
if original[x].isnumeric():
start = x + 1
x = len(original)
else:
x += 1
# last = the index of the matching ']'
last = find_end(original, start, level)
# calculate a substring using `original[start + 1:last]
sub_original = original[start + 1 : last]
# RECURSIVELY call decompress with the substring
# sub = decompress(original, level + 1)
# concatenate the result of the recursive call times the number of repetitions needed to the result
result += decompress(sub_original, level + 1) * repititions
# set the current index to the index of the matching ']'
i = last
# else
else:
# concatenate the letter at the current index to the result
if original[i] != "[" and original[i] != "]":
result += original[i]
# return the result
return result
def main():
passed = True
ORIGINAL = 0
EXPECTED = 1
# The test cases
provided = [
("3[b]", "bbb"),
("3[b3[a]]", "baaabaaabaaa"),
("3[b2[ca]]", "bcacabcacabcaca"),
("5[a3[b]1[ab]]", "abbbababbbababbbababbbababbbab"),
]
# Run the provided tests cases
for t in provided:
actual = decompress(t[ORIGINAL], 0)
if actual != t[EXPECTED]:
print("Error decompressing:", t[ORIGINAL])
print(" Expected:", t[EXPECTED])
print(" Actual: ", actual)
print()
passed = False
# print that all the tests passed
if passed:
print("All tests passed")
if __name__ == '__main__':
main()
From what I gathered from your code, it probably gives the wrong result because of the approach you've taken to find the last matching closing brace at a given level (I'm not 100% sure, the code was a lot). However, I can suggest a cleaner approach using stacks (almost similar to DFS, without the complications):
def decomp(s):
stack = []
for i in s:
if i.isalnum():
stack.append(i)
elif i == "]":
temp = stack.pop()
count = stack.pop()
if count.isnumeric():
stack.append(int(count)*temp)
else:
stack.append(count+temp)
for i in range(len(stack)-2, -1, -1):
if stack[i].isnumeric():
stack[i] = int(stack[i])*stack[i+1]
else:
stack[i] += stack[i+1]
return stack[0]
print(decomp("3[b]")) # bbb
print(decomp("3[b3[a]]")) # baaabaaabaaa
print(decomp("3[b2[ca]]")) # bcacabcacabcaca
print(decomp("5[a3[b]1[ab]]")) # abbbababbbababbbababbbababbbab
This works on a simple observation: rather tha evaluating a substring after on reading a [, evaluate the substring after encountering a ]. That would allow you to build the result AFTER the pieces have been evaluated individually as well. (This is similar to the prefix/postfix evaluation using programming).
(You can add error checking to this as well, if you wish. It would be easier to check if the string is semantically correct in one pass and evaluate it in another pass, rather than doing both in one go)
Here is the solution with the similar idea from above:
we go through string putting everything on stack until we find ']', then we go back until '[' taking everything off, find the number, multiply and put it back on stack
It's much less consuming as we don't add strings, but work with lists
Note: multiply number can't be more than 9 as we parse it as one element string
def decompress(string):
stack = []
letters = []
for i in string:
if i != ']':
stack.append(i)
elif i == ']':
letter = stack.pop()
while letter != '[':
letters.append(letter)
letter = stack.pop()
word = ''.join(letters[::-1])
letters = []
stack.append(''.join([word for j in range(int(stack.pop()))]))
return ''.join(stack)

Conditional Probability- Python

I'm working on this python problem:
Given a sequence of the DNA bases {A, C, G, T}, stored as a string, returns a conditional probability table in a data structure such that one base (b1) can be looked up, and then a second (b2), to get the probability p(b2 | b1) of the second base occurring immediately after the first. (Assumes the length of seq is >= 3, and that the probability of any b1 and b2 which have never been seen together is 0. Ignores the probability that b1 will be followed by the end of the string.)
You may use the collections module, but no other libraries.
However I'm running into a roadblock:
word = 'ATCGATTGAGCTCTAGCG'
def dna_prob2(seq):
tbl = dict()
levels = set(word)
freq = dict.fromkeys(levels, 0)
for i in seq:
freq[i] += 1
for i in levels:
tbl[i] = {x:0 for x in levels}
lastlevel = ''
for i in tbl:
if lastlevel != '':
tbl[lastlevel][i] += 1
lastlevel = i
for i in tbl:
print(i,tbl[i][i] / freq[i])
return tbl
tbl['T']['T'] / freq[i]
Basically, the end result is supposed to be the final line tbl you see above. However, when I try to do that in print(i,tbl[i][i] /freq[i), and run dna_prob2(word), I get 0.0s for everything.
Wondering if anyone here can help out.
Thanks!
I am not sure what it is your code is doing, but this works:
def makeprobs(word):
singles = {}
probs = {}
thedict={}
ll = len(word)
for i in range(ll-1):
x1 = word[i]
x2 = word[i+1]
singles[x1] = singles.get(x1, 0)+1.0
thedict[(x1, x2)] = thedict.get((x1, x2), 0)+1.0
for i in thedict:
probs[i] = thedict[i]/singles[i[0]]
return probs
I finally got back to my professor. This is what it was trying to accomplish:
word = 'ATCGATTGAGCTCTAGCG'
def dna_prob2(seq):
tbl = dict()
levels = set(seq)
freq = dict.fromkeys(levels, 0)
for i in seq:
freq[i] += 1
for i in levels:
tbl[i] = {x:0 for x in levels}
lastlevel = ''
for i in seq:
if lastlevel != '':
tbl[lastlevel][i] += 1
lastlevel = i
return tbl, freq
condfreq, freq = dna_prob2(word)
print(condfreq['T']['T']/freq['T'])
print(condfreq['G']['A']/freq['A'])
print(condfreq['C']['G']/freq['G'])
Hope this helps.

How to rearrange a string's characters such that none of it's adjacent characters are the same, using Python

In my attempt to solve the above question, I've written the following code:
Logic: Create a frequency dict for each character in the string (key= character, value= frequency of the character). If any character's frequency is greater than ceil(n/2), there is no solution. Else, print the most frequent character followed by reducing its frequency in the dict/
import math, operator
def rearrangeString(s):
# Fill this in.
n = len(s)
freqDict = {}
for i in s:
if i not in freqDict.keys():
freqDict[i] = 1
else:
freqDict[i] += 1
for j in list(freqDict.values()):
if j > math.ceil(n / 2):
return None
return maxArrange(freqDict)[:-4]
temp = ""
def maxArrange(inp):
global temp
n = len(inp)
if list(inp.values()) != [0] * n:
resCh = max(inp.items(), key=operator.itemgetter(1))[0]
if resCh is not None and resCh != temp:
inp[resCh] -= 1
# Terminates with None
temp = resCh
return resCh + str(maxArrange(inp))
# Driver
print(rearrangeString("abbccc"))
# cbcabc
print(rearrangeString("abbcccc"))
In the first try, with input abbccc, it gives the right answer, i.e. cbcabc, but fails for the input abbcccc, returning ccbcabc, without handling it using the temp variable, else returning cbcabc and skipping c altogether when handled using temp
How should I modify the logic, or is there a better approach?

Delete elements from list based on substring in Python

I have a huge list of strings where a couple of strings only differ in 2 or three characters like this:
ENSH-DFFEV1-5F
ENSH-DFFEV2-5F
ENSH-DFFEV3-5F
FVB.DFFVRV2-4T
FVB.DFFVRV3-4T
What I would like to do is to keep only those elements for which the number after the 'V' is the largest. From the above example I would like to have
ENSH-DFFEV3-5F
FVB.DFFVRV3-4T
Is there a simple way to do this in Python?
#stevieb is right, but anyway, I did the effort for you.
s = """
ENSH-DFFEV1-5F
ENSH-DFFEV2-5F
ENSH-DFFEV3-5F
FVB.DFFVRV2-4T
FVB.DFFVRV3-4T
""".split()
def custom_filter(s):
out = []
current_max = -1
for r in s:
v = int(r.rsplit('-', 1)[0][-1]) # <- you should probably edit this line to fit your data structure
if v > current_max:
current_max = v
out = []
if v == current_max:
out += [r]
return out
for e in custom_filter(s):
print e

why the output is not good in some cases?

i feel stupid and don't know why the output is not good in some cases.
here is the output (at the end)
it sould find the first sub string in given string for example:
sim('banan','na') -> 'na'
def rev (str):
rev_str=''
i = len(str)-1;
while (i >= 0):
rev_str += str[i];
i = i-1;
return rev_str;
######################################
def sim (str,sub):
sub_len = len (sub);
start = str.index(sub);
rev_str = rev(str)
rev_sub = rev(sub)
if (start ==0):
start =1;
end = start + rev_str.index(rev_sub,start-1);
ret_val = ''
print start , end
for n in range (start,end):
ret_val += str[n];
return ret_val;
print sim('abcdef', 'abc')
print sim('abcdef', 'bc')
print sim('banana', 'na')
the output :
1 4
bcd
1 4
bcd
2 4
na
def sim(haystack, needle):
if needle in haystack:
return needle
If you want indices:
def sim(haystack, needle):
index = haystack.index(needle) # throws ValueError when not found
return (index, index + len(needle))
I agree with Cat's solution. FWIW, you probably want to study a bit about slice syntax. If you're doing string manipulation, slices are a basic tool. I don't see from your code why you wanted to reverse your string, but if you must, try this:
my_string = "abcdefg"
reversed = my_string[::-1] # here's the slice magic
print(reversed)

Categories