How to return alphabetical substrings? - python

I'm trying to write a function that takes a string s as an input and returns a list of those substrings within s that are alphabetical. For example, s = 'acegibdh' should return ['acegi', 'bdh'].
Here's the code I've come up with:
s = 'acegibdh'
ans = []
subs = []
i = 0
while i != len(s) - 1:
while s[i] < s[i+1]:
subs.append(s[i])
i += 1
if s[i] > s[i-1]:
subs.append(s[i])
i += 1
subs = ''.join(subs)
ans.append(subs)
subs = []
print ans
It keeps having trouble with the last letter of the string, because of the i+1 test going beyond the index range. I've spent a long time tinkering with it to try and come up with a way to avoid that problem. Does anyone know how to do this?

Why not hard-code the first letter into ans, and then just work with the rest of the string? You can just iterate over the string itself instead of using indices.
>>> s = 'acegibdh'
>>> ans = []
>>> ans.append(s[0])
>>> for letter in s[1:]:
... if letter >= ans[-1][-1]:
... ans[-1] += letter
... else:
... ans.append(letter)
...
>>> ans
['acegi', 'bdh']

s = 'acegibdh'
ans = []
subs = []
subs.append(s[0])
for x in range(len(s)-1):
if s[x] <= s[x+1]:
subs.append(s[x+1])
if s[x] > s[x+1]:
subs = ''.join(subs)
ans.append(subs)
subs = []
subs.append(s[x+1])
subs = ''.join(subs)
ans.append(subs)
print ans
I decided to change your code a bit let me know if you have any questions

Just for fun, a one line solution.
>>> s='acegibdh'
>>> [s[l:r] for l,r in (lambda seq:zip(seq,seq[1:]))([0]+[idx+1 for idx in range(len(s)-1) if s[idx]>s[idx+1]]+[len(s)])]
['acegi', 'bdh']

You should try to avoid loops that increment the position by more than one char per iteration.
Often it is more clear to introduce an additional variable to store information about the previous state:
s = 'acegibdh'
prev = None
ans = []
subs = []
for ch in s:
if prev is None or ch > prev:
subs.append(ch)
else:
ans.append(''.join(subs))
subs = [ch]
prev = ch
ans.append(''.join(subs))
I think this read more straight forward (if there is no previous character or it's still alphabetical with the current char append, else start a new substring). Also you can't get index out of range problems with this approch.

More than one while loop is overkill. I think this is simpler and satisfies your requirement. Note, this fails on empty string.
s = 'acegibdh'
ans = []
current = str(s[0])
i = 1
while i < len(s):
if s[i] > s[i-1]:
current += s[i]
else:
ans.append(current)
current = ''
i += 1
if current != '':
ans.append(current)
print ans

just for fun cause I like doing things a little different sometimes
from itertools import groupby,chain,cycle
def my_gen(s):
check = cycle([1,0])
for k,v in groupby(zip(s,s[1:]),lambda x:x[0]<x[1]):
if k:
v = zip(*v)
yield v[0] + (v[1][-1],)
print list(my_gen('acegibdhabcdefghijk'))

Some of the solutions posted have an index error for empty strings.
Also, instead of keeping a list of characters, or doing repeated string concatenations, you can track the start index, i, of a solution substring and yield s[i:j] where s[j] < s[j-1], then set i to j.
Generator that yields substrings when the next letter is lexicographically less than the previous:
def alpha_subs(s):
i, j = 0, 1
while j < len(s):
if s[j] < s[j-1]:
yield s[i:j]
i = j
j += 1
if s[i:j]:
yield s[i:j]
print(list(alpha_subs('')))
print(list(alpha_subs('acegibdh')))
print(list(alpha_subs('acegibdha')))
[]
['acegi', 'bdh']
['acegi', 'bdh', 'a']
For case insensitivity:
def alpha_subs(s, ignore_case=False):
qs = s.lower() if ignore_case else s
i, j = 0, 1
while j < len(s):
if qs[j] < qs[j-1]:
yield s[i:j]
i = j
j += 1
if s[i:j]:
yield s[i:j]
print(list(alpha_subs('acEgibDh', True)))
print(list(alpha_subs('acEgibDh')))
['acEgi', 'bDh']
['ac', 'Egi', 'b', 'Dh']

Related

Code for consecutive strings works but can't pass random tests

In this problem, I'm given an array(list) strarr of strings and an integer k. My task is to return the first longest string consisting of k consecutive strings taken in the array. My code passed all the sample tests from CodeWars but can't seem to pass the random tests.
Here's the link to the problem.
I did it in two days. I found the max consecutively combined string first. Here's the code for that.
strarr = []
def longest_consec(strarr, k):
strarr.append('')
length = len(strarr)
cons_list = []
end = k
start = 0
freq = -length/2
final_string = []
largest = max(strarr, key=len, default='')
if k == 1:
return largest
elif 1 < k < length:
while(freq <= 1):
cons_list.append(strarr[start:end])
start += k-1
end += k-1
freq += 1
for index in cons_list:
final_string.append(''.join(index))
return max(final_string, key=len, default='')
else:
return ""
Since that didn't pass all the random tests, I compared the combined k strings on both sides of the single largest string. But, this way, the code doesn't account for the case when the single largest string is in the middle. Please help.
strarr = []
def longest_consec(strarr, k):
strarr.append('')
length = len(strarr)
largest = max(strarr, key=len, default='')
pos = int(strarr.index(largest))
if k == 1:
return largest
elif 1 < k < length:
prev_string = ''.join(strarr[pos+1-k:pos+1])
next_string = ''.join(strarr[pos:pos+k])
if len(prev_string) >= len(next_string):
res = prev_string
else:
res = next_string
return res
else:
return ""
print(longest_consec(["zone", "abigail", "theta", "form", "libe"], 2))
Let's start from the first statement of your function:
if k == 1:
while(p <= 1):
b.append(strarr[j:i])
j += 1
i += 1
p += 1
for w in b:
q.append(''.join(w))
return max(q, key=len)
Here q is finally equal strarr so you can shorten this code to:
if k == 1:
return max(strarr, key=len)
I see that second statement's condition checks if k value is between 1 and length of string array inclusive:
elif k > 1 and k <= 2*a:
...
If you want no errors remove equality symbol, last element of every array has index lesser than its length (equal exactly length of it minus 1).
Ceiling and division is not necessary in a definition, so you can shorten this:
a = ceil(len(strarr)/2)
into this:
a = len(strarr)
then your elif statement may look like below:
elif 1 < k < a: # Same as (k > 1 and k < a)
...
again, I see you want to concatenate (add) the longest string to k next strings using this code:
while(p <= 1):
b.append(strarr[j:i])
j += k-1
i += k-1
p += 1
for w in b:
q.append(''.join(w))
return max(q, key=len)
the more clearer way of doing this:
longest = max(strarr, key=len) # Longest string in array.
index = 0 # Index of the current item.
for string in strarr:
# If current string is equal the longest one ...
if string == longest:
# Join 'k' strings from current index (longest string index).
return ''.join(strarr[index:index + k])
index += 1 # Increase current index.
And the last statement which is:
elif k > 2*a or k<1:
return ""
if all previous statements failed then value is invalid so you can instead write:
return "" # Same as with else.
Now everything should work. I advice you learning the basics (especially lists, strings and slices), and please name your variables wisely so they are more readable.
You can try this as well
this has passed all the test cases on the platform you suggested.
def longest_consec(strarr, k):
i = 0
max_ = ""
res = ""
if (k<=0) or (k>len(strarr)):
return ""
while i<=(len(strarr)-k):
start = "".join(strarr[i:i+k])
max_ = max(max_, start, key=len)
if max_==start:
res=strarr[i:i+k]
i+=1
return max_
#output: ["zone", "abigail", "theta", "form", "libe", "zas", "theta", "abigail"], 2 -> abigailtheta
#output: ["zones", "abigail", "theta", "form", "libe", "zas", "theta", "abigail"],2 -> zonesabigail

Leetcode 5: Longes Palindrome Substring

I have been working on the LeetCode problem 5. Longest Palindromic Substring:
Given a string s, return the longest palindromic substring in s.
But I kept getting time limit exceeded on large test cases.
I used dynamic programming as follows:
dp[(i, j)] = True implies that s[i] to s[j] is a palindrome. So if s[i] == str[j] and dp[(i+1, j-1]) is set to True, that means S[i] to S[j] is also a palindrome.
How can I improve the performance of this implementation?
class Solution:
def longestPalindrome(self, s: str) -> str:
dp = {}
res = ""
for i in range(len(s)):
# single character is always a palindrome
dp[(i, i)] = True
res = s[i]
#fill in the table diagonally
for x in range(len(s) - 1):
i = 0
j = x + 1
while j <= len(s)-1:
if s[i] == s[j] and (j - i == 1 or dp[(i+1, j-1)] == True):
dp[(i, j)] = True
if(j-i+1) > len(res):
res = s[i:j+1]
else:
dp[(i, j)] = False
i += 1
j += 1
return res
I think the judging system for this problem is kind of too tight, it took some time to make it pass, improved version:
class Solution:
def longestPalindrome(self, s: str) -> str:
dp = {}
res = ""
for i in range(len(s)):
dp[(i, i)] = True
res = s[i]
for x in range(len(s)): # iterate till the end of the string
for i in range(x): # iterate up to the current state (less work) and for loop looks better here
if s[i] == s[x] and (dp.get((i + 1, x - 1), False) or x - i == 1):
dp[(i, x)] = True
if x - i + 1 > len(res):
res = s[i:x + 1]
return res
Here is another idea to improve the performance:
The nested loop will check over many cases where the DP value is already False for smaller ranges. We can avoid looking at large spans, by looking for palindromes from inside-out and stop extending the span as soon as it no longer is a palindrome. This process should be repeated at every offset in the source string, but this could still save some processing.
The inputs for which then most time is wasted, are those where there are lots of the same letters after each other, like "aaaaaaabcaaaaaaa". These lead to many iterations: each "a" or "aa" could be the center of a palindrome, but "growing" each of them is a waste of time. We should just consider all consecutive "a" together from the start and expand from there onwards.
You can specifically deal with these cases by first grouping consecutive letters which are the same. So the above example would be turned into 4 groups: a(7)b(1)c(1)a(7)
Then let each group in turn be taken as the center of a palindrome. For each group, "fan out" to potentially include one or more neighboring groups at both sides in "tandem". Continue fanning out until either the outside groups are not about the same letter, or they have a different group size. From that result you can derive what the largest palindrome is around that center. In particular, when the case is that the letters of the outer groups are the same, but not their sizes, you still include that letter at the outside of the palindrome, but with a repetition that corresponds to the least of these two mismatching group sizes.
Here is an implementation. I used named tuples to make it more readable:
from itertools import groupby
from collections import namedtuple
Group = namedtuple("Group", "letter,size,end")
class Solution:
def longestPalindrome(self, s: str) -> str:
longest = ""
x = 0
groups = [Group(group[0], len(group), x := x + len(group)) for group in
("".join(group[1]) for group in groupby(s))]
for i in range(len(groups)):
for j in range(0, min(i+1, len(groups) - i)):
if groups[i - j].letter != groups[i + j].letter:
break
left = groups[i - j]
right = groups[i + j]
if left.size != right.size:
break
size = right.end - (left.end - left.size) - abs(left.size - right.size)
if size > len(longest):
x = left.end - left.size + max(0, left.size - right.size)
longest = s[x:x+size]
return longest
Alternatively, you can try this approach, it seems to be faster than 96% Python submission.
def longestPalindrome(self, s: str) -> str:
N = len(s)
if N == 0:
return 0
max_len, start = 1, 0
for i in range(N):
df = i - max_len
if df >= 1 and s[df-1: i+1] == s[df-1: i+1][::-1]:
start = df - 1
max_len += 2
continue
if df >= 0 and s[df: i+1] == s[df: i+1][::-1]:
start= df
max_len += 1
return s[start: start + max_len]
If you want to improve the performance, you should create a variable for len(s) at the beginning of the function and use it. That way instead of calling len(s) 3 times, you would do it just once.
Also, I see no reason to create a class for this function. A simple function will outrun a class method, albeit very slightly.

Trouble trying to find length of longest substring

I wrote the following code. It should return to me the length of the longest subscript in a string without a repeat in letters.
def lengthOfLongestSubstring(s):
lst = []
y = 0
final = 0
count = len(s)
while len(s) > 0:
s = s[y:]
for i in range(len(s)):
if s[i] in lst:
y += 1
count = len(lst)
lst =[]
break
else:
lst.append(s[i])
if count > final:
final=count
return(final)
when entering the string "tmmzuxt" i expect to get an output of 5 (length of "mzuxt") but instead get 4. I have debugged to figure out the problem seems to be that my function skips over the second 'm' when indexing but I can't figure out why. Any suggestions?
Realized I somehow missed a line. Hope this makes more sense.
Your issue here is that you are modifying s while you are running your code.
Consider that in the first iteration, you are getting s = s[0:], so s will now be 'tmmzuxt'. In your next iteration, you are getting s = s[1:], from the modified s. This is still not a problem, because you just get 'mmzuxt'. However, in your third iteration, you are getting s = s[2:], which is now 'zuxt'.
So you need a different variable than s to hold the substring of s that you are actually testing.
here, in your code(line 7) you are updating your string value inside function, everytime your for loop iterates.
for e.g., after every break inside for loop. you string(which is "tmmzuxt") is becoming short and short.
i created a new variable which contains your original string.
def lengthOfLongestSubstring(s):
lst = []
y = 0
final = 0
count = len(s)
main_string = s;#change done here
while len(s) > 0:
s = main_string[y:] #change done here
for i in range(len(s)):
if s[i] in lst:
y += 1
count = len(lst)
lst =[]
break
else:
lst.append(s[i])
if count > final:
final =count
print(final)
return(final)
lengthOfLongestSubstring("tmmzuxt")
The main problem with your code is that you incremented y, even though it should only ever remove the first character. There is no need for a variable y. Try this:
def lengthOfLongestSubstring(s):
final = 0
while len(s) > 0:
count = len(s)
lst = []
for i in range(len(s)):
if s[i] in lst:
count = i - 1
break
lst.append(s[i])
if count > final:
final = count
s = s[1:]
return final
print(lengthOfLongestSubstring("tmmzuxt"))
Here is an edited code. removing #lst =[] and #break lines.
[Code]
def lengthOfLongestSubstring(s):
lst = []
y = 0
final = 0
count = len(s)
while len(s) > 0:
s = s[y:]
for i in range(len(s)):
if s[i] in lst:
y += 1
count = len(lst)
#lst =[]
#break
else:
lst.append(s[i])
if count > final:
final=count
return(final)
s="tmmzuxt"
print(lengthOfLongestSubstring(s))
[Output]
5
I'm not sure if I understand your code, or if the while loop is needed here, actually. Try this instead:
def lengthOfLongestSubstring(s):
max_length = 0
length = 0
previous = ''
for thisCharacter in s:
if thisCharacter != previous:
length += 1
else:
max_length = max(length, max_length)
length = 1
return max_length

iterating through loops without using Regex [duplicate]

lst = 'AB[CD]EF[GH]'
Output: ['A','B','CD','E','F','GH']
This is what I've tried but it's not working...
while(index < len(my_string)):
curr_char = my_string[index]
if(curr_char == '['):
while(curr_char != ']'):
multi = my_string[index + 1]
index += 1
lst += multi
Can anybody please help? Without importing Regex or whatever. I wanna do this without using it.
The problems with the original code seemed to be:
1) lst, index and multi are not initialised
2) the loop is infinite because the loop variable (index) isn't incremented on each iteration.
3) the close bracket needs to be skipped when detected to avoid including it in the final list
This code is an example of how to fix those issues:
def getList(s):
outList=[]
lIndex=0
while lIndex < len(s):
if s[lIndex] == "[":
letters=""
lIndex+=1
while s[lIndex] != "]":
letters+=s[lIndex]
lIndex+=1
outList.append(letters)
else:
outList.append(s[lIndex])
lIndex+=1
return outList
print(getList('AB[CD]EF[GH]'))
You can't use
lst += multi
because you can't concatenate a string with a list.
Moreover, your code enters an infinite loop, because you aren't updating the curr_char variable inside the inner loop, so the condition will always be True.
Also, you are not handling the case when curr_char != '['. And more errors there are.
You can use this code which fixes the above errors while using the same basic logic as your code:
index = 0
multi = ""
res = []
my_str = 'AB[CD]EF[GH]'
while (index < len(my_str)):
curr_char = my_str[index]
if curr_char == '[':
multi += curr_char
while curr_char != ']':
index += 1
curr_char = my_str[index]
multi += curr_char
res.append(multi)
multi = ""
else:
res.append(curr_char)
index += 1
print(res)
Output:
['A', 'B', '[CD]', 'E', 'F', '[GH]']
Please try the following code snippet.
my_string = 'AB[CD]EF[GH]'
lst = []
ind = 0
n = len(my_string)
while (ind < n):
if my_string[ind] == '[':
# if '[' is found, look for the next ']' but ind should not exceed n.
# Your code does not do a ind < n check. It may enter an infinite loop.
ind += 1 # this is done to skip the '[' in result list
temp = '' # create a temporary string to store chars inside '[]'
while ind < n and my_string[ind] != ']':
temp = temp + my_string[ind]
ind+=1
lst.append(temp) # add this temp string to list
ind += 1 # do this to skip the ending ']'.
else:
# If its not '[', simply append char to list.
lst.append(my_string[ind])
ind += 1
print(lst)

List of strings, get common substring of n elements, Python

My problem is maybe similar to this, but another situation.
Consider this list in input :
['ACCCACCCGTGG','AATCCC','CCCTGAGG']
And the other input is n,n is a number, the dimension of the substring in common in every element of the list. So the output has to be the maximum occorence substring with the number of occorences, similar to this:
{'CCC' : 4}
4 becouse in the first element of list are twice, and one time in the other two strings.CCC becouse is the longhest substring with 3 elements,that repeats at least 1 time per string
I started in that way :
def get_n_repeats_list(n,seq_list):
max_substring={}
list_seq=list(seq_list)
for i in range(0,len(list_seq)):
if i+1<len(list_seq):
#Idea : to get elements in common,comparing two strings at time
#in_common=set(list_seq[i])-set(list_seq[i+1])
#max_substring...
return max_substring
Maybe here a solution
import operator
LL = ['ACCCACCCGTGG','AATCCC','CCCTGAGG']
def createLenList(n,LL):
stubs = {}
for l in LL:
for i,e in enumerate(l):
stub = l[i:i+n]
if len(stub) == n:
if stub not in stubs: stubs[stub] = 1
else: stubs[stub] += 1
maxKey = max(stubs.iteritems(), key=operator.itemgetter(1))[0]
return [maxKey,stubs[maxKey]]
maxStub = createLenList(3,LL)
print maxStub
So this is my take on it. It is definitely not the prettiest thing on the planet but it should work just fine.
a = ['ACCCWCCCGTGG', 'AATCCC', 'CCCTGAGG']
def occur(the_list, a_substr):
i_found = 0
for a_string in the_list:
for i_str in range(len(a_string) - len(a_substr) + 1):
#print('Comparing {:s} to {:s}'.format(substr, a_string[i_str:i_str + len(substr)]))
if a_substr == a_string[i_str:i_str + len(a_substr)]:
i_found += 1
return i_found
def found_str(original_List, n):
result_dict = {}
if n > min(map(len, original_List)):
print("The substring has to be shorter than the shortest string!")
exit()
specialChar = '|'
b = specialChar.join(item for item in original_List)
str_list = []
for i in range(len(b) - n):
currStr = b[i:i+n]
if specialChar not in currStr:
str_list.append(currStr)
else:
continue
str_list = set(str_list)
for sub_strs in str_list:
i_found = 0
for strs in original_List:
if sub_strs in strs:
i_found += 1
if i_found == len(original_List):
#print("entered with sub = {:s}".format(sub_strs))
#print(occur(original_List, sub_strs))
result_dict[sub_strs] = occur(original_List, sub_strs)
if result_dict == {}:
print("No common substings of length {:} were found".format(n))
return result_dict
end = found_str(a, 3)
print(end)
returns: {'CCC': 4}
def long_substr(data):
substr = ''
if len(data) > 1 and len(data[0]) > 0:
for i in range(len(data[0])):
for j in range(len(data[0])-i+1):
if j > len(substr) and is_substr(data[0][i:i+j], data):
substr = data[0][i:i+j]
return substr
def is_substr(find, data):
if len(data) < 1 and len(find) < 1:
return False
for i in range(len(data)):
if find not in data[i]:
return False
return True
input_list = ['A', 'ACCCACCCGTGG','AATCCC','CCCTGAGG']
longest_common_str = long_substr(input_list)
if longest_common_str:
frequency = 0
for common in input_list:
frequency += common.count(longest_common_str)
print (longest_common_str, frequency)
else:
print ("nothing common")
Output
A 6

Categories